1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/spinlock.h> 8 #include <linux/blkdev.h> 9 #include <linux/swap.h> 10 #include <linux/writeback.h> 11 #include <linux/pagevec.h> 12 #include <linux/prefetch.h> 13 #include <linux/cleancache.h> 14 #include "extent_io.h" 15 #include "extent_map.h" 16 #include "ctree.h" 17 #include "btrfs_inode.h" 18 #include "volumes.h" 19 #include "check-integrity.h" 20 #include "locking.h" 21 #include "rcu-string.h" 22 #include "backref.h" 23 #include "transaction.h" 24 25 static struct kmem_cache *extent_state_cache; 26 static struct kmem_cache *extent_buffer_cache; 27 static struct bio_set *btrfs_bioset; 28 29 static inline bool extent_state_in_tree(const struct extent_state *state) 30 { 31 return !RB_EMPTY_NODE(&state->rb_node); 32 } 33 34 #ifdef CONFIG_BTRFS_DEBUG 35 static LIST_HEAD(buffers); 36 static LIST_HEAD(states); 37 38 static DEFINE_SPINLOCK(leak_lock); 39 40 static inline 41 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 42 { 43 unsigned long flags; 44 45 spin_lock_irqsave(&leak_lock, flags); 46 list_add(new, head); 47 spin_unlock_irqrestore(&leak_lock, flags); 48 } 49 50 static inline 51 void btrfs_leak_debug_del(struct list_head *entry) 52 { 53 unsigned long flags; 54 55 spin_lock_irqsave(&leak_lock, flags); 56 list_del(entry); 57 spin_unlock_irqrestore(&leak_lock, flags); 58 } 59 60 static inline 61 void btrfs_leak_debug_check(void) 62 { 63 struct extent_state *state; 64 struct extent_buffer *eb; 65 66 while (!list_empty(&states)) { 67 state = list_entry(states.next, struct extent_state, leak_list); 68 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 69 state->start, state->end, state->state, 70 extent_state_in_tree(state), 71 refcount_read(&state->refs)); 72 list_del(&state->leak_list); 73 kmem_cache_free(extent_state_cache, state); 74 } 75 76 while (!list_empty(&buffers)) { 77 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 78 pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n", 79 eb->start, eb->len, atomic_read(&eb->refs)); 80 list_del(&eb->leak_list); 81 kmem_cache_free(extent_buffer_cache, eb); 82 } 83 } 84 85 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 86 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 87 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 88 struct extent_io_tree *tree, u64 start, u64 end) 89 { 90 struct inode *inode; 91 u64 isize; 92 93 if (!tree->mapping) 94 return; 95 96 inode = tree->mapping->host; 97 isize = i_size_read(inode); 98 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 99 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 100 "%s: ino %llu isize %llu odd range [%llu,%llu]", 101 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); 102 } 103 } 104 #else 105 #define btrfs_leak_debug_add(new, head) do {} while (0) 106 #define btrfs_leak_debug_del(entry) do {} while (0) 107 #define btrfs_leak_debug_check() do {} while (0) 108 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 109 #endif 110 111 #define BUFFER_LRU_MAX 64 112 113 struct tree_entry { 114 u64 start; 115 u64 end; 116 struct rb_node rb_node; 117 }; 118 119 struct extent_page_data { 120 struct bio *bio; 121 struct extent_io_tree *tree; 122 get_extent_t *get_extent; 123 unsigned long bio_flags; 124 125 /* tells writepage not to lock the state bits for this 
range 126 * it still does the unlocking 127 */ 128 unsigned int extent_locked:1; 129 130 /* tells the submit_bio code to use REQ_SYNC */ 131 unsigned int sync_io:1; 132 }; 133 134 static void add_extent_changeset(struct extent_state *state, unsigned bits, 135 struct extent_changeset *changeset, 136 int set) 137 { 138 int ret; 139 140 if (!changeset) 141 return; 142 if (set && (state->state & bits) == bits) 143 return; 144 if (!set && (state->state & bits) == 0) 145 return; 146 changeset->bytes_changed += state->end - state->start + 1; 147 ret = ulist_add(&changeset->range_changed, state->start, state->end, 148 GFP_ATOMIC); 149 /* ENOMEM */ 150 BUG_ON(ret < 0); 151 } 152 153 static noinline void flush_write_bio(void *data); 154 static inline struct btrfs_fs_info * 155 tree_fs_info(struct extent_io_tree *tree) 156 { 157 if (!tree->mapping) 158 return NULL; 159 return btrfs_sb(tree->mapping->host->i_sb); 160 } 161 162 int __init extent_io_init(void) 163 { 164 extent_state_cache = kmem_cache_create("btrfs_extent_state", 165 sizeof(struct extent_state), 0, 166 SLAB_MEM_SPREAD, NULL); 167 if (!extent_state_cache) 168 return -ENOMEM; 169 170 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 171 sizeof(struct extent_buffer), 0, 172 SLAB_MEM_SPREAD, NULL); 173 if (!extent_buffer_cache) 174 goto free_state_cache; 175 176 btrfs_bioset = bioset_create(BIO_POOL_SIZE, 177 offsetof(struct btrfs_io_bio, bio), 178 BIOSET_NEED_BVECS); 179 if (!btrfs_bioset) 180 goto free_buffer_cache; 181 182 if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) 183 goto free_bioset; 184 185 return 0; 186 187 free_bioset: 188 bioset_free(btrfs_bioset); 189 btrfs_bioset = NULL; 190 191 free_buffer_cache: 192 kmem_cache_destroy(extent_buffer_cache); 193 extent_buffer_cache = NULL; 194 195 free_state_cache: 196 kmem_cache_destroy(extent_state_cache); 197 extent_state_cache = NULL; 198 return -ENOMEM; 199 } 200 201 void extent_io_exit(void) 202 { 203 btrfs_leak_debug_check(); 204 205 /* 206 * Make sure all delayed rcu free are flushed before we 207 * destroy caches. 
208 */ 209 rcu_barrier(); 210 kmem_cache_destroy(extent_state_cache); 211 kmem_cache_destroy(extent_buffer_cache); 212 if (btrfs_bioset) 213 bioset_free(btrfs_bioset); 214 } 215 216 void extent_io_tree_init(struct extent_io_tree *tree, 217 struct address_space *mapping) 218 { 219 tree->state = RB_ROOT; 220 tree->ops = NULL; 221 tree->dirty_bytes = 0; 222 spin_lock_init(&tree->lock); 223 tree->mapping = mapping; 224 } 225 226 static struct extent_state *alloc_extent_state(gfp_t mask) 227 { 228 struct extent_state *state; 229 230 /* 231 * The given mask might be not appropriate for the slab allocator, 232 * drop the unsupported bits 233 */ 234 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); 235 state = kmem_cache_alloc(extent_state_cache, mask); 236 if (!state) 237 return state; 238 state->state = 0; 239 state->failrec = NULL; 240 RB_CLEAR_NODE(&state->rb_node); 241 btrfs_leak_debug_add(&state->leak_list, &states); 242 refcount_set(&state->refs, 1); 243 init_waitqueue_head(&state->wq); 244 trace_alloc_extent_state(state, mask, _RET_IP_); 245 return state; 246 } 247 248 void free_extent_state(struct extent_state *state) 249 { 250 if (!state) 251 return; 252 if (refcount_dec_and_test(&state->refs)) { 253 WARN_ON(extent_state_in_tree(state)); 254 btrfs_leak_debug_del(&state->leak_list); 255 trace_free_extent_state(state, _RET_IP_); 256 kmem_cache_free(extent_state_cache, state); 257 } 258 } 259 260 static struct rb_node *tree_insert(struct rb_root *root, 261 struct rb_node *search_start, 262 u64 offset, 263 struct rb_node *node, 264 struct rb_node ***p_in, 265 struct rb_node **parent_in) 266 { 267 struct rb_node **p; 268 struct rb_node *parent = NULL; 269 struct tree_entry *entry; 270 271 if (p_in && parent_in) { 272 p = *p_in; 273 parent = *parent_in; 274 goto do_insert; 275 } 276 277 p = search_start ? 
&search_start : &root->rb_node; 278 while (*p) { 279 parent = *p; 280 entry = rb_entry(parent, struct tree_entry, rb_node); 281 282 if (offset < entry->start) 283 p = &(*p)->rb_left; 284 else if (offset > entry->end) 285 p = &(*p)->rb_right; 286 else 287 return parent; 288 } 289 290 do_insert: 291 rb_link_node(node, parent, p); 292 rb_insert_color(node, root); 293 return NULL; 294 } 295 296 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 297 struct rb_node **prev_ret, 298 struct rb_node **next_ret, 299 struct rb_node ***p_ret, 300 struct rb_node **parent_ret) 301 { 302 struct rb_root *root = &tree->state; 303 struct rb_node **n = &root->rb_node; 304 struct rb_node *prev = NULL; 305 struct rb_node *orig_prev = NULL; 306 struct tree_entry *entry; 307 struct tree_entry *prev_entry = NULL; 308 309 while (*n) { 310 prev = *n; 311 entry = rb_entry(prev, struct tree_entry, rb_node); 312 prev_entry = entry; 313 314 if (offset < entry->start) 315 n = &(*n)->rb_left; 316 else if (offset > entry->end) 317 n = &(*n)->rb_right; 318 else 319 return *n; 320 } 321 322 if (p_ret) 323 *p_ret = n; 324 if (parent_ret) 325 *parent_ret = prev; 326 327 if (prev_ret) { 328 orig_prev = prev; 329 while (prev && offset > prev_entry->end) { 330 prev = rb_next(prev); 331 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 332 } 333 *prev_ret = prev; 334 prev = orig_prev; 335 } 336 337 if (next_ret) { 338 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 339 while (prev && offset < prev_entry->start) { 340 prev = rb_prev(prev); 341 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 342 } 343 *next_ret = prev; 344 } 345 return NULL; 346 } 347 348 static inline struct rb_node * 349 tree_search_for_insert(struct extent_io_tree *tree, 350 u64 offset, 351 struct rb_node ***p_ret, 352 struct rb_node **parent_ret) 353 { 354 struct rb_node *prev = NULL; 355 struct rb_node *ret; 356 357 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); 358 if (!ret) 359 return prev; 360 return ret; 361 } 362 363 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 364 u64 offset) 365 { 366 return tree_search_for_insert(tree, offset, NULL, NULL); 367 } 368 369 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 370 struct extent_state *other) 371 { 372 if (tree->ops && tree->ops->merge_extent_hook) 373 tree->ops->merge_extent_hook(tree->mapping->host, new, 374 other); 375 } 376 377 /* 378 * utility function to look for merge candidates inside a given range. 379 * Any extents with matching state are merged together into a single 380 * extent in the tree. Extents with EXTENT_IO in their state field 381 * are not merged because the end_io handlers need to be able to do 382 * operations on them without sleeping (or doing allocations/splits). 383 * 384 * This should be called with the tree lock held. 
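 * (merge_state() below implements this by skipping any state that carries EXTENT_IOBITS or EXTENT_BOUNDARY.)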
385 */ 386 static void merge_state(struct extent_io_tree *tree, 387 struct extent_state *state) 388 { 389 struct extent_state *other; 390 struct rb_node *other_node; 391 392 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 393 return; 394 395 other_node = rb_prev(&state->rb_node); 396 if (other_node) { 397 other = rb_entry(other_node, struct extent_state, rb_node); 398 if (other->end == state->start - 1 && 399 other->state == state->state) { 400 merge_cb(tree, state, other); 401 state->start = other->start; 402 rb_erase(&other->rb_node, &tree->state); 403 RB_CLEAR_NODE(&other->rb_node); 404 free_extent_state(other); 405 } 406 } 407 other_node = rb_next(&state->rb_node); 408 if (other_node) { 409 other = rb_entry(other_node, struct extent_state, rb_node); 410 if (other->start == state->end + 1 && 411 other->state == state->state) { 412 merge_cb(tree, state, other); 413 state->end = other->end; 414 rb_erase(&other->rb_node, &tree->state); 415 RB_CLEAR_NODE(&other->rb_node); 416 free_extent_state(other); 417 } 418 } 419 } 420 421 static void set_state_cb(struct extent_io_tree *tree, 422 struct extent_state *state, unsigned *bits) 423 { 424 if (tree->ops && tree->ops->set_bit_hook) 425 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 426 } 427 428 static void clear_state_cb(struct extent_io_tree *tree, 429 struct extent_state *state, unsigned *bits) 430 { 431 if (tree->ops && tree->ops->clear_bit_hook) 432 tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), 433 state, bits); 434 } 435 436 static void set_state_bits(struct extent_io_tree *tree, 437 struct extent_state *state, unsigned *bits, 438 struct extent_changeset *changeset); 439 440 /* 441 * insert an extent_state struct into the tree. 'bits' are set on the 442 * struct before it is inserted. 443 * 444 * This may return -EEXIST if the extent is already there, in which case the 445 * state struct is freed. 446 * 447 * The tree lock is not taken internally. This is a utility function and 448 * probably isn't what you want to call (see set/clear_extent_bit). 449 */ 450 static int insert_state(struct extent_io_tree *tree, 451 struct extent_state *state, u64 start, u64 end, 452 struct rb_node ***p, 453 struct rb_node **parent, 454 unsigned *bits, struct extent_changeset *changeset) 455 { 456 struct rb_node *node; 457 458 if (end < start) 459 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 460 end, start); 461 state->start = start; 462 state->end = end; 463 464 set_state_bits(tree, state, bits, changeset); 465 466 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 467 if (node) { 468 struct extent_state *found; 469 found = rb_entry(node, struct extent_state, rb_node); 470 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 471 found->start, found->end, start, end); 472 return -EEXIST; 473 } 474 merge_state(tree, state); 475 return 0; 476 } 477 478 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 479 u64 split) 480 { 481 if (tree->ops && tree->ops->split_extent_hook) 482 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 483 } 484 485 /* 486 * split a given extent state struct in two, inserting the preallocated 487 * struct 'prealloc' as the newly created second half. 'split' indicates an 488 * offset inside 'orig' where it should be split. 489 * 490 * Before calling, 491 * the tree has 'orig' at [orig->start, orig->end]. 
After calling, there 492 * are two extent state structs in the tree: 493 * prealloc: [orig->start, split - 1] 494 * orig: [ split, orig->end ] 495 * 496 * The tree locks are not taken by this function. They need to be held 497 * by the caller. 498 */ 499 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 500 struct extent_state *prealloc, u64 split) 501 { 502 struct rb_node *node; 503 504 split_cb(tree, orig, split); 505 506 prealloc->start = orig->start; 507 prealloc->end = split - 1; 508 prealloc->state = orig->state; 509 orig->start = split; 510 511 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 512 &prealloc->rb_node, NULL, NULL); 513 if (node) { 514 free_extent_state(prealloc); 515 return -EEXIST; 516 } 517 return 0; 518 } 519 520 static struct extent_state *next_state(struct extent_state *state) 521 { 522 struct rb_node *next = rb_next(&state->rb_node); 523 if (next) 524 return rb_entry(next, struct extent_state, rb_node); 525 else 526 return NULL; 527 } 528 529 /* 530 * utility function to clear some bits in an extent state struct. 531 * it will optionally wake up any one waiting on this state (wake == 1). 532 * 533 * If no bits are set on the state struct after clearing things, the 534 * struct is freed and removed from the tree 535 */ 536 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 537 struct extent_state *state, 538 unsigned *bits, int wake, 539 struct extent_changeset *changeset) 540 { 541 struct extent_state *next; 542 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 543 544 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 545 u64 range = state->end - state->start + 1; 546 WARN_ON(range > tree->dirty_bytes); 547 tree->dirty_bytes -= range; 548 } 549 clear_state_cb(tree, state, bits); 550 add_extent_changeset(state, bits_to_clear, changeset, 0); 551 state->state &= ~bits_to_clear; 552 if (wake) 553 wake_up(&state->wq); 554 if (state->state == 0) { 555 next = next_state(state); 556 if (extent_state_in_tree(state)) { 557 rb_erase(&state->rb_node, &tree->state); 558 RB_CLEAR_NODE(&state->rb_node); 559 free_extent_state(state); 560 } else { 561 WARN_ON(1); 562 } 563 } else { 564 merge_state(tree, state); 565 next = next_state(state); 566 } 567 return next; 568 } 569 570 static struct extent_state * 571 alloc_extent_state_atomic(struct extent_state *prealloc) 572 { 573 if (!prealloc) 574 prealloc = alloc_extent_state(GFP_ATOMIC); 575 576 return prealloc; 577 } 578 579 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 580 { 581 btrfs_panic(tree_fs_info(tree), err, 582 "Locking error: Extent tree was modified by another thread while locked."); 583 } 584 585 /* 586 * clear some bits on a range in the tree. This may require splitting 587 * or inserting elements in the tree, so the gfp mask is used to 588 * indicate which allocations or sleeping are allowed. 589 * 590 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 591 * the given range from the tree regardless of state (ie for truncate). 592 * 593 * the range [start, end] is inclusive. 594 * 595 * This takes the tree lock, and returns 0 on success and < 0 on error. 
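 * If a cached_state is supplied and still covers 'start', the rbtree search is skipped and clearing resumes directly from that cached state.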
596 */ 597 static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 598 unsigned bits, int wake, int delete, 599 struct extent_state **cached_state, 600 gfp_t mask, struct extent_changeset *changeset) 601 { 602 struct extent_state *state; 603 struct extent_state *cached; 604 struct extent_state *prealloc = NULL; 605 struct rb_node *node; 606 u64 last_end; 607 int err; 608 int clear = 0; 609 610 btrfs_debug_check_extent_io_range(tree, start, end); 611 612 if (bits & EXTENT_DELALLOC) 613 bits |= EXTENT_NORESERVE; 614 615 if (delete) 616 bits |= ~EXTENT_CTLBITS; 617 bits |= EXTENT_FIRST_DELALLOC; 618 619 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 620 clear = 1; 621 again: 622 if (!prealloc && gfpflags_allow_blocking(mask)) { 623 /* 624 * Don't care for allocation failure here because we might end 625 * up not needing the pre-allocated extent state at all, which 626 * is the case if we only have in the tree extent states that 627 * cover our input range and don't cover too any other range. 628 * If we end up needing a new extent state we allocate it later. 629 */ 630 prealloc = alloc_extent_state(mask); 631 } 632 633 spin_lock(&tree->lock); 634 if (cached_state) { 635 cached = *cached_state; 636 637 if (clear) { 638 *cached_state = NULL; 639 cached_state = NULL; 640 } 641 642 if (cached && extent_state_in_tree(cached) && 643 cached->start <= start && cached->end > start) { 644 if (clear) 645 refcount_dec(&cached->refs); 646 state = cached; 647 goto hit_next; 648 } 649 if (clear) 650 free_extent_state(cached); 651 } 652 /* 653 * this search will find the extents that end after 654 * our range starts 655 */ 656 node = tree_search(tree, start); 657 if (!node) 658 goto out; 659 state = rb_entry(node, struct extent_state, rb_node); 660 hit_next: 661 if (state->start > end) 662 goto out; 663 WARN_ON(state->end < start); 664 last_end = state->end; 665 666 /* the state doesn't have the wanted bits, go ahead */ 667 if (!(state->state & bits)) { 668 state = next_state(state); 669 goto next; 670 } 671 672 /* 673 * | ---- desired range ---- | 674 * | state | or 675 * | ------------- state -------------- | 676 * 677 * We need to split the extent we found, and may flip 678 * bits on second half. 679 * 680 * If the extent we found extends past our range, we 681 * just split and search again. It'll get split again 682 * the next time though. 683 * 684 * If the extent we found is inside our range, we clear 685 * the desired bit on it. 
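 * A failed split here is treated as fatal: extent_io_tree_panic() is called below rather than unwinding with an error.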
686 */ 687 688 if (state->start < start) { 689 prealloc = alloc_extent_state_atomic(prealloc); 690 BUG_ON(!prealloc); 691 err = split_state(tree, state, prealloc, start); 692 if (err) 693 extent_io_tree_panic(tree, err); 694 695 prealloc = NULL; 696 if (err) 697 goto out; 698 if (state->end <= end) { 699 state = clear_state_bit(tree, state, &bits, wake, 700 changeset); 701 goto next; 702 } 703 goto search_again; 704 } 705 /* 706 * | ---- desired range ---- | 707 * | state | 708 * We need to split the extent, and clear the bit 709 * on the first half 710 */ 711 if (state->start <= end && state->end > end) { 712 prealloc = alloc_extent_state_atomic(prealloc); 713 BUG_ON(!prealloc); 714 err = split_state(tree, state, prealloc, end + 1); 715 if (err) 716 extent_io_tree_panic(tree, err); 717 718 if (wake) 719 wake_up(&state->wq); 720 721 clear_state_bit(tree, prealloc, &bits, wake, changeset); 722 723 prealloc = NULL; 724 goto out; 725 } 726 727 state = clear_state_bit(tree, state, &bits, wake, changeset); 728 next: 729 if (last_end == (u64)-1) 730 goto out; 731 start = last_end + 1; 732 if (start <= end && state && !need_resched()) 733 goto hit_next; 734 735 search_again: 736 if (start > end) 737 goto out; 738 spin_unlock(&tree->lock); 739 if (gfpflags_allow_blocking(mask)) 740 cond_resched(); 741 goto again; 742 743 out: 744 spin_unlock(&tree->lock); 745 if (prealloc) 746 free_extent_state(prealloc); 747 748 return 0; 749 750 } 751 752 static void wait_on_state(struct extent_io_tree *tree, 753 struct extent_state *state) 754 __releases(tree->lock) 755 __acquires(tree->lock) 756 { 757 DEFINE_WAIT(wait); 758 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 759 spin_unlock(&tree->lock); 760 schedule(); 761 spin_lock(&tree->lock); 762 finish_wait(&state->wq, &wait); 763 } 764 765 /* 766 * waits for one or more bits to clear on a range in the state tree. 767 * The range [start, end] is inclusive. 
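 * (wait_on_state() drops tree->lock around each sleep and re-acquires it before returning.)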
768 * The tree lock is taken by this function 769 */ 770 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 771 unsigned long bits) 772 { 773 struct extent_state *state; 774 struct rb_node *node; 775 776 btrfs_debug_check_extent_io_range(tree, start, end); 777 778 spin_lock(&tree->lock); 779 again: 780 while (1) { 781 /* 782 * this search will find all the extents that end after 783 * our range starts 784 */ 785 node = tree_search(tree, start); 786 process_node: 787 if (!node) 788 break; 789 790 state = rb_entry(node, struct extent_state, rb_node); 791 792 if (state->start > end) 793 goto out; 794 795 if (state->state & bits) { 796 start = state->start; 797 refcount_inc(&state->refs); 798 wait_on_state(tree, state); 799 free_extent_state(state); 800 goto again; 801 } 802 start = state->end + 1; 803 804 if (start > end) 805 break; 806 807 if (!cond_resched_lock(&tree->lock)) { 808 node = rb_next(node); 809 goto process_node; 810 } 811 } 812 out: 813 spin_unlock(&tree->lock); 814 } 815 816 static void set_state_bits(struct extent_io_tree *tree, 817 struct extent_state *state, 818 unsigned *bits, struct extent_changeset *changeset) 819 { 820 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 821 822 set_state_cb(tree, state, bits); 823 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 824 u64 range = state->end - state->start + 1; 825 tree->dirty_bytes += range; 826 } 827 add_extent_changeset(state, bits_to_set, changeset, 1); 828 state->state |= bits_to_set; 829 } 830 831 static void cache_state_if_flags(struct extent_state *state, 832 struct extent_state **cached_ptr, 833 unsigned flags) 834 { 835 if (cached_ptr && !(*cached_ptr)) { 836 if (!flags || (state->state & flags)) { 837 *cached_ptr = state; 838 refcount_inc(&state->refs); 839 } 840 } 841 } 842 843 static void cache_state(struct extent_state *state, 844 struct extent_state **cached_ptr) 845 { 846 return cache_state_if_flags(state, cached_ptr, 847 EXTENT_IOBITS | EXTENT_BOUNDARY); 848 } 849 850 /* 851 * set some bits on a range in the tree. This may require allocations or 852 * sleeping, so the gfp mask is used to indicate what is allowed. 853 * 854 * If any of the exclusive bits are set, this will fail with -EEXIST if some 855 * part of the range already has the desired bits set. The start of the 856 * existing range is returned in failed_start in this case. 857 * 858 * [start, end] is inclusive This takes the tree lock. 859 */ 860 861 static int __must_check 862 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 863 unsigned bits, unsigned exclusive_bits, 864 u64 *failed_start, struct extent_state **cached_state, 865 gfp_t mask, struct extent_changeset *changeset) 866 { 867 struct extent_state *state; 868 struct extent_state *prealloc = NULL; 869 struct rb_node *node; 870 struct rb_node **p; 871 struct rb_node *parent; 872 int err = 0; 873 u64 last_start; 874 u64 last_end; 875 876 btrfs_debug_check_extent_io_range(tree, start, end); 877 878 bits |= EXTENT_FIRST_DELALLOC; 879 again: 880 if (!prealloc && gfpflags_allow_blocking(mask)) { 881 /* 882 * Don't care for allocation failure here because we might end 883 * up not needing the pre-allocated extent state at all, which 884 * is the case if we only have in the tree extent states that 885 * cover our input range and don't cover too any other range. 886 * If we end up needing a new extent state we allocate it later. 
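 * Should a later GFP_ATOMIC allocation fail after all, the BUG_ON(!prealloc) checks below turn that into a hard failure.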
887 */ 888 prealloc = alloc_extent_state(mask); 889 } 890 891 spin_lock(&tree->lock); 892 if (cached_state && *cached_state) { 893 state = *cached_state; 894 if (state->start <= start && state->end > start && 895 extent_state_in_tree(state)) { 896 node = &state->rb_node; 897 goto hit_next; 898 } 899 } 900 /* 901 * this search will find all the extents that end after 902 * our range starts. 903 */ 904 node = tree_search_for_insert(tree, start, &p, &parent); 905 if (!node) { 906 prealloc = alloc_extent_state_atomic(prealloc); 907 BUG_ON(!prealloc); 908 err = insert_state(tree, prealloc, start, end, 909 &p, &parent, &bits, changeset); 910 if (err) 911 extent_io_tree_panic(tree, err); 912 913 cache_state(prealloc, cached_state); 914 prealloc = NULL; 915 goto out; 916 } 917 state = rb_entry(node, struct extent_state, rb_node); 918 hit_next: 919 last_start = state->start; 920 last_end = state->end; 921 922 /* 923 * | ---- desired range ---- | 924 * | state | 925 * 926 * Just lock what we found and keep going 927 */ 928 if (state->start == start && state->end <= end) { 929 if (state->state & exclusive_bits) { 930 *failed_start = state->start; 931 err = -EEXIST; 932 goto out; 933 } 934 935 set_state_bits(tree, state, &bits, changeset); 936 cache_state(state, cached_state); 937 merge_state(tree, state); 938 if (last_end == (u64)-1) 939 goto out; 940 start = last_end + 1; 941 state = next_state(state); 942 if (start < end && state && state->start == start && 943 !need_resched()) 944 goto hit_next; 945 goto search_again; 946 } 947 948 /* 949 * | ---- desired range ---- | 950 * | state | 951 * or 952 * | ------------- state -------------- | 953 * 954 * We need to split the extent we found, and may flip bits on 955 * second half. 956 * 957 * If the extent we found extends past our 958 * range, we just split and search again. It'll get split 959 * again the next time though. 960 * 961 * If the extent we found is inside our range, we set the 962 * desired bit on it. 963 */ 964 if (state->start < start) { 965 if (state->state & exclusive_bits) { 966 *failed_start = start; 967 err = -EEXIST; 968 goto out; 969 } 970 971 prealloc = alloc_extent_state_atomic(prealloc); 972 BUG_ON(!prealloc); 973 err = split_state(tree, state, prealloc, start); 974 if (err) 975 extent_io_tree_panic(tree, err); 976 977 prealloc = NULL; 978 if (err) 979 goto out; 980 if (state->end <= end) { 981 set_state_bits(tree, state, &bits, changeset); 982 cache_state(state, cached_state); 983 merge_state(tree, state); 984 if (last_end == (u64)-1) 985 goto out; 986 start = last_end + 1; 987 state = next_state(state); 988 if (start < end && state && state->start == start && 989 !need_resched()) 990 goto hit_next; 991 } 992 goto search_again; 993 } 994 /* 995 * | ---- desired range ---- | 996 * | state | or | state | 997 * 998 * There's a hole, we need to insert something in it and 999 * ignore the extent we found. 1000 */ 1001 if (state->start > start) { 1002 u64 this_end; 1003 if (end < last_start) 1004 this_end = end; 1005 else 1006 this_end = last_start - 1; 1007 1008 prealloc = alloc_extent_state_atomic(prealloc); 1009 BUG_ON(!prealloc); 1010 1011 /* 1012 * Avoid to free 'prealloc' if it can be merged with 1013 * the later extent. 
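 * Once insert_state() succeeds, the struct is live in the tree (possibly merged with a neighbour), which is why prealloc is reset to NULL below instead of being freed on the out: path.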
1014 */ 1015 err = insert_state(tree, prealloc, start, this_end, 1016 NULL, NULL, &bits, changeset); 1017 if (err) 1018 extent_io_tree_panic(tree, err); 1019 1020 cache_state(prealloc, cached_state); 1021 prealloc = NULL; 1022 start = this_end + 1; 1023 goto search_again; 1024 } 1025 /* 1026 * | ---- desired range ---- | 1027 * | state | 1028 * We need to split the extent, and set the bit 1029 * on the first half 1030 */ 1031 if (state->start <= end && state->end > end) { 1032 if (state->state & exclusive_bits) { 1033 *failed_start = start; 1034 err = -EEXIST; 1035 goto out; 1036 } 1037 1038 prealloc = alloc_extent_state_atomic(prealloc); 1039 BUG_ON(!prealloc); 1040 err = split_state(tree, state, prealloc, end + 1); 1041 if (err) 1042 extent_io_tree_panic(tree, err); 1043 1044 set_state_bits(tree, prealloc, &bits, changeset); 1045 cache_state(prealloc, cached_state); 1046 merge_state(tree, prealloc); 1047 prealloc = NULL; 1048 goto out; 1049 } 1050 1051 search_again: 1052 if (start > end) 1053 goto out; 1054 spin_unlock(&tree->lock); 1055 if (gfpflags_allow_blocking(mask)) 1056 cond_resched(); 1057 goto again; 1058 1059 out: 1060 spin_unlock(&tree->lock); 1061 if (prealloc) 1062 free_extent_state(prealloc); 1063 1064 return err; 1065 1066 } 1067 1068 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1069 unsigned bits, u64 * failed_start, 1070 struct extent_state **cached_state, gfp_t mask) 1071 { 1072 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1073 cached_state, mask, NULL); 1074 } 1075 1076 1077 /** 1078 * convert_extent_bit - convert all bits in a given range from one bit to 1079 * another 1080 * @tree: the io tree to search 1081 * @start: the start offset in bytes 1082 * @end: the end offset in bytes (inclusive) 1083 * @bits: the bits to set in this range 1084 * @clear_bits: the bits to clear in this range 1085 * @cached_state: state that we're going to cache 1086 * 1087 * This will go through and set bits for the given range. If any states exist 1088 * already in this range they are set with the given bit and cleared of the 1089 * clear_bits. This is only meant to be used by things that are mergeable, ie 1090 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1091 * boundary bits like LOCK. 1092 * 1093 * All allocations are done with GFP_NOFS. 1094 */ 1095 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1096 unsigned bits, unsigned clear_bits, 1097 struct extent_state **cached_state) 1098 { 1099 struct extent_state *state; 1100 struct extent_state *prealloc = NULL; 1101 struct rb_node *node; 1102 struct rb_node **p; 1103 struct rb_node *parent; 1104 int err = 0; 1105 u64 last_start; 1106 u64 last_end; 1107 bool first_iteration = true; 1108 1109 btrfs_debug_check_extent_io_range(tree, start, end); 1110 1111 again: 1112 if (!prealloc) { 1113 /* 1114 * Best effort, don't worry if extent state allocation fails 1115 * here for the first iteration. We might have a cached state 1116 * that matches exactly the target range, in which case no 1117 * extent state allocations are needed. We'll only know this 1118 * after locking the tree. 
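 * On later iterations a failed allocation is reported to the caller as -ENOMEM.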
1119 */ 1120 prealloc = alloc_extent_state(GFP_NOFS); 1121 if (!prealloc && !first_iteration) 1122 return -ENOMEM; 1123 } 1124 1125 spin_lock(&tree->lock); 1126 if (cached_state && *cached_state) { 1127 state = *cached_state; 1128 if (state->start <= start && state->end > start && 1129 extent_state_in_tree(state)) { 1130 node = &state->rb_node; 1131 goto hit_next; 1132 } 1133 } 1134 1135 /* 1136 * this search will find all the extents that end after 1137 * our range starts. 1138 */ 1139 node = tree_search_for_insert(tree, start, &p, &parent); 1140 if (!node) { 1141 prealloc = alloc_extent_state_atomic(prealloc); 1142 if (!prealloc) { 1143 err = -ENOMEM; 1144 goto out; 1145 } 1146 err = insert_state(tree, prealloc, start, end, 1147 &p, &parent, &bits, NULL); 1148 if (err) 1149 extent_io_tree_panic(tree, err); 1150 cache_state(prealloc, cached_state); 1151 prealloc = NULL; 1152 goto out; 1153 } 1154 state = rb_entry(node, struct extent_state, rb_node); 1155 hit_next: 1156 last_start = state->start; 1157 last_end = state->end; 1158 1159 /* 1160 * | ---- desired range ---- | 1161 * | state | 1162 * 1163 * Just lock what we found and keep going 1164 */ 1165 if (state->start == start && state->end <= end) { 1166 set_state_bits(tree, state, &bits, NULL); 1167 cache_state(state, cached_state); 1168 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1169 if (last_end == (u64)-1) 1170 goto out; 1171 start = last_end + 1; 1172 if (start < end && state && state->start == start && 1173 !need_resched()) 1174 goto hit_next; 1175 goto search_again; 1176 } 1177 1178 /* 1179 * | ---- desired range ---- | 1180 * | state | 1181 * or 1182 * | ------------- state -------------- | 1183 * 1184 * We need to split the extent we found, and may flip bits on 1185 * second half. 1186 * 1187 * If the extent we found extends past our 1188 * range, we just split and search again. It'll get split 1189 * again the next time though. 1190 * 1191 * If the extent we found is inside our range, we set the 1192 * desired bit on it. 1193 */ 1194 if (state->start < start) { 1195 prealloc = alloc_extent_state_atomic(prealloc); 1196 if (!prealloc) { 1197 err = -ENOMEM; 1198 goto out; 1199 } 1200 err = split_state(tree, state, prealloc, start); 1201 if (err) 1202 extent_io_tree_panic(tree, err); 1203 prealloc = NULL; 1204 if (err) 1205 goto out; 1206 if (state->end <= end) { 1207 set_state_bits(tree, state, &bits, NULL); 1208 cache_state(state, cached_state); 1209 state = clear_state_bit(tree, state, &clear_bits, 0, 1210 NULL); 1211 if (last_end == (u64)-1) 1212 goto out; 1213 start = last_end + 1; 1214 if (start < end && state && state->start == start && 1215 !need_resched()) 1216 goto hit_next; 1217 } 1218 goto search_again; 1219 } 1220 /* 1221 * | ---- desired range ---- | 1222 * | state | or | state | 1223 * 1224 * There's a hole, we need to insert something in it and 1225 * ignore the extent we found. 1226 */ 1227 if (state->start > start) { 1228 u64 this_end; 1229 if (end < last_start) 1230 this_end = end; 1231 else 1232 this_end = last_start - 1; 1233 1234 prealloc = alloc_extent_state_atomic(prealloc); 1235 if (!prealloc) { 1236 err = -ENOMEM; 1237 goto out; 1238 } 1239 1240 /* 1241 * Avoid to free 'prealloc' if it can be merged with 1242 * the later extent. 
1243 */ 1244 err = insert_state(tree, prealloc, start, this_end, 1245 NULL, NULL, &bits, NULL); 1246 if (err) 1247 extent_io_tree_panic(tree, err); 1248 cache_state(prealloc, cached_state); 1249 prealloc = NULL; 1250 start = this_end + 1; 1251 goto search_again; 1252 } 1253 /* 1254 * | ---- desired range ---- | 1255 * | state | 1256 * We need to split the extent, and set the bit 1257 * on the first half 1258 */ 1259 if (state->start <= end && state->end > end) { 1260 prealloc = alloc_extent_state_atomic(prealloc); 1261 if (!prealloc) { 1262 err = -ENOMEM; 1263 goto out; 1264 } 1265 1266 err = split_state(tree, state, prealloc, end + 1); 1267 if (err) 1268 extent_io_tree_panic(tree, err); 1269 1270 set_state_bits(tree, prealloc, &bits, NULL); 1271 cache_state(prealloc, cached_state); 1272 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1273 prealloc = NULL; 1274 goto out; 1275 } 1276 1277 search_again: 1278 if (start > end) 1279 goto out; 1280 spin_unlock(&tree->lock); 1281 cond_resched(); 1282 first_iteration = false; 1283 goto again; 1284 1285 out: 1286 spin_unlock(&tree->lock); 1287 if (prealloc) 1288 free_extent_state(prealloc); 1289 1290 return err; 1291 } 1292 1293 /* wrappers around set/clear extent bit */ 1294 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1295 unsigned bits, struct extent_changeset *changeset) 1296 { 1297 /* 1298 * We don't support EXTENT_LOCKED yet, as the current changeset will 1299 * record any bits changed, so for EXTENT_LOCKED case, it will 1300 * either fail with -EEXIST or changeset will record the whole 1301 * range. 1302 */ 1303 BUG_ON(bits & EXTENT_LOCKED); 1304 1305 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1306 changeset); 1307 } 1308 1309 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1310 unsigned bits, int wake, int delete, 1311 struct extent_state **cached, gfp_t mask) 1312 { 1313 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1314 cached, mask, NULL); 1315 } 1316 1317 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1318 unsigned bits, struct extent_changeset *changeset) 1319 { 1320 /* 1321 * Don't support EXTENT_LOCKED case, same reason as 1322 * set_record_extent_bits(). 1323 */ 1324 BUG_ON(bits & EXTENT_LOCKED); 1325 1326 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1327 changeset); 1328 } 1329 1330 /* 1331 * either insert or lock the state struct between start and end; use mask to tell 1332 * us if waiting is desired.
1333 */ 1334 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1335 struct extent_state **cached_state) 1336 { 1337 int err; 1338 u64 failed_start; 1339 1340 while (1) { 1341 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1342 EXTENT_LOCKED, &failed_start, 1343 cached_state, GFP_NOFS, NULL); 1344 if (err == -EEXIST) { 1345 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1346 start = failed_start; 1347 } else 1348 break; 1349 WARN_ON(start > end); 1350 } 1351 return err; 1352 } 1353 1354 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1355 { 1356 int err; 1357 u64 failed_start; 1358 1359 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1360 &failed_start, NULL, GFP_NOFS, NULL); 1361 if (err == -EEXIST) { 1362 if (failed_start > start) 1363 clear_extent_bit(tree, start, failed_start - 1, 1364 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1365 return 0; 1366 } 1367 return 1; 1368 } 1369 1370 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1371 { 1372 unsigned long index = start >> PAGE_SHIFT; 1373 unsigned long end_index = end >> PAGE_SHIFT; 1374 struct page *page; 1375 1376 while (index <= end_index) { 1377 page = find_get_page(inode->i_mapping, index); 1378 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1379 clear_page_dirty_for_io(page); 1380 put_page(page); 1381 index++; 1382 } 1383 } 1384 1385 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1386 { 1387 unsigned long index = start >> PAGE_SHIFT; 1388 unsigned long end_index = end >> PAGE_SHIFT; 1389 struct page *page; 1390 1391 while (index <= end_index) { 1392 page = find_get_page(inode->i_mapping, index); 1393 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1394 __set_page_dirty_nobuffers(page); 1395 account_page_redirty(page); 1396 put_page(page); 1397 index++; 1398 } 1399 } 1400 1401 /* 1402 * helper function to set both pages and extents in the tree writeback 1403 */ 1404 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1405 { 1406 unsigned long index = start >> PAGE_SHIFT; 1407 unsigned long end_index = end >> PAGE_SHIFT; 1408 struct page *page; 1409 1410 while (index <= end_index) { 1411 page = find_get_page(tree->mapping, index); 1412 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1413 set_page_writeback(page); 1414 put_page(page); 1415 index++; 1416 } 1417 } 1418 1419 /* find the first state struct with 'bits' set after 'start', and 1420 * return it. tree->lock must be held. NULL will returned if 1421 * nothing was found after 'start' 1422 */ 1423 static struct extent_state * 1424 find_first_extent_bit_state(struct extent_io_tree *tree, 1425 u64 start, unsigned bits) 1426 { 1427 struct rb_node *node; 1428 struct extent_state *state; 1429 1430 /* 1431 * this search will find all the extents that end after 1432 * our range starts. 1433 */ 1434 node = tree_search(tree, start); 1435 if (!node) 1436 goto out; 1437 1438 while (1) { 1439 state = rb_entry(node, struct extent_state, rb_node); 1440 if (state->end >= start && (state->state & bits)) 1441 return state; 1442 1443 node = rb_next(node); 1444 if (!node) 1445 break; 1446 } 1447 out: 1448 return NULL; 1449 } 1450 1451 /* 1452 * find the first offset in the io tree with 'bits' set. zero is 1453 * returned if we find something, and *start_ret and *end_ret are 1454 * set to reflect the state struct that was found. 1455 * 1456 * If nothing was found, 1 is returned. 
If found something, return 0. 1457 */ 1458 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1459 u64 *start_ret, u64 *end_ret, unsigned bits, 1460 struct extent_state **cached_state) 1461 { 1462 struct extent_state *state; 1463 struct rb_node *n; 1464 int ret = 1; 1465 1466 spin_lock(&tree->lock); 1467 if (cached_state && *cached_state) { 1468 state = *cached_state; 1469 if (state->end == start - 1 && extent_state_in_tree(state)) { 1470 n = rb_next(&state->rb_node); 1471 while (n) { 1472 state = rb_entry(n, struct extent_state, 1473 rb_node); 1474 if (state->state & bits) 1475 goto got_it; 1476 n = rb_next(n); 1477 } 1478 free_extent_state(*cached_state); 1479 *cached_state = NULL; 1480 goto out; 1481 } 1482 free_extent_state(*cached_state); 1483 *cached_state = NULL; 1484 } 1485 1486 state = find_first_extent_bit_state(tree, start, bits); 1487 got_it: 1488 if (state) { 1489 cache_state_if_flags(state, cached_state, 0); 1490 *start_ret = state->start; 1491 *end_ret = state->end; 1492 ret = 0; 1493 } 1494 out: 1495 spin_unlock(&tree->lock); 1496 return ret; 1497 } 1498 1499 /* 1500 * find a contiguous range of bytes in the file marked as delalloc, not 1501 * more than 'max_bytes'. start and end are used to return the range, 1502 * 1503 * 1 is returned if we find something, 0 if nothing was in the tree 1504 */ 1505 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1506 u64 *start, u64 *end, u64 max_bytes, 1507 struct extent_state **cached_state) 1508 { 1509 struct rb_node *node; 1510 struct extent_state *state; 1511 u64 cur_start = *start; 1512 u64 found = 0; 1513 u64 total_bytes = 0; 1514 1515 spin_lock(&tree->lock); 1516 1517 /* 1518 * this search will find all the extents that end after 1519 * our range starts. 1520 */ 1521 node = tree_search(tree, cur_start); 1522 if (!node) { 1523 if (!found) 1524 *end = (u64)-1; 1525 goto out; 1526 } 1527 1528 while (1) { 1529 state = rb_entry(node, struct extent_state, rb_node); 1530 if (found && (state->start != cur_start || 1531 (state->state & EXTENT_BOUNDARY))) { 1532 goto out; 1533 } 1534 if (!(state->state & EXTENT_DELALLOC)) { 1535 if (!found) 1536 *end = state->end; 1537 goto out; 1538 } 1539 if (!found) { 1540 *start = state->start; 1541 *cached_state = state; 1542 refcount_inc(&state->refs); 1543 } 1544 found++; 1545 *end = state->end; 1546 cur_start = state->end + 1; 1547 node = rb_next(node); 1548 total_bytes += state->end - state->start + 1; 1549 if (total_bytes >= max_bytes) 1550 break; 1551 if (!node) 1552 break; 1553 } 1554 out: 1555 spin_unlock(&tree->lock); 1556 return found; 1557 } 1558 1559 static int __process_pages_contig(struct address_space *mapping, 1560 struct page *locked_page, 1561 pgoff_t start_index, pgoff_t end_index, 1562 unsigned long page_ops, pgoff_t *index_ret); 1563 1564 static noinline void __unlock_for_delalloc(struct inode *inode, 1565 struct page *locked_page, 1566 u64 start, u64 end) 1567 { 1568 unsigned long index = start >> PAGE_SHIFT; 1569 unsigned long end_index = end >> PAGE_SHIFT; 1570 1571 ASSERT(locked_page); 1572 if (index == locked_page->index && end_index == index) 1573 return; 1574 1575 __process_pages_contig(inode->i_mapping, locked_page, index, end_index, 1576 PAGE_UNLOCK, NULL); 1577 } 1578 1579 static noinline int lock_delalloc_pages(struct inode *inode, 1580 struct page *locked_page, 1581 u64 delalloc_start, 1582 u64 delalloc_end) 1583 { 1584 unsigned long index = delalloc_start >> PAGE_SHIFT; 1585 unsigned long index_ret = index; 1586 unsigned long 
end_index = delalloc_end >> PAGE_SHIFT; 1587 int ret; 1588 1589 ASSERT(locked_page); 1590 if (index == locked_page->index && index == end_index) 1591 return 0; 1592 1593 ret = __process_pages_contig(inode->i_mapping, locked_page, index, 1594 end_index, PAGE_LOCK, &index_ret); 1595 if (ret == -EAGAIN) 1596 __unlock_for_delalloc(inode, locked_page, delalloc_start, 1597 (u64)index_ret << PAGE_SHIFT); 1598 return ret; 1599 } 1600 1601 /* 1602 * find a contiguous range of bytes in the file marked as delalloc, not 1603 * more than 'max_bytes'. start and end are used to return the range, 1604 * 1605 * 1 is returned if we find something, 0 if nothing was in the tree 1606 */ 1607 STATIC u64 find_lock_delalloc_range(struct inode *inode, 1608 struct extent_io_tree *tree, 1609 struct page *locked_page, u64 *start, 1610 u64 *end, u64 max_bytes) 1611 { 1612 u64 delalloc_start; 1613 u64 delalloc_end; 1614 u64 found; 1615 struct extent_state *cached_state = NULL; 1616 int ret; 1617 int loops = 0; 1618 1619 again: 1620 /* step one, find a bunch of delalloc bytes starting at start */ 1621 delalloc_start = *start; 1622 delalloc_end = 0; 1623 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1624 max_bytes, &cached_state); 1625 if (!found || delalloc_end <= *start) { 1626 *start = delalloc_start; 1627 *end = delalloc_end; 1628 free_extent_state(cached_state); 1629 return 0; 1630 } 1631 1632 /* 1633 * start comes from the offset of locked_page. We have to lock 1634 * pages in order, so we can't process delalloc bytes before 1635 * locked_page 1636 */ 1637 if (delalloc_start < *start) 1638 delalloc_start = *start; 1639 1640 /* 1641 * make sure to limit the number of pages we try to lock down 1642 */ 1643 if (delalloc_end + 1 - delalloc_start > max_bytes) 1644 delalloc_end = delalloc_start + max_bytes - 1; 1645 1646 /* step two, lock all the pages after the page that has start */ 1647 ret = lock_delalloc_pages(inode, locked_page, 1648 delalloc_start, delalloc_end); 1649 if (ret == -EAGAIN) { 1650 /* some of the pages are gone, lets avoid looping by 1651 * shortening the size of the delalloc range we're searching 1652 */ 1653 free_extent_state(cached_state); 1654 cached_state = NULL; 1655 if (!loops) { 1656 max_bytes = PAGE_SIZE; 1657 loops = 1; 1658 goto again; 1659 } else { 1660 found = 0; 1661 goto out_failed; 1662 } 1663 } 1664 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1665 1666 /* step three, lock the state bits for the whole range */ 1667 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1668 1669 /* then test to make sure it is all still delalloc */ 1670 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1671 EXTENT_DELALLOC, 1, cached_state); 1672 if (!ret) { 1673 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1674 &cached_state, GFP_NOFS); 1675 __unlock_for_delalloc(inode, locked_page, 1676 delalloc_start, delalloc_end); 1677 cond_resched(); 1678 goto again; 1679 } 1680 free_extent_state(cached_state); 1681 *start = delalloc_start; 1682 *end = delalloc_end; 1683 out_failed: 1684 return found; 1685 } 1686 1687 static int __process_pages_contig(struct address_space *mapping, 1688 struct page *locked_page, 1689 pgoff_t start_index, pgoff_t end_index, 1690 unsigned long page_ops, pgoff_t *index_ret) 1691 { 1692 unsigned long nr_pages = end_index - start_index + 1; 1693 unsigned long pages_locked = 0; 1694 pgoff_t index = start_index; 1695 struct page *pages[16]; 1696 unsigned ret; 1697 int err = 0; 1698 int i; 1699 1700 if (page_ops & PAGE_LOCK) 
{ 1701 ASSERT(page_ops == PAGE_LOCK); 1702 ASSERT(index_ret && *index_ret == start_index); 1703 } 1704 1705 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1706 mapping_set_error(mapping, -EIO); 1707 1708 while (nr_pages > 0) { 1709 ret = find_get_pages_contig(mapping, index, 1710 min_t(unsigned long, 1711 nr_pages, ARRAY_SIZE(pages)), pages); 1712 if (ret == 0) { 1713 /* 1714 * Only if we're going to lock these pages, 1715 * can we find nothing at @index. 1716 */ 1717 ASSERT(page_ops & PAGE_LOCK); 1718 err = -EAGAIN; 1719 goto out; 1720 } 1721 1722 for (i = 0; i < ret; i++) { 1723 if (page_ops & PAGE_SET_PRIVATE2) 1724 SetPagePrivate2(pages[i]); 1725 1726 if (pages[i] == locked_page) { 1727 put_page(pages[i]); 1728 pages_locked++; 1729 continue; 1730 } 1731 if (page_ops & PAGE_CLEAR_DIRTY) 1732 clear_page_dirty_for_io(pages[i]); 1733 if (page_ops & PAGE_SET_WRITEBACK) 1734 set_page_writeback(pages[i]); 1735 if (page_ops & PAGE_SET_ERROR) 1736 SetPageError(pages[i]); 1737 if (page_ops & PAGE_END_WRITEBACK) 1738 end_page_writeback(pages[i]); 1739 if (page_ops & PAGE_UNLOCK) 1740 unlock_page(pages[i]); 1741 if (page_ops & PAGE_LOCK) { 1742 lock_page(pages[i]); 1743 if (!PageDirty(pages[i]) || 1744 pages[i]->mapping != mapping) { 1745 unlock_page(pages[i]); 1746 put_page(pages[i]); 1747 err = -EAGAIN; 1748 goto out; 1749 } 1750 } 1751 put_page(pages[i]); 1752 pages_locked++; 1753 } 1754 nr_pages -= ret; 1755 index += ret; 1756 cond_resched(); 1757 } 1758 out: 1759 if (err && index_ret) 1760 *index_ret = start_index + pages_locked - 1; 1761 return err; 1762 } 1763 1764 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1765 u64 delalloc_end, struct page *locked_page, 1766 unsigned clear_bits, 1767 unsigned long page_ops) 1768 { 1769 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, 1770 NULL, GFP_NOFS); 1771 1772 __process_pages_contig(inode->i_mapping, locked_page, 1773 start >> PAGE_SHIFT, end >> PAGE_SHIFT, 1774 page_ops, NULL); 1775 } 1776 1777 /* 1778 * count the number of bytes in the tree that have a given bit(s) 1779 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1780 * cached. The total number found is returned. 1781 */ 1782 u64 count_range_bits(struct extent_io_tree *tree, 1783 u64 *start, u64 search_end, u64 max_bytes, 1784 unsigned bits, int contig) 1785 { 1786 struct rb_node *node; 1787 struct extent_state *state; 1788 u64 cur_start = *start; 1789 u64 total_bytes = 0; 1790 u64 last = 0; 1791 int found = 0; 1792 1793 if (WARN_ON(search_end <= cur_start)) 1794 return 0; 1795 1796 spin_lock(&tree->lock); 1797 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1798 total_bytes = tree->dirty_bytes; 1799 goto out; 1800 } 1801 /* 1802 * this search will find all the extents that end after 1803 * our range starts. 
1804 */ 1805 node = tree_search(tree, cur_start); 1806 if (!node) 1807 goto out; 1808 1809 while (1) { 1810 state = rb_entry(node, struct extent_state, rb_node); 1811 if (state->start > search_end) 1812 break; 1813 if (contig && found && state->start > last + 1) 1814 break; 1815 if (state->end >= cur_start && (state->state & bits) == bits) { 1816 total_bytes += min(search_end, state->end) + 1 - 1817 max(cur_start, state->start); 1818 if (total_bytes >= max_bytes) 1819 break; 1820 if (!found) { 1821 *start = max(cur_start, state->start); 1822 found = 1; 1823 } 1824 last = state->end; 1825 } else if (contig && found) { 1826 break; 1827 } 1828 node = rb_next(node); 1829 if (!node) 1830 break; 1831 } 1832 out: 1833 spin_unlock(&tree->lock); 1834 return total_bytes; 1835 } 1836 1837 /* 1838 * set the private field for a given byte offset in the tree. If there isn't 1839 * an extent_state there already, this does nothing. 1840 */ 1841 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1842 struct io_failure_record *failrec) 1843 { 1844 struct rb_node *node; 1845 struct extent_state *state; 1846 int ret = 0; 1847 1848 spin_lock(&tree->lock); 1849 /* 1850 * this search will find all the extents that end after 1851 * our range starts. 1852 */ 1853 node = tree_search(tree, start); 1854 if (!node) { 1855 ret = -ENOENT; 1856 goto out; 1857 } 1858 state = rb_entry(node, struct extent_state, rb_node); 1859 if (state->start != start) { 1860 ret = -ENOENT; 1861 goto out; 1862 } 1863 state->failrec = failrec; 1864 out: 1865 spin_unlock(&tree->lock); 1866 return ret; 1867 } 1868 1869 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1870 struct io_failure_record **failrec) 1871 { 1872 struct rb_node *node; 1873 struct extent_state *state; 1874 int ret = 0; 1875 1876 spin_lock(&tree->lock); 1877 /* 1878 * this search will find all the extents that end after 1879 * our range starts. 1880 */ 1881 node = tree_search(tree, start); 1882 if (!node) { 1883 ret = -ENOENT; 1884 goto out; 1885 } 1886 state = rb_entry(node, struct extent_state, rb_node); 1887 if (state->start != start) { 1888 ret = -ENOENT; 1889 goto out; 1890 } 1891 *failrec = state->failrec; 1892 out: 1893 spin_unlock(&tree->lock); 1894 return ret; 1895 } 1896 1897 /* 1898 * searches a range in the state tree for a given mask. 1899 * If 'filled' == 1, this returns 1 only if every extent in the tree 1900 * has the bits set. Otherwise, 1 is returned if any bit in the 1901 * range is found set. 
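 * 'cached' is an optional hint; when it is still in the tree and covers 'start', the walk begins from it instead of searching from the root.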
1902 */ 1903 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1904 unsigned bits, int filled, struct extent_state *cached) 1905 { 1906 struct extent_state *state = NULL; 1907 struct rb_node *node; 1908 int bitset = 0; 1909 1910 spin_lock(&tree->lock); 1911 if (cached && extent_state_in_tree(cached) && cached->start <= start && 1912 cached->end > start) 1913 node = &cached->rb_node; 1914 else 1915 node = tree_search(tree, start); 1916 while (node && start <= end) { 1917 state = rb_entry(node, struct extent_state, rb_node); 1918 1919 if (filled && state->start > start) { 1920 bitset = 0; 1921 break; 1922 } 1923 1924 if (state->start > end) 1925 break; 1926 1927 if (state->state & bits) { 1928 bitset = 1; 1929 if (!filled) 1930 break; 1931 } else if (filled) { 1932 bitset = 0; 1933 break; 1934 } 1935 1936 if (state->end == (u64)-1) 1937 break; 1938 1939 start = state->end + 1; 1940 if (start > end) 1941 break; 1942 node = rb_next(node); 1943 if (!node) { 1944 if (filled) 1945 bitset = 0; 1946 break; 1947 } 1948 } 1949 spin_unlock(&tree->lock); 1950 return bitset; 1951 } 1952 1953 /* 1954 * helper function to set a given page up to date if all the 1955 * extents in the tree for that page are up to date 1956 */ 1957 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1958 { 1959 u64 start = page_offset(page); 1960 u64 end = start + PAGE_SIZE - 1; 1961 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1962 SetPageUptodate(page); 1963 } 1964 1965 int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) 1966 { 1967 int ret; 1968 int err = 0; 1969 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 1970 1971 set_state_failrec(failure_tree, rec->start, NULL); 1972 ret = clear_extent_bits(failure_tree, rec->start, 1973 rec->start + rec->len - 1, 1974 EXTENT_LOCKED | EXTENT_DIRTY); 1975 if (ret) 1976 err = ret; 1977 1978 ret = clear_extent_bits(&inode->io_tree, rec->start, 1979 rec->start + rec->len - 1, 1980 EXTENT_DAMAGED); 1981 if (ret && !err) 1982 err = ret; 1983 1984 kfree(rec); 1985 return err; 1986 } 1987 1988 /* 1989 * this bypasses the standard btrfs submit functions deliberately, as 1990 * the standard behavior is to write all copies in a raid setup. here we only 1991 * want to write the one bad copy. so we do the mapping for ourselves and issue 1992 * submit_bio directly. 1993 * to avoid any synchronization issues, wait for the data after writing, which 1994 * actually prevents the read that triggered the error from finishing. 1995 * currently, there can be no more than two copies of every data bit. thus, 1996 * exactly one rewrite is required. 1997 */ 1998 int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, 1999 u64 logical, struct page *page, 2000 unsigned int pg_offset, int mirror_num) 2001 { 2002 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2003 struct bio *bio; 2004 struct btrfs_device *dev; 2005 u64 map_length = 0; 2006 u64 sector; 2007 struct btrfs_bio *bbio = NULL; 2008 int ret; 2009 2010 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); 2011 BUG_ON(!mirror_num); 2012 2013 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2014 if (!bio) 2015 return -EIO; 2016 bio->bi_iter.bi_size = 0; 2017 map_length = length; 2018 2019 /* 2020 * Avoid races with device replace and make sure our bbio has devices 2021 * associated to its stripes that don't go away while we are doing the 2022 * read repair operation. 
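 * The counter taken below is paired with btrfs_bio_counter_dec() on every return path.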
2023 */ 2024 btrfs_bio_counter_inc_blocked(fs_info); 2025 if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { 2026 /* 2027 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2028 * to update all raid stripes, but here we just want to correct 2029 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2030 * stripe's dev and sector. 2031 */ 2032 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2033 &map_length, &bbio, 0); 2034 if (ret) { 2035 btrfs_bio_counter_dec(fs_info); 2036 bio_put(bio); 2037 return -EIO; 2038 } 2039 ASSERT(bbio->mirror_num == 1); 2040 } else { 2041 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2042 &map_length, &bbio, mirror_num); 2043 if (ret) { 2044 btrfs_bio_counter_dec(fs_info); 2045 bio_put(bio); 2046 return -EIO; 2047 } 2048 BUG_ON(mirror_num != bbio->mirror_num); 2049 } 2050 2051 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; 2052 bio->bi_iter.bi_sector = sector; 2053 dev = bbio->stripes[bbio->mirror_num - 1].dev; 2054 btrfs_put_bbio(bbio); 2055 if (!dev || !dev->bdev || !dev->writeable) { 2056 btrfs_bio_counter_dec(fs_info); 2057 bio_put(bio); 2058 return -EIO; 2059 } 2060 bio->bi_bdev = dev->bdev; 2061 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2062 bio_add_page(bio, page, length, pg_offset); 2063 2064 if (btrfsic_submit_bio_wait(bio)) { 2065 /* try to remap that extent elsewhere? */ 2066 btrfs_bio_counter_dec(fs_info); 2067 bio_put(bio); 2068 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2069 return -EIO; 2070 } 2071 2072 btrfs_info_rl_in_rcu(fs_info, 2073 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2074 btrfs_ino(inode), start, 2075 rcu_str_deref(dev->name), sector); 2076 btrfs_bio_counter_dec(fs_info); 2077 bio_put(bio); 2078 return 0; 2079 } 2080 2081 int repair_eb_io_failure(struct btrfs_fs_info *fs_info, 2082 struct extent_buffer *eb, int mirror_num) 2083 { 2084 u64 start = eb->start; 2085 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2086 int ret = 0; 2087 2088 if (fs_info->sb->s_flags & MS_RDONLY) 2089 return -EROFS; 2090 2091 for (i = 0; i < num_pages; i++) { 2092 struct page *p = eb->pages[i]; 2093 2094 ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, 2095 PAGE_SIZE, start, p, 2096 start - page_offset(p), mirror_num); 2097 if (ret) 2098 break; 2099 start += PAGE_SIZE; 2100 } 2101 2102 return ret; 2103 } 2104 2105 /* 2106 * each time an IO finishes, we do a fast check in the IO failure tree 2107 * to see if we need to process or clean up an io_failure_record 2108 */ 2109 int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, 2110 unsigned int pg_offset) 2111 { 2112 u64 private; 2113 struct io_failure_record *failrec; 2114 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2115 struct extent_state *state; 2116 int num_copies; 2117 int ret; 2118 2119 private = 0; 2120 ret = count_range_bits(&inode->io_failure_tree, &private, 2121 (u64)-1, 1, EXTENT_DIRTY, 0); 2122 if (!ret) 2123 return 0; 2124 2125 ret = get_state_failrec(&inode->io_failure_tree, start, 2126 &failrec); 2127 if (ret) 2128 return 0; 2129 2130 BUG_ON(!failrec->this_mirror); 2131 2132 if (failrec->in_validation) { 2133 /* there was no real error, just free the record */ 2134 btrfs_debug(fs_info, 2135 "clean_io_failure: freeing dummy error at %llu", 2136 failrec->start); 2137 goto out; 2138 } 2139 if (fs_info->sb->s_flags & MS_RDONLY) 2140 goto out; 2141 2142 spin_lock(&inode->io_tree.lock); 2143 state = 
find_first_extent_bit_state(&inode->io_tree, 2144 failrec->start, 2145 EXTENT_LOCKED); 2146 spin_unlock(&inode->io_tree.lock); 2147 2148 if (state && state->start <= failrec->start && 2149 state->end >= failrec->start + failrec->len - 1) { 2150 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2151 failrec->len); 2152 if (num_copies > 1) { 2153 repair_io_failure(inode, start, failrec->len, 2154 failrec->logical, page, 2155 pg_offset, failrec->failed_mirror); 2156 } 2157 } 2158 2159 out: 2160 free_io_failure(inode, failrec); 2161 2162 return 0; 2163 } 2164 2165 /* 2166 * Can be called when 2167 * - hold extent lock 2168 * - under ordered extent 2169 * - the inode is freeing 2170 */ 2171 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2172 { 2173 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2174 struct io_failure_record *failrec; 2175 struct extent_state *state, *next; 2176 2177 if (RB_EMPTY_ROOT(&failure_tree->state)) 2178 return; 2179 2180 spin_lock(&failure_tree->lock); 2181 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2182 while (state) { 2183 if (state->start > end) 2184 break; 2185 2186 ASSERT(state->end <= end); 2187 2188 next = next_state(state); 2189 2190 failrec = state->failrec; 2191 free_extent_state(state); 2192 kfree(failrec); 2193 2194 state = next; 2195 } 2196 spin_unlock(&failure_tree->lock); 2197 } 2198 2199 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2200 struct io_failure_record **failrec_ret) 2201 { 2202 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2203 struct io_failure_record *failrec; 2204 struct extent_map *em; 2205 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2206 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2207 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2208 int ret; 2209 u64 logical; 2210 2211 ret = get_state_failrec(failure_tree, start, &failrec); 2212 if (ret) { 2213 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2214 if (!failrec) 2215 return -ENOMEM; 2216 2217 failrec->start = start; 2218 failrec->len = end - start + 1; 2219 failrec->this_mirror = 0; 2220 failrec->bio_flags = 0; 2221 failrec->in_validation = 0; 2222 2223 read_lock(&em_tree->lock); 2224 em = lookup_extent_mapping(em_tree, start, failrec->len); 2225 if (!em) { 2226 read_unlock(&em_tree->lock); 2227 kfree(failrec); 2228 return -EIO; 2229 } 2230 2231 if (em->start > start || em->start + em->len <= start) { 2232 free_extent_map(em); 2233 em = NULL; 2234 } 2235 read_unlock(&em_tree->lock); 2236 if (!em) { 2237 kfree(failrec); 2238 return -EIO; 2239 } 2240 2241 logical = start - em->start; 2242 logical = em->block_start + logical; 2243 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2244 logical = em->block_start; 2245 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2246 extent_set_compress_type(&failrec->bio_flags, 2247 em->compress_type); 2248 } 2249 2250 btrfs_debug(fs_info, 2251 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2252 logical, start, failrec->len); 2253 2254 failrec->logical = logical; 2255 free_extent_map(em); 2256 2257 /* set the bits in the private failure tree */ 2258 ret = set_extent_bits(failure_tree, start, end, 2259 EXTENT_LOCKED | EXTENT_DIRTY); 2260 if (ret >= 0) 2261 ret = set_state_failrec(failure_tree, start, failrec); 2262 /* set the bits in the inode's tree */ 2263 if (ret >= 0) 2264 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2265 if (ret < 0) { 2266 
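			/*
			 * we could not record this range in the failure tree
			 * (or flag it in the inode's io tree), so there is
			 * nothing for clean_io_failure() to find later; drop
			 * the record allocated above.
			 */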
kfree(failrec); 2267 return ret; 2268 } 2269 } else { 2270 btrfs_debug(fs_info, 2271 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2272 failrec->logical, failrec->start, failrec->len, 2273 failrec->in_validation); 2274 /* 2275 * when data can be on disk more than twice, add to failrec here 2276 * (e.g. with a list for failed_mirror) to make 2277 * clean_io_failure() clean all those errors at once. 2278 */ 2279 } 2280 2281 *failrec_ret = failrec; 2282 2283 return 0; 2284 } 2285 2286 int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2287 struct io_failure_record *failrec, int failed_mirror) 2288 { 2289 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2290 int num_copies; 2291 2292 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2293 if (num_copies == 1) { 2294 /* 2295 * we only have a single copy of the data, so don't bother with 2296 * all the retry and error correction code that follows. no 2297 * matter what the error is, it is very likely to persist. 2298 */ 2299 btrfs_debug(fs_info, 2300 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2301 num_copies, failrec->this_mirror, failed_mirror); 2302 return 0; 2303 } 2304 2305 /* 2306 * there are two premises: 2307 * a) deliver good data to the caller 2308 * b) correct the bad sectors on disk 2309 */ 2310 if (failed_bio->bi_vcnt > 1) { 2311 /* 2312 * to fulfill b), we need to know the exact failing sectors, as 2313 * we don't want to rewrite any more than the failed ones. thus, 2314 * we need separate read requests for the failed bio 2315 * 2316 * if the following BUG_ON triggers, our validation request got 2317 * merged. we need separate requests for our algorithm to work. 2318 */ 2319 BUG_ON(failrec->in_validation); 2320 failrec->in_validation = 1; 2321 failrec->this_mirror = failed_mirror; 2322 } else { 2323 /* 2324 * we're ready to fulfill a) and b) alongside. get a good copy 2325 * of the failed sector and if we succeed, we have setup 2326 * everything for repair_io_failure to do the rest for us. 
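		 *
		 * for instance, with num_copies == 3 and failed_mirror == 1,
		 * repeated trips through this path hand out this_mirror == 2,
		 * then 3, and the next attempt hits the
		 * this_mirror > num_copies check below and gives up.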
2327 */ 2328 if (failrec->in_validation) { 2329 BUG_ON(failrec->this_mirror != failed_mirror); 2330 failrec->in_validation = 0; 2331 failrec->this_mirror = 0; 2332 } 2333 failrec->failed_mirror = failed_mirror; 2334 failrec->this_mirror++; 2335 if (failrec->this_mirror == failed_mirror) 2336 failrec->this_mirror++; 2337 } 2338 2339 if (failrec->this_mirror > num_copies) { 2340 btrfs_debug(fs_info, 2341 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2342 num_copies, failrec->this_mirror, failed_mirror); 2343 return 0; 2344 } 2345 2346 return 1; 2347 } 2348 2349 2350 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2351 struct io_failure_record *failrec, 2352 struct page *page, int pg_offset, int icsum, 2353 bio_end_io_t *endio_func, void *data) 2354 { 2355 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2356 struct bio *bio; 2357 struct btrfs_io_bio *btrfs_failed_bio; 2358 struct btrfs_io_bio *btrfs_bio; 2359 2360 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2361 if (!bio) 2362 return NULL; 2363 2364 bio->bi_end_io = endio_func; 2365 bio->bi_iter.bi_sector = failrec->logical >> 9; 2366 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2367 bio->bi_iter.bi_size = 0; 2368 bio->bi_private = data; 2369 2370 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2371 if (btrfs_failed_bio->csum) { 2372 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2373 2374 btrfs_bio = btrfs_io_bio(bio); 2375 btrfs_bio->csum = btrfs_bio->csum_inline; 2376 icsum *= csum_size; 2377 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2378 csum_size); 2379 } 2380 2381 bio_add_page(bio, page, failrec->len, pg_offset); 2382 2383 return bio; 2384 } 2385 2386 /* 2387 * this is a generic handler for readpage errors (default 2388 * readpage_io_failed_hook). if other copies exist, read those and write back 2389 * good data to the failed position. 
does not investigate in remapping the 2390 * failed extent elsewhere, hoping the device will be smart enough to do this as 2391 * needed 2392 */ 2393 2394 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2395 struct page *page, u64 start, u64 end, 2396 int failed_mirror) 2397 { 2398 struct io_failure_record *failrec; 2399 struct inode *inode = page->mapping->host; 2400 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2401 struct bio *bio; 2402 int read_mode = 0; 2403 blk_status_t status; 2404 int ret; 2405 2406 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2407 2408 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2409 if (ret) 2410 return ret; 2411 2412 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2413 if (!ret) { 2414 free_io_failure(BTRFS_I(inode), failrec); 2415 return -EIO; 2416 } 2417 2418 if (failed_bio->bi_vcnt > 1) 2419 read_mode |= REQ_FAILFAST_DEV; 2420 2421 phy_offset >>= inode->i_sb->s_blocksize_bits; 2422 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2423 start - page_offset(page), 2424 (int)phy_offset, failed_bio->bi_end_io, 2425 NULL); 2426 if (!bio) { 2427 free_io_failure(BTRFS_I(inode), failrec); 2428 return -EIO; 2429 } 2430 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2431 2432 btrfs_debug(btrfs_sb(inode->i_sb), 2433 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2434 read_mode, failrec->this_mirror, failrec->in_validation); 2435 2436 status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2437 failrec->bio_flags, 0); 2438 if (status) { 2439 free_io_failure(BTRFS_I(inode), failrec); 2440 bio_put(bio); 2441 ret = blk_status_to_errno(status); 2442 } 2443 2444 return ret; 2445 } 2446 2447 /* lots and lots of room for performance fixes in the end_bio funcs */ 2448 2449 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2450 { 2451 int uptodate = (err == 0); 2452 struct extent_io_tree *tree; 2453 int ret = 0; 2454 2455 tree = &BTRFS_I(page->mapping->host)->io_tree; 2456 2457 if (tree->ops && tree->ops->writepage_end_io_hook) 2458 tree->ops->writepage_end_io_hook(page, start, end, NULL, 2459 uptodate); 2460 2461 if (!uptodate) { 2462 ClearPageUptodate(page); 2463 SetPageError(page); 2464 ret = err < 0 ? err : -EIO; 2465 mapping_set_error(page->mapping, ret); 2466 } 2467 } 2468 2469 /* 2470 * after a writepage IO is done, we need to: 2471 * clear the uptodate bits on error 2472 * clear the writeback bits in the extent tree for this IO 2473 * end_page_writeback if the page has no more pending IO 2474 * 2475 * Scheduling is not allowed, so the extent state tree is expected 2476 * to have one and only one object corresponding to this IO. 2477 */ 2478 static void end_bio_extent_writepage(struct bio *bio) 2479 { 2480 int error = blk_status_to_errno(bio->bi_status); 2481 struct bio_vec *bvec; 2482 u64 start; 2483 u64 end; 2484 int i; 2485 2486 bio_for_each_segment_all(bvec, bio, i) { 2487 struct page *page = bvec->bv_page; 2488 struct inode *inode = page->mapping->host; 2489 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2490 2491 /* We always issue full-page reads, but if some block 2492 * in a page fails to read, blk_update_request() will 2493 * advance bv_offset and adjust bv_len to compensate. 2494 * Print a warning for nonzero offsets, and an error 2495 * if they don't add up to a full page. 
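		 * (for example, if only the first 2048 bytes of a 4K page made
		 * it out before a failure, we could see bv_offset == 2048 and
		 * bv_len == 2048 here: a nonzero offset that still adds up to
		 * a full page, so only the "incomplete page write" message is
		 * printed rather than the "partial page write" error.)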
*/ 2496 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2497 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2498 btrfs_err(fs_info, 2499 "partial page write in btrfs with offset %u and length %u", 2500 bvec->bv_offset, bvec->bv_len); 2501 else 2502 btrfs_info(fs_info, 2503 "incomplete page write in btrfs with offset %u and length %u", 2504 bvec->bv_offset, bvec->bv_len); 2505 } 2506 2507 start = page_offset(page); 2508 end = start + bvec->bv_offset + bvec->bv_len - 1; 2509 2510 end_extent_writepage(page, error, start, end); 2511 end_page_writeback(page); 2512 } 2513 2514 bio_put(bio); 2515 } 2516 2517 static void 2518 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2519 int uptodate) 2520 { 2521 struct extent_state *cached = NULL; 2522 u64 end = start + len - 1; 2523 2524 if (uptodate && tree->track_uptodate) 2525 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2526 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2527 } 2528 2529 /* 2530 * after a readpage IO is done, we need to: 2531 * clear the uptodate bits on error 2532 * set the uptodate bits if things worked 2533 * set the page up to date if all extents in the tree are uptodate 2534 * clear the lock bit in the extent tree 2535 * unlock the page if there are no other extents locked for it 2536 * 2537 * Scheduling is not allowed, so the extent state tree is expected 2538 * to have one and only one object corresponding to this IO. 2539 */ 2540 static void end_bio_extent_readpage(struct bio *bio) 2541 { 2542 struct bio_vec *bvec; 2543 int uptodate = !bio->bi_status; 2544 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2545 struct extent_io_tree *tree; 2546 u64 offset = 0; 2547 u64 start; 2548 u64 end; 2549 u64 len; 2550 u64 extent_start = 0; 2551 u64 extent_len = 0; 2552 int mirror; 2553 int ret; 2554 int i; 2555 2556 bio_for_each_segment_all(bvec, bio, i) { 2557 struct page *page = bvec->bv_page; 2558 struct inode *inode = page->mapping->host; 2559 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2560 2561 btrfs_debug(fs_info, 2562 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2563 (u64)bio->bi_iter.bi_sector, bio->bi_status, 2564 io_bio->mirror_num); 2565 tree = &BTRFS_I(inode)->io_tree; 2566 2567 /* We always issue full-page reads, but if some block 2568 * in a page fails to read, blk_update_request() will 2569 * advance bv_offset and adjust bv_len to compensate. 2570 * Print a warning for nonzero offsets, and an error 2571 * if they don't add up to a full page. 
*/ 2572 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2573 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2574 btrfs_err(fs_info, 2575 "partial page read in btrfs with offset %u and length %u", 2576 bvec->bv_offset, bvec->bv_len); 2577 else 2578 btrfs_info(fs_info, 2579 "incomplete page read in btrfs with offset %u and length %u", 2580 bvec->bv_offset, bvec->bv_len); 2581 } 2582 2583 start = page_offset(page); 2584 end = start + bvec->bv_offset + bvec->bv_len - 1; 2585 len = bvec->bv_len; 2586 2587 mirror = io_bio->mirror_num; 2588 if (likely(uptodate && tree->ops)) { 2589 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2590 page, start, end, 2591 mirror); 2592 if (ret) 2593 uptodate = 0; 2594 else 2595 clean_io_failure(BTRFS_I(inode), start, 2596 page, 0); 2597 } 2598 2599 if (likely(uptodate)) 2600 goto readpage_ok; 2601 2602 if (tree->ops) { 2603 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2604 if (ret == -EAGAIN) { 2605 /* 2606 * Data inode's readpage_io_failed_hook() always 2607 * returns -EAGAIN. 2608 * 2609 * The generic bio_readpage_error handles errors 2610 * the following way: If possible, new read 2611 * requests are created and submitted and will 2612 * end up in end_bio_extent_readpage as well (if 2613 * we're lucky, not in the !uptodate case). In 2614 * that case it returns 0 and we just go on with 2615 * the next page in our bio. If it can't handle 2616 * the error it will return -EIO and we remain 2617 * responsible for that page. 2618 */ 2619 ret = bio_readpage_error(bio, offset, page, 2620 start, end, mirror); 2621 if (ret == 0) { 2622 uptodate = !bio->bi_status; 2623 offset += len; 2624 continue; 2625 } 2626 } 2627 2628 /* 2629 * metadata's readpage_io_failed_hook() always returns 2630 * -EIO and fixes nothing. -EIO is also returned if 2631 * data inode error could not be fixed. 2632 */ 2633 ASSERT(ret == -EIO); 2634 } 2635 readpage_ok: 2636 if (likely(uptodate)) { 2637 loff_t i_size = i_size_read(inode); 2638 pgoff_t end_index = i_size >> PAGE_SHIFT; 2639 unsigned off; 2640 2641 /* Zero out the end if this page straddles i_size */ 2642 off = i_size & (PAGE_SIZE-1); 2643 if (page->index == end_index && off) 2644 zero_user_segment(page, off, PAGE_SIZE); 2645 SetPageUptodate(page); 2646 } else { 2647 ClearPageUptodate(page); 2648 SetPageError(page); 2649 } 2650 unlock_page(page); 2651 offset += len; 2652 2653 if (unlikely(!uptodate)) { 2654 if (extent_len) { 2655 endio_readpage_release_extent(tree, 2656 extent_start, 2657 extent_len, 1); 2658 extent_start = 0; 2659 extent_len = 0; 2660 } 2661 endio_readpage_release_extent(tree, start, 2662 end - start + 1, 0); 2663 } else if (!extent_len) { 2664 extent_start = start; 2665 extent_len = end + 1 - start; 2666 } else if (extent_start + extent_len == start) { 2667 extent_len += end + 1 - start; 2668 } else { 2669 endio_readpage_release_extent(tree, extent_start, 2670 extent_len, uptodate); 2671 extent_start = start; 2672 extent_len = end + 1 - start; 2673 } 2674 } 2675 2676 if (extent_len) 2677 endio_readpage_release_extent(tree, extent_start, extent_len, 2678 uptodate); 2679 if (io_bio->end_io) 2680 io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); 2681 bio_put(bio); 2682 } 2683 2684 /* 2685 * this allocates from the btrfs_bioset. 
We're returning a bio right now 2686 * but you can call btrfs_io_bio for the appropriate container_of magic 2687 */ 2688 struct bio * 2689 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2690 gfp_t gfp_flags) 2691 { 2692 struct btrfs_io_bio *btrfs_bio; 2693 struct bio *bio; 2694 2695 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2696 2697 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2698 while (!bio && (nr_vecs /= 2)) { 2699 bio = bio_alloc_bioset(gfp_flags, 2700 nr_vecs, btrfs_bioset); 2701 } 2702 } 2703 2704 if (bio) { 2705 bio->bi_bdev = bdev; 2706 bio->bi_iter.bi_sector = first_sector; 2707 btrfs_bio = btrfs_io_bio(bio); 2708 btrfs_bio->csum = NULL; 2709 btrfs_bio->csum_allocated = NULL; 2710 btrfs_bio->end_io = NULL; 2711 } 2712 return bio; 2713 } 2714 2715 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2716 { 2717 struct btrfs_io_bio *btrfs_bio; 2718 struct bio *new; 2719 2720 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2721 if (new) { 2722 btrfs_bio = btrfs_io_bio(new); 2723 btrfs_bio->csum = NULL; 2724 btrfs_bio->csum_allocated = NULL; 2725 btrfs_bio->end_io = NULL; 2726 } 2727 return new; 2728 } 2729 2730 /* this also allocates from the btrfs_bioset */ 2731 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2732 { 2733 struct btrfs_io_bio *btrfs_bio; 2734 struct bio *bio; 2735 2736 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2737 if (bio) { 2738 btrfs_bio = btrfs_io_bio(bio); 2739 btrfs_bio->csum = NULL; 2740 btrfs_bio->csum_allocated = NULL; 2741 btrfs_bio->end_io = NULL; 2742 } 2743 return bio; 2744 } 2745 2746 2747 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2748 unsigned long bio_flags) 2749 { 2750 blk_status_t ret = 0; 2751 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2752 struct page *page = bvec->bv_page; 2753 struct extent_io_tree *tree = bio->bi_private; 2754 u64 start; 2755 2756 start = page_offset(page) + bvec->bv_offset; 2757 2758 bio->bi_private = NULL; 2759 bio_get(bio); 2760 2761 if (tree->ops) 2762 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2763 mirror_num, bio_flags, start); 2764 else 2765 btrfsic_submit_bio(bio); 2766 2767 bio_put(bio); 2768 return blk_status_to_errno(ret); 2769 } 2770 2771 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2772 unsigned long offset, size_t size, struct bio *bio, 2773 unsigned long bio_flags) 2774 { 2775 int ret = 0; 2776 if (tree->ops) 2777 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2778 bio_flags); 2779 return ret; 2780 2781 } 2782 2783 static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, 2784 struct writeback_control *wbc, 2785 struct page *page, sector_t sector, 2786 size_t size, unsigned long offset, 2787 struct block_device *bdev, 2788 struct bio **bio_ret, 2789 bio_end_io_t end_io_func, 2790 int mirror_num, 2791 unsigned long prev_bio_flags, 2792 unsigned long bio_flags, 2793 bool force_bio_submit) 2794 { 2795 int ret = 0; 2796 struct bio *bio; 2797 int contig = 0; 2798 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2799 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2800 2801 if (bio_ret && *bio_ret) { 2802 bio = *bio_ret; 2803 if (old_compressed) 2804 contig = bio->bi_iter.bi_sector == sector; 2805 else 2806 contig = bio_end_sector(bio) == sector; 2807 2808 if (prev_bio_flags != bio_flags || !contig || 2809 force_bio_submit || 2810 merge_bio(tree, page, offset, page_size, bio, 
bio_flags) || 2811 bio_add_page(bio, page, page_size, offset) < page_size) { 2812 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2813 if (ret < 0) { 2814 *bio_ret = NULL; 2815 return ret; 2816 } 2817 bio = NULL; 2818 } else { 2819 if (wbc) 2820 wbc_account_io(wbc, page, page_size); 2821 return 0; 2822 } 2823 } 2824 2825 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2826 GFP_NOFS | __GFP_HIGH); 2827 if (!bio) 2828 return -ENOMEM; 2829 2830 bio_add_page(bio, page, page_size, offset); 2831 bio->bi_end_io = end_io_func; 2832 bio->bi_private = tree; 2833 bio->bi_write_hint = page->mapping->host->i_write_hint; 2834 bio_set_op_attrs(bio, op, op_flags); 2835 if (wbc) { 2836 wbc_init_bio(wbc, bio); 2837 wbc_account_io(wbc, page, page_size); 2838 } 2839 2840 if (bio_ret) 2841 *bio_ret = bio; 2842 else 2843 ret = submit_one_bio(bio, mirror_num, bio_flags); 2844 2845 return ret; 2846 } 2847 2848 static void attach_extent_buffer_page(struct extent_buffer *eb, 2849 struct page *page) 2850 { 2851 if (!PagePrivate(page)) { 2852 SetPagePrivate(page); 2853 get_page(page); 2854 set_page_private(page, (unsigned long)eb); 2855 } else { 2856 WARN_ON(page->private != (unsigned long)eb); 2857 } 2858 } 2859 2860 void set_page_extent_mapped(struct page *page) 2861 { 2862 if (!PagePrivate(page)) { 2863 SetPagePrivate(page); 2864 get_page(page); 2865 set_page_private(page, EXTENT_PAGE_PRIVATE); 2866 } 2867 } 2868 2869 static struct extent_map * 2870 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2871 u64 start, u64 len, get_extent_t *get_extent, 2872 struct extent_map **em_cached) 2873 { 2874 struct extent_map *em; 2875 2876 if (em_cached && *em_cached) { 2877 em = *em_cached; 2878 if (extent_map_in_tree(em) && start >= em->start && 2879 start < extent_map_end(em)) { 2880 refcount_inc(&em->refs); 2881 return em; 2882 } 2883 2884 free_extent_map(em); 2885 *em_cached = NULL; 2886 } 2887 2888 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); 2889 if (em_cached && !IS_ERR_OR_NULL(em)) { 2890 BUG_ON(*em_cached); 2891 refcount_inc(&em->refs); 2892 *em_cached = em; 2893 } 2894 return em; 2895 } 2896 /* 2897 * basic readpage implementation. 
Locked extent state structs are inserted 2898 * into the tree that are removed when the IO is done (by the end_io 2899 * handlers) 2900 * XXX JDM: This needs looking at to ensure proper page locking 2901 * return 0 on success, otherwise return error 2902 */ 2903 static int __do_readpage(struct extent_io_tree *tree, 2904 struct page *page, 2905 get_extent_t *get_extent, 2906 struct extent_map **em_cached, 2907 struct bio **bio, int mirror_num, 2908 unsigned long *bio_flags, int read_flags, 2909 u64 *prev_em_start) 2910 { 2911 struct inode *inode = page->mapping->host; 2912 u64 start = page_offset(page); 2913 u64 page_end = start + PAGE_SIZE - 1; 2914 u64 end; 2915 u64 cur = start; 2916 u64 extent_offset; 2917 u64 last_byte = i_size_read(inode); 2918 u64 block_start; 2919 u64 cur_end; 2920 sector_t sector; 2921 struct extent_map *em; 2922 struct block_device *bdev; 2923 int ret = 0; 2924 int nr = 0; 2925 size_t pg_offset = 0; 2926 size_t iosize; 2927 size_t disk_io_size; 2928 size_t blocksize = inode->i_sb->s_blocksize; 2929 unsigned long this_bio_flag = 0; 2930 2931 set_page_extent_mapped(page); 2932 2933 end = page_end; 2934 if (!PageUptodate(page)) { 2935 if (cleancache_get_page(page) == 0) { 2936 BUG_ON(blocksize != PAGE_SIZE); 2937 unlock_extent(tree, start, end); 2938 goto out; 2939 } 2940 } 2941 2942 if (page->index == last_byte >> PAGE_SHIFT) { 2943 char *userpage; 2944 size_t zero_offset = last_byte & (PAGE_SIZE - 1); 2945 2946 if (zero_offset) { 2947 iosize = PAGE_SIZE - zero_offset; 2948 userpage = kmap_atomic(page); 2949 memset(userpage + zero_offset, 0, iosize); 2950 flush_dcache_page(page); 2951 kunmap_atomic(userpage); 2952 } 2953 } 2954 while (cur <= end) { 2955 bool force_bio_submit = false; 2956 2957 if (cur >= last_byte) { 2958 char *userpage; 2959 struct extent_state *cached = NULL; 2960 2961 iosize = PAGE_SIZE - pg_offset; 2962 userpage = kmap_atomic(page); 2963 memset(userpage + pg_offset, 0, iosize); 2964 flush_dcache_page(page); 2965 kunmap_atomic(userpage); 2966 set_extent_uptodate(tree, cur, cur + iosize - 1, 2967 &cached, GFP_NOFS); 2968 unlock_extent_cached(tree, cur, 2969 cur + iosize - 1, 2970 &cached, GFP_NOFS); 2971 break; 2972 } 2973 em = __get_extent_map(inode, page, pg_offset, cur, 2974 end - cur + 1, get_extent, em_cached); 2975 if (IS_ERR_OR_NULL(em)) { 2976 SetPageError(page); 2977 unlock_extent(tree, cur, end); 2978 break; 2979 } 2980 extent_offset = cur - em->start; 2981 BUG_ON(extent_map_end(em) <= cur); 2982 BUG_ON(end < cur); 2983 2984 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2985 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2986 extent_set_compress_type(&this_bio_flag, 2987 em->compress_type); 2988 } 2989 2990 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2991 cur_end = min(extent_map_end(em) - 1, end); 2992 iosize = ALIGN(iosize, blocksize); 2993 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2994 disk_io_size = em->block_len; 2995 sector = em->block_start >> 9; 2996 } else { 2997 sector = (em->block_start + extent_offset) >> 9; 2998 disk_io_size = iosize; 2999 } 3000 bdev = em->bdev; 3001 block_start = em->block_start; 3002 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3003 block_start = EXTENT_MAP_HOLE; 3004 3005 /* 3006 * If we have a file range that points to a compressed extent 3007 * and it's followed by a consecutive file range that points to 3008 * to the same compressed extent (possibly with a different 3009 * offset and/or length, so it either points to the whole extent 3010 * or only part of it), we must make sure we do 
not submit a 3011 * single bio to populate the pages for the 2 ranges because 3012 * this makes the compressed extent read zero out the pages 3013 * belonging to the 2nd range. Imagine the following scenario: 3014 * 3015 * File layout 3016 * [0 - 8K] [8K - 24K] 3017 * | | 3018 * | | 3019 * points to extent X, points to extent X, 3020 * offset 4K, length of 8K offset 0, length 16K 3021 * 3022 * [extent X, compressed length = 4K uncompressed length = 16K] 3023 * 3024 * If the bio to read the compressed extent covers both ranges, 3025 * it will decompress extent X into the pages belonging to the 3026 * first range and then it will stop, zeroing out the remaining 3027 * pages that belong to the other range that points to extent X. 3028 * So here we make sure we submit 2 bios, one for the first 3029 * range and another one for the third range. Both will target 3030 * the same physical extent from disk, but we can't currently 3031 * make the compressed bio endio callback populate the pages 3032 * for both ranges because each compressed bio is tightly 3033 * coupled with a single extent map, and each range can have 3034 * an extent map with a different offset value relative to the 3035 * uncompressed data of our extent and different lengths. This 3036 * is a corner case so we prioritize correctness over 3037 * non-optimal behavior (submitting 2 bios for the same extent). 3038 */ 3039 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3040 prev_em_start && *prev_em_start != (u64)-1 && 3041 *prev_em_start != em->orig_start) 3042 force_bio_submit = true; 3043 3044 if (prev_em_start) 3045 *prev_em_start = em->orig_start; 3046 3047 free_extent_map(em); 3048 em = NULL; 3049 3050 /* we've found a hole, just zero and go on */ 3051 if (block_start == EXTENT_MAP_HOLE) { 3052 char *userpage; 3053 struct extent_state *cached = NULL; 3054 3055 userpage = kmap_atomic(page); 3056 memset(userpage + pg_offset, 0, iosize); 3057 flush_dcache_page(page); 3058 kunmap_atomic(userpage); 3059 3060 set_extent_uptodate(tree, cur, cur + iosize - 1, 3061 &cached, GFP_NOFS); 3062 unlock_extent_cached(tree, cur, 3063 cur + iosize - 1, 3064 &cached, GFP_NOFS); 3065 cur = cur + iosize; 3066 pg_offset += iosize; 3067 continue; 3068 } 3069 /* the get_extent function already copied into the page */ 3070 if (test_range_bit(tree, cur, cur_end, 3071 EXTENT_UPTODATE, 1, NULL)) { 3072 check_page_uptodate(tree, page); 3073 unlock_extent(tree, cur, cur + iosize - 1); 3074 cur = cur + iosize; 3075 pg_offset += iosize; 3076 continue; 3077 } 3078 /* we have an inline extent but it didn't get marked up 3079 * to date. 
Error out 3080 */ 3081 if (block_start == EXTENT_MAP_INLINE) { 3082 SetPageError(page); 3083 unlock_extent(tree, cur, cur + iosize - 1); 3084 cur = cur + iosize; 3085 pg_offset += iosize; 3086 continue; 3087 } 3088 3089 ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, 3090 page, sector, disk_io_size, pg_offset, 3091 bdev, bio, 3092 end_bio_extent_readpage, mirror_num, 3093 *bio_flags, 3094 this_bio_flag, 3095 force_bio_submit); 3096 if (!ret) { 3097 nr++; 3098 *bio_flags = this_bio_flag; 3099 } else { 3100 SetPageError(page); 3101 unlock_extent(tree, cur, cur + iosize - 1); 3102 goto out; 3103 } 3104 cur = cur + iosize; 3105 pg_offset += iosize; 3106 } 3107 out: 3108 if (!nr) { 3109 if (!PageError(page)) 3110 SetPageUptodate(page); 3111 unlock_page(page); 3112 } 3113 return ret; 3114 } 3115 3116 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3117 struct page *pages[], int nr_pages, 3118 u64 start, u64 end, 3119 get_extent_t *get_extent, 3120 struct extent_map **em_cached, 3121 struct bio **bio, int mirror_num, 3122 unsigned long *bio_flags, 3123 u64 *prev_em_start) 3124 { 3125 struct inode *inode; 3126 struct btrfs_ordered_extent *ordered; 3127 int index; 3128 3129 inode = pages[0]->mapping->host; 3130 while (1) { 3131 lock_extent(tree, start, end); 3132 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3133 end - start + 1); 3134 if (!ordered) 3135 break; 3136 unlock_extent(tree, start, end); 3137 btrfs_start_ordered_extent(inode, ordered, 1); 3138 btrfs_put_ordered_extent(ordered); 3139 } 3140 3141 for (index = 0; index < nr_pages; index++) { 3142 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3143 mirror_num, bio_flags, 0, prev_em_start); 3144 put_page(pages[index]); 3145 } 3146 } 3147 3148 static void __extent_readpages(struct extent_io_tree *tree, 3149 struct page *pages[], 3150 int nr_pages, get_extent_t *get_extent, 3151 struct extent_map **em_cached, 3152 struct bio **bio, int mirror_num, 3153 unsigned long *bio_flags, 3154 u64 *prev_em_start) 3155 { 3156 u64 start = 0; 3157 u64 end = 0; 3158 u64 page_start; 3159 int index; 3160 int first_index = 0; 3161 3162 for (index = 0; index < nr_pages; index++) { 3163 page_start = page_offset(pages[index]); 3164 if (!end) { 3165 start = page_start; 3166 end = start + PAGE_SIZE - 1; 3167 first_index = index; 3168 } else if (end + 1 == page_start) { 3169 end += PAGE_SIZE; 3170 } else { 3171 __do_contiguous_readpages(tree, &pages[first_index], 3172 index - first_index, start, 3173 end, get_extent, em_cached, 3174 bio, mirror_num, bio_flags, 3175 prev_em_start); 3176 start = page_start; 3177 end = start + PAGE_SIZE - 1; 3178 first_index = index; 3179 } 3180 } 3181 3182 if (end) 3183 __do_contiguous_readpages(tree, &pages[first_index], 3184 index - first_index, start, 3185 end, get_extent, em_cached, bio, 3186 mirror_num, bio_flags, 3187 prev_em_start); 3188 } 3189 3190 static int __extent_read_full_page(struct extent_io_tree *tree, 3191 struct page *page, 3192 get_extent_t *get_extent, 3193 struct bio **bio, int mirror_num, 3194 unsigned long *bio_flags, int read_flags) 3195 { 3196 struct inode *inode = page->mapping->host; 3197 struct btrfs_ordered_extent *ordered; 3198 u64 start = page_offset(page); 3199 u64 end = start + PAGE_SIZE - 1; 3200 int ret; 3201 3202 while (1) { 3203 lock_extent(tree, start, end); 3204 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3205 PAGE_SIZE); 3206 if (!ordered) 3207 break; 3208 unlock_extent(tree, start, end); 3209 
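		/*
		 * an ordered extent means this range is still being written
		 * back; wait for it to complete (the final 1 asks
		 * btrfs_start_ordered_extent() to wait), then loop around and
		 * take the extent lock again.
		 */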
btrfs_start_ordered_extent(inode, ordered, 1); 3210 btrfs_put_ordered_extent(ordered); 3211 } 3212 3213 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3214 bio_flags, read_flags, NULL); 3215 return ret; 3216 } 3217 3218 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3219 get_extent_t *get_extent, int mirror_num) 3220 { 3221 struct bio *bio = NULL; 3222 unsigned long bio_flags = 0; 3223 int ret; 3224 3225 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3226 &bio_flags, 0); 3227 if (bio) 3228 ret = submit_one_bio(bio, mirror_num, bio_flags); 3229 return ret; 3230 } 3231 3232 static void update_nr_written(struct writeback_control *wbc, 3233 unsigned long nr_written) 3234 { 3235 wbc->nr_to_write -= nr_written; 3236 } 3237 3238 /* 3239 * helper for __extent_writepage, doing all of the delayed allocation setup. 3240 * 3241 * This returns 1 if our fill_delalloc function did all the work required 3242 * to write the page (copy into inline extent). In this case the IO has 3243 * been started and the page is already unlocked. 3244 * 3245 * This returns 0 if all went well (page still locked) 3246 * This returns < 0 if there were errors (page still locked) 3247 */ 3248 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3249 struct page *page, struct writeback_control *wbc, 3250 struct extent_page_data *epd, 3251 u64 delalloc_start, 3252 unsigned long *nr_written) 3253 { 3254 struct extent_io_tree *tree = epd->tree; 3255 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3256 u64 nr_delalloc; 3257 u64 delalloc_to_write = 0; 3258 u64 delalloc_end = 0; 3259 int ret; 3260 int page_started = 0; 3261 3262 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) 3263 return 0; 3264 3265 while (delalloc_end < page_end) { 3266 nr_delalloc = find_lock_delalloc_range(inode, tree, 3267 page, 3268 &delalloc_start, 3269 &delalloc_end, 3270 BTRFS_MAX_EXTENT_SIZE); 3271 if (nr_delalloc == 0) { 3272 delalloc_start = delalloc_end + 1; 3273 continue; 3274 } 3275 ret = tree->ops->fill_delalloc(inode, page, 3276 delalloc_start, 3277 delalloc_end, 3278 &page_started, 3279 nr_written); 3280 /* File system has been set read-only */ 3281 if (ret) { 3282 SetPageError(page); 3283 /* fill_delalloc should be return < 0 for error 3284 * but just in case, we use > 0 here meaning the 3285 * IO is started, so we don't want to return > 0 3286 * unless things are going well. 3287 */ 3288 ret = ret < 0 ? ret : -EIO; 3289 goto done; 3290 } 3291 /* 3292 * delalloc_end is already one less than the total length, so 3293 * we don't subtract one from PAGE_SIZE 3294 */ 3295 delalloc_to_write += (delalloc_end - delalloc_start + 3296 PAGE_SIZE) >> PAGE_SHIFT; 3297 delalloc_start = delalloc_end + 1; 3298 } 3299 if (wbc->nr_to_write < delalloc_to_write) { 3300 int thresh = 8192; 3301 3302 if (delalloc_to_write < thresh * 2) 3303 thresh = delalloc_to_write; 3304 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3305 thresh); 3306 } 3307 3308 /* did the fill delalloc function already unlock and start 3309 * the IO? 3310 */ 3311 if (page_started) { 3312 /* 3313 * we've unlocked the page, so we can't update 3314 * the mapping's writeback index, just update 3315 * nr_to_write. 3316 */ 3317 wbc->nr_to_write -= *nr_written; 3318 return 1; 3319 } 3320 3321 ret = 0; 3322 3323 done: 3324 return ret; 3325 } 3326 3327 /* 3328 * helper for __extent_writepage. This calls the writepage start hooks, 3329 * and does the loop to map the page into extents and bios. 
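 * Each chunk of the page that maps to a different extent is handed to
 * submit_extent_page() on its own, so a page that straddles extent
 * boundaries can end up spread across more than one bio.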
3330 * 3331 * We return 1 if the IO is started and the page is unlocked, 3332 * 0 if all went well (page still locked) 3333 * < 0 if there were errors (page still locked) 3334 */ 3335 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3336 struct page *page, 3337 struct writeback_control *wbc, 3338 struct extent_page_data *epd, 3339 loff_t i_size, 3340 unsigned long nr_written, 3341 int write_flags, int *nr_ret) 3342 { 3343 struct extent_io_tree *tree = epd->tree; 3344 u64 start = page_offset(page); 3345 u64 page_end = start + PAGE_SIZE - 1; 3346 u64 end; 3347 u64 cur = start; 3348 u64 extent_offset; 3349 u64 block_start; 3350 u64 iosize; 3351 sector_t sector; 3352 struct extent_map *em; 3353 struct block_device *bdev; 3354 size_t pg_offset = 0; 3355 size_t blocksize; 3356 int ret = 0; 3357 int nr = 0; 3358 bool compressed; 3359 3360 if (tree->ops && tree->ops->writepage_start_hook) { 3361 ret = tree->ops->writepage_start_hook(page, start, 3362 page_end); 3363 if (ret) { 3364 /* Fixup worker will requeue */ 3365 if (ret == -EBUSY) 3366 wbc->pages_skipped++; 3367 else 3368 redirty_page_for_writepage(wbc, page); 3369 3370 update_nr_written(wbc, nr_written); 3371 unlock_page(page); 3372 return 1; 3373 } 3374 } 3375 3376 /* 3377 * we don't want to touch the inode after unlocking the page, 3378 * so we update the mapping writeback index now 3379 */ 3380 update_nr_written(wbc, nr_written + 1); 3381 3382 end = page_end; 3383 if (i_size <= start) { 3384 if (tree->ops && tree->ops->writepage_end_io_hook) 3385 tree->ops->writepage_end_io_hook(page, start, 3386 page_end, NULL, 1); 3387 goto done; 3388 } 3389 3390 blocksize = inode->i_sb->s_blocksize; 3391 3392 while (cur <= end) { 3393 u64 em_end; 3394 3395 if (cur >= i_size) { 3396 if (tree->ops && tree->ops->writepage_end_io_hook) 3397 tree->ops->writepage_end_io_hook(page, cur, 3398 page_end, NULL, 1); 3399 break; 3400 } 3401 em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, 3402 end - cur + 1, 1); 3403 if (IS_ERR_OR_NULL(em)) { 3404 SetPageError(page); 3405 ret = PTR_ERR_OR_ZERO(em); 3406 break; 3407 } 3408 3409 extent_offset = cur - em->start; 3410 em_end = extent_map_end(em); 3411 BUG_ON(em_end <= cur); 3412 BUG_ON(end < cur); 3413 iosize = min(em_end - cur, end - cur + 1); 3414 iosize = ALIGN(iosize, blocksize); 3415 sector = (em->block_start + extent_offset) >> 9; 3416 bdev = em->bdev; 3417 block_start = em->block_start; 3418 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3419 free_extent_map(em); 3420 em = NULL; 3421 3422 /* 3423 * compressed and inline extents are written through other 3424 * paths in the FS 3425 */ 3426 if (compressed || block_start == EXTENT_MAP_HOLE || 3427 block_start == EXTENT_MAP_INLINE) { 3428 /* 3429 * end_io notification does not happen here for 3430 * compressed extents 3431 */ 3432 if (!compressed && tree->ops && 3433 tree->ops->writepage_end_io_hook) 3434 tree->ops->writepage_end_io_hook(page, cur, 3435 cur + iosize - 1, 3436 NULL, 1); 3437 else if (compressed) { 3438 /* we don't want to end_page_writeback on 3439 * a compressed extent. 
this happens 3440 * elsewhere 3441 */ 3442 nr++; 3443 } 3444 3445 cur += iosize; 3446 pg_offset += iosize; 3447 continue; 3448 } 3449 3450 set_range_writeback(tree, cur, cur + iosize - 1); 3451 if (!PageWriteback(page)) { 3452 btrfs_err(BTRFS_I(inode)->root->fs_info, 3453 "page %lu not writeback, cur %llu end %llu", 3454 page->index, cur, end); 3455 } 3456 3457 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3458 page, sector, iosize, pg_offset, 3459 bdev, &epd->bio, 3460 end_bio_extent_writepage, 3461 0, 0, 0, false); 3462 if (ret) { 3463 SetPageError(page); 3464 if (PageWriteback(page)) 3465 end_page_writeback(page); 3466 } 3467 3468 cur = cur + iosize; 3469 pg_offset += iosize; 3470 nr++; 3471 } 3472 done: 3473 *nr_ret = nr; 3474 return ret; 3475 } 3476 3477 /* 3478 * the writepage semantics are similar to regular writepage. extent 3479 * records are inserted to lock ranges in the tree, and as dirty areas 3480 * are found, they are marked writeback. Then the lock bits are removed 3481 * and the end_io handler clears the writeback ranges 3482 */ 3483 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3484 void *data) 3485 { 3486 struct inode *inode = page->mapping->host; 3487 struct extent_page_data *epd = data; 3488 u64 start = page_offset(page); 3489 u64 page_end = start + PAGE_SIZE - 1; 3490 int ret; 3491 int nr = 0; 3492 size_t pg_offset = 0; 3493 loff_t i_size = i_size_read(inode); 3494 unsigned long end_index = i_size >> PAGE_SHIFT; 3495 int write_flags = 0; 3496 unsigned long nr_written = 0; 3497 3498 if (wbc->sync_mode == WB_SYNC_ALL) 3499 write_flags = REQ_SYNC; 3500 3501 trace___extent_writepage(page, inode, wbc); 3502 3503 WARN_ON(!PageLocked(page)); 3504 3505 ClearPageError(page); 3506 3507 pg_offset = i_size & (PAGE_SIZE - 1); 3508 if (page->index > end_index || 3509 (page->index == end_index && !pg_offset)) { 3510 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3511 unlock_page(page); 3512 return 0; 3513 } 3514 3515 if (page->index == end_index) { 3516 char *userpage; 3517 3518 userpage = kmap_atomic(page); 3519 memset(userpage + pg_offset, 0, 3520 PAGE_SIZE - pg_offset); 3521 kunmap_atomic(userpage); 3522 flush_dcache_page(page); 3523 } 3524 3525 pg_offset = 0; 3526 3527 set_page_extent_mapped(page); 3528 3529 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3530 if (ret == 1) 3531 goto done_unlocked; 3532 if (ret) 3533 goto done; 3534 3535 ret = __extent_writepage_io(inode, page, wbc, epd, 3536 i_size, nr_written, write_flags, &nr); 3537 if (ret == 1) 3538 goto done_unlocked; 3539 3540 done: 3541 if (nr == 0) { 3542 /* make sure the mapping tag for page dirty gets cleared */ 3543 set_page_writeback(page); 3544 end_page_writeback(page); 3545 } 3546 if (PageError(page)) { 3547 ret = ret < 0 ? 
ret : -EIO; 3548 end_extent_writepage(page, ret, start, page_end); 3549 } 3550 unlock_page(page); 3551 return ret; 3552 3553 done_unlocked: 3554 return 0; 3555 } 3556 3557 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3558 { 3559 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3560 TASK_UNINTERRUPTIBLE); 3561 } 3562 3563 static noinline_for_stack int 3564 lock_extent_buffer_for_io(struct extent_buffer *eb, 3565 struct btrfs_fs_info *fs_info, 3566 struct extent_page_data *epd) 3567 { 3568 unsigned long i, num_pages; 3569 int flush = 0; 3570 int ret = 0; 3571 3572 if (!btrfs_try_tree_write_lock(eb)) { 3573 flush = 1; 3574 flush_write_bio(epd); 3575 btrfs_tree_lock(eb); 3576 } 3577 3578 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3579 btrfs_tree_unlock(eb); 3580 if (!epd->sync_io) 3581 return 0; 3582 if (!flush) { 3583 flush_write_bio(epd); 3584 flush = 1; 3585 } 3586 while (1) { 3587 wait_on_extent_buffer_writeback(eb); 3588 btrfs_tree_lock(eb); 3589 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3590 break; 3591 btrfs_tree_unlock(eb); 3592 } 3593 } 3594 3595 /* 3596 * We need to do this to prevent races in people who check if the eb is 3597 * under IO since we can end up having no IO bits set for a short period 3598 * of time. 3599 */ 3600 spin_lock(&eb->refs_lock); 3601 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3602 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3603 spin_unlock(&eb->refs_lock); 3604 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3605 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3606 -eb->len, 3607 fs_info->dirty_metadata_batch); 3608 ret = 1; 3609 } else { 3610 spin_unlock(&eb->refs_lock); 3611 } 3612 3613 btrfs_tree_unlock(eb); 3614 3615 if (!ret) 3616 return ret; 3617 3618 num_pages = num_extent_pages(eb->start, eb->len); 3619 for (i = 0; i < num_pages; i++) { 3620 struct page *p = eb->pages[i]; 3621 3622 if (!trylock_page(p)) { 3623 if (!flush) { 3624 flush_write_bio(epd); 3625 flush = 1; 3626 } 3627 lock_page(p); 3628 } 3629 } 3630 3631 return ret; 3632 } 3633 3634 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3635 { 3636 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3637 smp_mb__after_atomic(); 3638 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3639 } 3640 3641 static void set_btree_ioerr(struct page *page) 3642 { 3643 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3644 3645 SetPageError(page); 3646 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3647 return; 3648 3649 /* 3650 * If writeback for a btree extent that doesn't belong to a log tree 3651 * failed, increment the counter transaction->eb_write_errors. 3652 * We do this because while the transaction is running and before it's 3653 * committing (when we call filemap_fdata[write|wait]_range against 3654 * the btree inode), we might have 3655 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3656 * returns an error or an error happens during writeback, when we're 3657 * committing the transaction we wouldn't know about it, since the pages 3658 * can be no longer dirty nor marked anymore for writeback (if a 3659 * subsequent modification to the extent buffer didn't happen before the 3660 * transaction commit), which makes filemap_fdata[write|wait]_range not 3661 * able to find the pages tagged with SetPageError at transaction 3662 * commit time. 
So if this happens we must abort the transaction, 3663 * otherwise we commit a super block with btree roots that point to 3664 * btree nodes/leafs whose content on disk is invalid - either garbage 3665 * or the content of some node/leaf from a past generation that got 3666 * cowed or deleted and is no longer valid. 3667 * 3668 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3669 * not be enough - we need to distinguish between log tree extents vs 3670 * non-log tree extents, and the next filemap_fdatawait_range() call 3671 * will catch and clear such errors in the mapping - and that call might 3672 * be from a log sync and not from a transaction commit. Also, checking 3673 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3674 * not done and would not be reliable - the eb might have been released 3675 * from memory and reading it back again means that flag would not be 3676 * set (since it's a runtime flag, not persisted on disk). 3677 * 3678 * Using the flags below in the btree inode also makes us achieve the 3679 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3680 * writeback for all dirty pages and before filemap_fdatawait_range() 3681 * is called, the writeback for all dirty pages had already finished 3682 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3683 * filemap_fdatawait_range() would return success, as it could not know 3684 * that writeback errors happened (the pages were no longer tagged for 3685 * writeback). 3686 */ 3687 switch (eb->log_index) { 3688 case -1: 3689 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3690 break; 3691 case 0: 3692 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3693 break; 3694 case 1: 3695 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3696 break; 3697 default: 3698 BUG(); /* unexpected, logic error */ 3699 } 3700 } 3701 3702 static void end_bio_extent_buffer_writepage(struct bio *bio) 3703 { 3704 struct bio_vec *bvec; 3705 struct extent_buffer *eb; 3706 int i, done; 3707 3708 bio_for_each_segment_all(bvec, bio, i) { 3709 struct page *page = bvec->bv_page; 3710 3711 eb = (struct extent_buffer *)page->private; 3712 BUG_ON(!eb); 3713 done = atomic_dec_and_test(&eb->io_pages); 3714 3715 if (bio->bi_status || 3716 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3717 ClearPageUptodate(page); 3718 set_btree_ioerr(page); 3719 } 3720 3721 end_page_writeback(page); 3722 3723 if (!done) 3724 continue; 3725 3726 end_extent_buffer_writeback(eb); 3727 } 3728 3729 bio_put(bio); 3730 } 3731 3732 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3733 struct btrfs_fs_info *fs_info, 3734 struct writeback_control *wbc, 3735 struct extent_page_data *epd) 3736 { 3737 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3738 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3739 u64 offset = eb->start; 3740 u32 nritems; 3741 unsigned long i, num_pages; 3742 unsigned long bio_flags = 0; 3743 unsigned long start, end; 3744 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3745 int ret = 0; 3746 3747 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3748 num_pages = num_extent_pages(eb->start, eb->len); 3749 atomic_set(&eb->io_pages, num_pages); 3750 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3751 bio_flags = EXTENT_BIO_TREE_LOG; 3752 3753 /* set btree blocks beyond nritems with 0 to avoid stale content. 
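	 * for a node that means everything past the last key pointer, for a
	 * leaf the unused gap between the item headers and the start of the
	 * item data, matching the two memzero_extent_buffer() calls below.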
*/ 3754 nritems = btrfs_header_nritems(eb); 3755 if (btrfs_header_level(eb) > 0) { 3756 end = btrfs_node_key_ptr_offset(nritems); 3757 3758 memzero_extent_buffer(eb, end, eb->len - end); 3759 } else { 3760 /* 3761 * leaf: 3762 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3763 */ 3764 start = btrfs_item_nr_offset(nritems); 3765 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3766 memzero_extent_buffer(eb, start, end - start); 3767 } 3768 3769 for (i = 0; i < num_pages; i++) { 3770 struct page *p = eb->pages[i]; 3771 3772 clear_page_dirty_for_io(p); 3773 set_page_writeback(p); 3774 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3775 p, offset >> 9, PAGE_SIZE, 0, bdev, 3776 &epd->bio, 3777 end_bio_extent_buffer_writepage, 3778 0, epd->bio_flags, bio_flags, false); 3779 epd->bio_flags = bio_flags; 3780 if (ret) { 3781 set_btree_ioerr(p); 3782 if (PageWriteback(p)) 3783 end_page_writeback(p); 3784 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3785 end_extent_buffer_writeback(eb); 3786 ret = -EIO; 3787 break; 3788 } 3789 offset += PAGE_SIZE; 3790 update_nr_written(wbc, 1); 3791 unlock_page(p); 3792 } 3793 3794 if (unlikely(ret)) { 3795 for (; i < num_pages; i++) { 3796 struct page *p = eb->pages[i]; 3797 clear_page_dirty_for_io(p); 3798 unlock_page(p); 3799 } 3800 } 3801 3802 return ret; 3803 } 3804 3805 int btree_write_cache_pages(struct address_space *mapping, 3806 struct writeback_control *wbc) 3807 { 3808 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3809 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3810 struct extent_buffer *eb, *prev_eb = NULL; 3811 struct extent_page_data epd = { 3812 .bio = NULL, 3813 .tree = tree, 3814 .extent_locked = 0, 3815 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3816 .bio_flags = 0, 3817 }; 3818 int ret = 0; 3819 int done = 0; 3820 int nr_to_write_done = 0; 3821 struct pagevec pvec; 3822 int nr_pages; 3823 pgoff_t index; 3824 pgoff_t end; /* Inclusive */ 3825 int scanned = 0; 3826 int tag; 3827 3828 pagevec_init(&pvec, 0); 3829 if (wbc->range_cyclic) { 3830 index = mapping->writeback_index; /* Start from prev offset */ 3831 end = -1; 3832 } else { 3833 index = wbc->range_start >> PAGE_SHIFT; 3834 end = wbc->range_end >> PAGE_SHIFT; 3835 scanned = 1; 3836 } 3837 if (wbc->sync_mode == WB_SYNC_ALL) 3838 tag = PAGECACHE_TAG_TOWRITE; 3839 else 3840 tag = PAGECACHE_TAG_DIRTY; 3841 retry: 3842 if (wbc->sync_mode == WB_SYNC_ALL) 3843 tag_pages_for_writeback(mapping, index, end); 3844 while (!done && !nr_to_write_done && (index <= end) && 3845 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3847 unsigned i; 3848 3849 scanned = 1; 3850 for (i = 0; i < nr_pages; i++) { 3851 struct page *page = pvec.pages[i]; 3852 3853 if (!PagePrivate(page)) 3854 continue; 3855 3856 if (!wbc->range_cyclic && page->index > end) { 3857 done = 1; 3858 break; 3859 } 3860 3861 spin_lock(&mapping->private_lock); 3862 if (!PagePrivate(page)) { 3863 spin_unlock(&mapping->private_lock); 3864 continue; 3865 } 3866 3867 eb = (struct extent_buffer *)page->private; 3868 3869 /* 3870 * Shouldn't happen and normally this would be a BUG_ON 3871 * but no sense in crashing the users box for something 3872 * we can survive anyway. 
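			 * (PagePrivate was re-checked under mapping->private_lock
			 * just above, so page->private is expected to point at a
			 * live extent buffer by the time we get here.)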
3873 */ 3874 if (WARN_ON(!eb)) { 3875 spin_unlock(&mapping->private_lock); 3876 continue; 3877 } 3878 3879 if (eb == prev_eb) { 3880 spin_unlock(&mapping->private_lock); 3881 continue; 3882 } 3883 3884 ret = atomic_inc_not_zero(&eb->refs); 3885 spin_unlock(&mapping->private_lock); 3886 if (!ret) 3887 continue; 3888 3889 prev_eb = eb; 3890 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3891 if (!ret) { 3892 free_extent_buffer(eb); 3893 continue; 3894 } 3895 3896 ret = write_one_eb(eb, fs_info, wbc, &epd); 3897 if (ret) { 3898 done = 1; 3899 free_extent_buffer(eb); 3900 break; 3901 } 3902 free_extent_buffer(eb); 3903 3904 /* 3905 * the filesystem may choose to bump up nr_to_write. 3906 * We have to make sure to honor the new nr_to_write 3907 * at any time 3908 */ 3909 nr_to_write_done = wbc->nr_to_write <= 0; 3910 } 3911 pagevec_release(&pvec); 3912 cond_resched(); 3913 } 3914 if (!scanned && !done) { 3915 /* 3916 * We hit the last page and there is more work to be done: wrap 3917 * back to the start of the file 3918 */ 3919 scanned = 1; 3920 index = 0; 3921 goto retry; 3922 } 3923 flush_write_bio(&epd); 3924 return ret; 3925 } 3926 3927 /** 3928 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3929 * @mapping: address space structure to write 3930 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3931 * @writepage: function called for each page 3932 * @data: data passed to writepage function 3933 * 3934 * If a page is already under I/O, write_cache_pages() skips it, even 3935 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3936 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3937 * and msync() need to guarantee that all the data which was dirty at the time 3938 * the call was made get new I/O started against them. If wbc->sync_mode is 3939 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3940 * existing IO to complete. 3941 */ 3942 static int extent_write_cache_pages(struct address_space *mapping, 3943 struct writeback_control *wbc, 3944 writepage_t writepage, void *data, 3945 void (*flush_fn)(void *)) 3946 { 3947 struct inode *inode = mapping->host; 3948 int ret = 0; 3949 int done = 0; 3950 int nr_to_write_done = 0; 3951 struct pagevec pvec; 3952 int nr_pages; 3953 pgoff_t index; 3954 pgoff_t end; /* Inclusive */ 3955 pgoff_t done_index; 3956 int range_whole = 0; 3957 int scanned = 0; 3958 int tag; 3959 3960 /* 3961 * We have to hold onto the inode so that ordered extents can do their 3962 * work when the IO finishes. The alternative to this is failing to add 3963 * an ordered extent if the igrab() fails there and that is a huge pain 3964 * to deal with, so instead just hold onto the inode throughout the 3965 * writepages operation. If it fails here we are freeing up the inode 3966 * anyway and we'd rather not waste our time writing out stuff that is 3967 * going to be truncated anyway. 
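	 *
	 * The reference taken with igrab() below is dropped at the end of
	 * this function via btrfs_add_delayed_iput(), which defers the final
	 * iput instead of running it here in the writeback path.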
3968 */ 3969 if (!igrab(inode)) 3970 return 0; 3971 3972 pagevec_init(&pvec, 0); 3973 if (wbc->range_cyclic) { 3974 index = mapping->writeback_index; /* Start from prev offset */ 3975 end = -1; 3976 } else { 3977 index = wbc->range_start >> PAGE_SHIFT; 3978 end = wbc->range_end >> PAGE_SHIFT; 3979 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3980 range_whole = 1; 3981 scanned = 1; 3982 } 3983 if (wbc->sync_mode == WB_SYNC_ALL) 3984 tag = PAGECACHE_TAG_TOWRITE; 3985 else 3986 tag = PAGECACHE_TAG_DIRTY; 3987 retry: 3988 if (wbc->sync_mode == WB_SYNC_ALL) 3989 tag_pages_for_writeback(mapping, index, end); 3990 done_index = index; 3991 while (!done && !nr_to_write_done && (index <= end) && 3992 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3993 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3994 unsigned i; 3995 3996 scanned = 1; 3997 for (i = 0; i < nr_pages; i++) { 3998 struct page *page = pvec.pages[i]; 3999 4000 done_index = page->index; 4001 /* 4002 * At this point we hold neither mapping->tree_lock nor 4003 * lock on the page itself: the page may be truncated or 4004 * invalidated (changing page->mapping to NULL), or even 4005 * swizzled back from swapper_space to tmpfs file 4006 * mapping 4007 */ 4008 if (!trylock_page(page)) { 4009 flush_fn(data); 4010 lock_page(page); 4011 } 4012 4013 if (unlikely(page->mapping != mapping)) { 4014 unlock_page(page); 4015 continue; 4016 } 4017 4018 if (!wbc->range_cyclic && page->index > end) { 4019 done = 1; 4020 unlock_page(page); 4021 continue; 4022 } 4023 4024 if (wbc->sync_mode != WB_SYNC_NONE) { 4025 if (PageWriteback(page)) 4026 flush_fn(data); 4027 wait_on_page_writeback(page); 4028 } 4029 4030 if (PageWriteback(page) || 4031 !clear_page_dirty_for_io(page)) { 4032 unlock_page(page); 4033 continue; 4034 } 4035 4036 ret = (*writepage)(page, wbc, data); 4037 4038 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4039 unlock_page(page); 4040 ret = 0; 4041 } 4042 if (ret < 0) { 4043 /* 4044 * done_index is set past this page, 4045 * so media errors will not choke 4046 * background writeout for the entire 4047 * file. This has consequences for 4048 * range_cyclic semantics (ie. it may 4049 * not be suitable for data integrity 4050 * writeout). 4051 */ 4052 done_index = page->index + 1; 4053 done = 1; 4054 break; 4055 } 4056 4057 /* 4058 * the filesystem may choose to bump up nr_to_write. 4059 * We have to make sure to honor the new nr_to_write 4060 * at any time 4061 */ 4062 nr_to_write_done = wbc->nr_to_write <= 0; 4063 } 4064 pagevec_release(&pvec); 4065 cond_resched(); 4066 } 4067 if (!scanned && !done) { 4068 /* 4069 * We hit the last page and there is more work to be done: wrap 4070 * back to the start of the file 4071 */ 4072 scanned = 1; 4073 index = 0; 4074 goto retry; 4075 } 4076 4077 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4078 mapping->writeback_index = done_index; 4079 4080 btrfs_add_delayed_iput(inode); 4081 return ret; 4082 } 4083 4084 static void flush_epd_write_bio(struct extent_page_data *epd) 4085 { 4086 if (epd->bio) { 4087 int ret; 4088 4089 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4090 epd->sync_io ? 
REQ_SYNC : 0); 4091 4092 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4093 BUG_ON(ret < 0); /* -ENOMEM */ 4094 epd->bio = NULL; 4095 } 4096 } 4097 4098 static noinline void flush_write_bio(void *data) 4099 { 4100 struct extent_page_data *epd = data; 4101 flush_epd_write_bio(epd); 4102 } 4103 4104 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4105 get_extent_t *get_extent, 4106 struct writeback_control *wbc) 4107 { 4108 int ret; 4109 struct extent_page_data epd = { 4110 .bio = NULL, 4111 .tree = tree, 4112 .get_extent = get_extent, 4113 .extent_locked = 0, 4114 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4115 .bio_flags = 0, 4116 }; 4117 4118 ret = __extent_writepage(page, wbc, &epd); 4119 4120 flush_epd_write_bio(&epd); 4121 return ret; 4122 } 4123 4124 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4125 u64 start, u64 end, get_extent_t *get_extent, 4126 int mode) 4127 { 4128 int ret = 0; 4129 struct address_space *mapping = inode->i_mapping; 4130 struct page *page; 4131 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4132 PAGE_SHIFT; 4133 4134 struct extent_page_data epd = { 4135 .bio = NULL, 4136 .tree = tree, 4137 .get_extent = get_extent, 4138 .extent_locked = 1, 4139 .sync_io = mode == WB_SYNC_ALL, 4140 .bio_flags = 0, 4141 }; 4142 struct writeback_control wbc_writepages = { 4143 .sync_mode = mode, 4144 .nr_to_write = nr_pages * 2, 4145 .range_start = start, 4146 .range_end = end + 1, 4147 }; 4148 4149 while (start <= end) { 4150 page = find_get_page(mapping, start >> PAGE_SHIFT); 4151 if (clear_page_dirty_for_io(page)) 4152 ret = __extent_writepage(page, &wbc_writepages, &epd); 4153 else { 4154 if (tree->ops && tree->ops->writepage_end_io_hook) 4155 tree->ops->writepage_end_io_hook(page, start, 4156 start + PAGE_SIZE - 1, 4157 NULL, 1); 4158 unlock_page(page); 4159 } 4160 put_page(page); 4161 start += PAGE_SIZE; 4162 } 4163 4164 flush_epd_write_bio(&epd); 4165 return ret; 4166 } 4167 4168 int extent_writepages(struct extent_io_tree *tree, 4169 struct address_space *mapping, 4170 get_extent_t *get_extent, 4171 struct writeback_control *wbc) 4172 { 4173 int ret = 0; 4174 struct extent_page_data epd = { 4175 .bio = NULL, 4176 .tree = tree, 4177 .get_extent = get_extent, 4178 .extent_locked = 0, 4179 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4180 .bio_flags = 0, 4181 }; 4182 4183 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, 4184 flush_write_bio); 4185 flush_epd_write_bio(&epd); 4186 return ret; 4187 } 4188 4189 int extent_readpages(struct extent_io_tree *tree, 4190 struct address_space *mapping, 4191 struct list_head *pages, unsigned nr_pages, 4192 get_extent_t get_extent) 4193 { 4194 struct bio *bio = NULL; 4195 unsigned page_idx; 4196 unsigned long bio_flags = 0; 4197 struct page *pagepool[16]; 4198 struct page *page; 4199 struct extent_map *em_cached = NULL; 4200 int nr = 0; 4201 u64 prev_em_start = (u64)-1; 4202 4203 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4204 page = list_entry(pages->prev, struct page, lru); 4205 4206 prefetchw(&page->flags); 4207 list_del(&page->lru); 4208 if (add_to_page_cache_lru(page, mapping, 4209 page->index, 4210 readahead_gfp_mask(mapping))) { 4211 put_page(page); 4212 continue; 4213 } 4214 4215 pagepool[nr++] = page; 4216 if (nr < ARRAY_SIZE(pagepool)) 4217 continue; 4218 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4219 &bio, 0, &bio_flags, &prev_em_start); 4220 nr = 0; 4221 } 4222 if (nr) 4223 __extent_readpages(tree, 
pagepool, nr, get_extent, &em_cached, 4224 &bio, 0, &bio_flags, &prev_em_start); 4225 4226 if (em_cached) 4227 free_extent_map(em_cached); 4228 4229 BUG_ON(!list_empty(pages)); 4230 if (bio) 4231 return submit_one_bio(bio, 0, bio_flags); 4232 return 0; 4233 } 4234 4235 /* 4236 * basic invalidatepage code, this waits on any locked or writeback 4237 * ranges corresponding to the page, and then deletes any extent state 4238 * records from the tree 4239 */ 4240 int extent_invalidatepage(struct extent_io_tree *tree, 4241 struct page *page, unsigned long offset) 4242 { 4243 struct extent_state *cached_state = NULL; 4244 u64 start = page_offset(page); 4245 u64 end = start + PAGE_SIZE - 1; 4246 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4247 4248 start += ALIGN(offset, blocksize); 4249 if (start > end) 4250 return 0; 4251 4252 lock_extent_bits(tree, start, end, &cached_state); 4253 wait_on_page_writeback(page); 4254 clear_extent_bit(tree, start, end, 4255 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4256 EXTENT_DO_ACCOUNTING, 4257 1, 1, &cached_state, GFP_NOFS); 4258 return 0; 4259 } 4260 4261 /* 4262 * a helper for releasepage, this tests for areas of the page that 4263 * are locked or under IO and drops the related state bits if it is safe 4264 * to drop the page. 4265 */ 4266 static int try_release_extent_state(struct extent_map_tree *map, 4267 struct extent_io_tree *tree, 4268 struct page *page, gfp_t mask) 4269 { 4270 u64 start = page_offset(page); 4271 u64 end = start + PAGE_SIZE - 1; 4272 int ret = 1; 4273 4274 if (test_range_bit(tree, start, end, 4275 EXTENT_IOBITS, 0, NULL)) 4276 ret = 0; 4277 else { 4278 /* 4279 * at this point we can safely clear everything except the 4280 * locked bit and the nodatasum bit 4281 */ 4282 ret = clear_extent_bit(tree, start, end, 4283 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4284 0, 0, NULL, mask); 4285 4286 /* if clear_extent_bit failed for enomem reasons, 4287 * we can't allow the release to continue. 4288 */ 4289 if (ret < 0) 4290 ret = 0; 4291 else 4292 ret = 1; 4293 } 4294 return ret; 4295 } 4296 4297 /* 4298 * a helper for releasepage. As long as there are no locked extents 4299 * in the range corresponding to the page, both state records and extent 4300 * map records are removed 4301 */ 4302 int try_release_extent_mapping(struct extent_map_tree *map, 4303 struct extent_io_tree *tree, struct page *page, 4304 gfp_t mask) 4305 { 4306 struct extent_map *em; 4307 u64 start = page_offset(page); 4308 u64 end = start + PAGE_SIZE - 1; 4309 4310 if (gfpflags_allow_blocking(mask) && 4311 page->mapping->host->i_size > SZ_16M) { 4312 u64 len; 4313 while (start <= end) { 4314 len = end - start + 1; 4315 write_lock(&map->lock); 4316 em = lookup_extent_mapping(map, start, len); 4317 if (!em) { 4318 write_unlock(&map->lock); 4319 break; 4320 } 4321 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4322 em->start != start) { 4323 write_unlock(&map->lock); 4324 free_extent_map(em); 4325 break; 4326 } 4327 if (!test_range_bit(tree, em->start, 4328 extent_map_end(em) - 1, 4329 EXTENT_LOCKED | EXTENT_WRITEBACK, 4330 0, NULL)) { 4331 remove_extent_mapping(map, em); 4332 /* once for the rb tree */ 4333 free_extent_map(em); 4334 } 4335 start = extent_map_end(em); 4336 write_unlock(&map->lock); 4337 4338 /* once for us */ 4339 free_extent_map(em); 4340 } 4341 } 4342 return try_release_extent_state(map, tree, page, mask); 4343 } 4344 4345 /* 4346 * helper function for fiemap, which doesn't want to see any holes. 
4347 * This maps until we find something past 'last' 4348 */ 4349 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4350 u64 offset, 4351 u64 last, 4352 get_extent_t *get_extent) 4353 { 4354 u64 sectorsize = btrfs_inode_sectorsize(inode); 4355 struct extent_map *em; 4356 u64 len; 4357 4358 if (offset >= last) 4359 return NULL; 4360 4361 while (1) { 4362 len = last - offset; 4363 if (len == 0) 4364 break; 4365 len = ALIGN(len, sectorsize); 4366 em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); 4367 if (IS_ERR_OR_NULL(em)) 4368 return em; 4369 4370 /* if this isn't a hole return it */ 4371 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4372 em->block_start != EXTENT_MAP_HOLE) { 4373 return em; 4374 } 4375 4376 /* this is a hole, advance to the next extent */ 4377 offset = extent_map_end(em); 4378 free_extent_map(em); 4379 if (offset >= last) 4380 break; 4381 } 4382 return NULL; 4383 } 4384 4385 /* 4386 * To cache the previous fiemap extent 4387 * 4388 * Will be used for merging fiemap extents 4389 */ 4390 struct fiemap_cache { 4391 u64 offset; 4392 u64 phys; 4393 u64 len; 4394 u32 flags; 4395 bool cached; 4396 }; 4397 4398 /* 4399 * Helper to submit a fiemap extent. 4400 * 4401 * Will try to merge the current fiemap extent specified by @offset, @phys, 4402 * @len and @flags with the cached one. 4403 * Only when the merge fails is the cached extent submitted as a 4404 * fiemap extent. 4405 * 4406 * Return value is the same as fiemap_fill_next_extent(). 4407 */ 4408 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 4409 struct fiemap_cache *cache, 4410 u64 offset, u64 phys, u64 len, u32 flags) 4411 { 4412 int ret = 0; 4413 4414 if (!cache->cached) 4415 goto assign; 4416 4417 /* 4418 * Sanity check, extent_fiemap() should have ensured that the new 4419 * fiemap extent won't overlap with the cached one. 4420 * Not recoverable. 4421 * 4422 * NOTE: Physical addresses can overlap, due to compression 4423 */ 4424 if (cache->offset + cache->len > offset) { 4425 WARN_ON(1); 4426 return -EINVAL; 4427 } 4428 4429 /* 4430 * Only merge fiemap extents if 4431 * 1) Their logical addresses are contiguous 4432 * 4433 * 2) Their physical addresses are contiguous 4434 * So truly compressed (physical size smaller than logical size) 4435 * extents won't get merged with each other 4436 * 4437 * 3) They share the same flags except FIEMAP_EXTENT_LAST 4438 * So a regular extent won't get merged with a prealloc extent 4439 */ 4440 if (cache->offset + cache->len == offset && 4441 cache->phys + cache->len == phys && 4442 (cache->flags & ~FIEMAP_EXTENT_LAST) == 4443 (flags & ~FIEMAP_EXTENT_LAST)) { 4444 cache->len += len; 4445 cache->flags |= flags; 4446 goto try_submit_last; 4447 } 4448 4449 /* Not mergeable, need to submit the cached one */ 4450 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4451 cache->len, cache->flags); 4452 cache->cached = false; 4453 if (ret) 4454 return ret; 4455 assign: 4456 cache->cached = true; 4457 cache->offset = offset; 4458 cache->phys = phys; 4459 cache->len = len; 4460 cache->flags = flags; 4461 try_submit_last: 4462 if (cache->flags & FIEMAP_EXTENT_LAST) { 4463 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 4464 cache->phys, cache->len, cache->flags); 4465 cache->cached = false; 4466 } 4467 return ret; 4468 } 4469 4470 /* 4471 * Sanity check for the fiemap cache 4472 * 4473 * All cached fiemap extents should have been submitted by emit_fiemap_extent(). 4474 * Iteration should be terminated either by the last fiemap extent or by 4475 * fieinfo->fi_extents_max.
4476 * So no cached fiemap extent should remain. 4477 */ 4478 static int check_fiemap_cache(struct btrfs_fs_info *fs_info, 4479 struct fiemap_extent_info *fieinfo, 4480 struct fiemap_cache *cache) 4481 { 4482 int ret; 4483 4484 if (!cache->cached) 4485 return 0; 4486 4487 /* Small and recoverable problem, only to inform the developer */ 4488 #ifdef CONFIG_BTRFS_DEBUG 4489 WARN_ON(1); 4490 #endif 4491 btrfs_warn(fs_info, 4492 "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x", 4493 cache->offset, cache->phys, cache->len, cache->flags); 4494 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4495 cache->len, cache->flags); 4496 cache->cached = false; 4497 if (ret > 0) 4498 ret = 0; 4499 return ret; 4500 } 4501 4502 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4503 __u64 start, __u64 len, get_extent_t *get_extent) 4504 { 4505 int ret = 0; 4506 u64 off = start; 4507 u64 max = start + len; 4508 u32 flags = 0; 4509 u32 found_type; 4510 u64 last; 4511 u64 last_for_get_extent = 0; 4512 u64 disko = 0; 4513 u64 isize = i_size_read(inode); 4514 struct btrfs_key found_key; 4515 struct extent_map *em = NULL; 4516 struct extent_state *cached_state = NULL; 4517 struct btrfs_path *path; 4518 struct btrfs_root *root = BTRFS_I(inode)->root; 4519 struct fiemap_cache cache = { 0 }; 4520 int end = 0; 4521 u64 em_start = 0; 4522 u64 em_len = 0; 4523 u64 em_end = 0; 4524 4525 if (len == 0) 4526 return -EINVAL; 4527 4528 path = btrfs_alloc_path(); 4529 if (!path) 4530 return -ENOMEM; 4531 path->leave_spinning = 1; 4532 4533 start = round_down(start, btrfs_inode_sectorsize(inode)); 4534 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4535 4536 /* 4537 * lookup the last file extent. We're not using i_size here 4538 * because there might be preallocation past i_size 4539 */ 4540 ret = btrfs_lookup_file_extent(NULL, root, path, 4541 btrfs_ino(BTRFS_I(inode)), -1, 0); 4542 if (ret < 0) { 4543 btrfs_free_path(path); 4544 return ret; 4545 } else { 4546 WARN_ON(!ret); 4547 if (ret == 1) 4548 ret = 0; 4549 } 4550 4551 path->slots[0]--; 4552 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4553 found_type = found_key.type; 4554 4555 /* No extents, but there might be delalloc bits */ 4556 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4557 found_type != BTRFS_EXTENT_DATA_KEY) { 4558 /* have to trust i_size as the end */ 4559 last = (u64)-1; 4560 last_for_get_extent = isize; 4561 } else { 4562 /* 4563 * remember the start of the last extent. There are a 4564 * bunch of different factors that go into the length of the 4565 * extent, so it's much less complex to remember where it started 4566 */ 4567 last = found_key.offset; 4568 last_for_get_extent = last + 1; 4569 } 4570 btrfs_release_path(path); 4571 4572 /* 4573 * we might have some extents allocated but more delalloc past those 4574 * extents.
so, we trust isize unless the start of the last extent is 4575 * beyond isize 4576 */ 4577 if (last < isize) { 4578 last = (u64)-1; 4579 last_for_get_extent = isize; 4580 } 4581 4582 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4583 &cached_state); 4584 4585 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4586 get_extent); 4587 if (!em) 4588 goto out; 4589 if (IS_ERR(em)) { 4590 ret = PTR_ERR(em); 4591 goto out; 4592 } 4593 4594 while (!end) { 4595 u64 offset_in_extent = 0; 4596 4597 /* break if the extent we found is outside the range */ 4598 if (em->start >= max || extent_map_end(em) < off) 4599 break; 4600 4601 /* 4602 * get_extent may return an extent that starts before our 4603 * requested range. We have to make sure the ranges 4604 * we return to fiemap always move forward and don't 4605 * overlap, so adjust the offsets here 4606 */ 4607 em_start = max(em->start, off); 4608 4609 /* 4610 * record the offset from the start of the extent 4611 * for adjusting the disk offset below. Only do this if the 4612 * extent isn't compressed since our in ram offset may be past 4613 * what we have actually allocated on disk. 4614 */ 4615 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4616 offset_in_extent = em_start - em->start; 4617 em_end = extent_map_end(em); 4618 em_len = em_end - em_start; 4619 disko = 0; 4620 flags = 0; 4621 4622 /* 4623 * bump off for our next call to get_extent 4624 */ 4625 off = extent_map_end(em); 4626 if (off >= max) 4627 end = 1; 4628 4629 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4630 end = 1; 4631 flags |= FIEMAP_EXTENT_LAST; 4632 } else if (em->block_start == EXTENT_MAP_INLINE) { 4633 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4634 FIEMAP_EXTENT_NOT_ALIGNED); 4635 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4636 flags |= (FIEMAP_EXTENT_DELALLOC | 4637 FIEMAP_EXTENT_UNKNOWN); 4638 } else if (fieinfo->fi_extents_max) { 4639 struct btrfs_trans_handle *trans; 4640 4641 u64 bytenr = em->block_start - 4642 (em->start - em->orig_start); 4643 4644 disko = em->block_start + offset_in_extent; 4645 4646 /* 4647 * We need a trans handle to get delayed refs 4648 */ 4649 trans = btrfs_join_transaction(root); 4650 /* 4651 * It's OK if we can't start a trans we can still check 4652 * from commit_root 4653 */ 4654 if (IS_ERR(trans)) 4655 trans = NULL; 4656 4657 /* 4658 * As btrfs supports shared space, this information 4659 * can be exported to userspace tools via 4660 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4661 * then we're just getting a count and we can skip the 4662 * lookup stuff. 4663 */ 4664 ret = btrfs_check_shared(trans, root->fs_info, 4665 root->objectid, 4666 btrfs_ino(BTRFS_I(inode)), bytenr); 4667 if (trans) 4668 btrfs_end_transaction(trans); 4669 if (ret < 0) 4670 goto out_free; 4671 if (ret) 4672 flags |= FIEMAP_EXTENT_SHARED; 4673 ret = 0; 4674 } 4675 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4676 flags |= FIEMAP_EXTENT_ENCODED; 4677 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4678 flags |= FIEMAP_EXTENT_UNWRITTEN; 4679 4680 free_extent_map(em); 4681 em = NULL; 4682 if ((em_start >= last) || em_len == (u64)-1 || 4683 (last == (u64)-1 && isize <= em_end)) { 4684 flags |= FIEMAP_EXTENT_LAST; 4685 end = 1; 4686 } 4687 4688 /* now scan forward to see if this is really the last extent. 
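 * (get_extent_skip_holes() returns NULL once nothing but holes remains before last_for_get_extent, which is what allows FIEMAP_EXTENT_LAST to be set below.)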
*/ 4689 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4690 get_extent); 4691 if (IS_ERR(em)) { 4692 ret = PTR_ERR(em); 4693 goto out; 4694 } 4695 if (!em) { 4696 flags |= FIEMAP_EXTENT_LAST; 4697 end = 1; 4698 } 4699 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 4700 em_len, flags); 4701 if (ret) { 4702 if (ret == 1) 4703 ret = 0; 4704 goto out_free; 4705 } 4706 } 4707 out_free: 4708 if (!ret) 4709 ret = check_fiemap_cache(root->fs_info, fieinfo, &cache); 4710 free_extent_map(em); 4711 out: 4712 btrfs_free_path(path); 4713 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4714 &cached_state, GFP_NOFS); 4715 return ret; 4716 } 4717 4718 static void __free_extent_buffer(struct extent_buffer *eb) 4719 { 4720 btrfs_leak_debug_del(&eb->leak_list); 4721 kmem_cache_free(extent_buffer_cache, eb); 4722 } 4723 4724 int extent_buffer_under_io(struct extent_buffer *eb) 4725 { 4726 return (atomic_read(&eb->io_pages) || 4727 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4728 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4729 } 4730 4731 /* 4732 * Helper for releasing extent buffer page. 4733 */ 4734 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) 4735 { 4736 unsigned long index; 4737 struct page *page; 4738 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4739 4740 BUG_ON(extent_buffer_under_io(eb)); 4741 4742 index = num_extent_pages(eb->start, eb->len); 4743 if (index == 0) 4744 return; 4745 4746 do { 4747 index--; 4748 page = eb->pages[index]; 4749 if (!page) 4750 continue; 4751 if (mapped) 4752 spin_lock(&page->mapping->private_lock); 4753 /* 4754 * We do this since we'll remove the pages after we've 4755 * removed the eb from the radix tree, so we could race 4756 * and have this page now attached to the new eb. So 4757 * only clear page_private if it's still connected to 4758 * this eb. 4759 */ 4760 if (PagePrivate(page) && 4761 page->private == (unsigned long)eb) { 4762 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4763 BUG_ON(PageDirty(page)); 4764 BUG_ON(PageWriteback(page)); 4765 /* 4766 * We need to make sure we haven't be attached 4767 * to a new eb. 4768 */ 4769 ClearPagePrivate(page); 4770 set_page_private(page, 0); 4771 /* One for the page private */ 4772 put_page(page); 4773 } 4774 4775 if (mapped) 4776 spin_unlock(&page->mapping->private_lock); 4777 4778 /* One for when we allocated the page */ 4779 put_page(page); 4780 } while (index != 0); 4781 } 4782 4783 /* 4784 * Helper for releasing the extent buffer. 
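 * Drops the references this eb holds on its pages and then frees the extent_buffer structure itself.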
4785 */ 4786 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4787 { 4788 btrfs_release_extent_buffer_page(eb); 4789 __free_extent_buffer(eb); 4790 } 4791 4792 static struct extent_buffer * 4793 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4794 unsigned long len) 4795 { 4796 struct extent_buffer *eb = NULL; 4797 4798 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4799 eb->start = start; 4800 eb->len = len; 4801 eb->fs_info = fs_info; 4802 eb->bflags = 0; 4803 rwlock_init(&eb->lock); 4804 atomic_set(&eb->write_locks, 0); 4805 atomic_set(&eb->read_locks, 0); 4806 atomic_set(&eb->blocking_readers, 0); 4807 atomic_set(&eb->blocking_writers, 0); 4808 atomic_set(&eb->spinning_readers, 0); 4809 atomic_set(&eb->spinning_writers, 0); 4810 eb->lock_nested = 0; 4811 init_waitqueue_head(&eb->write_lock_wq); 4812 init_waitqueue_head(&eb->read_lock_wq); 4813 4814 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4815 4816 spin_lock_init(&eb->refs_lock); 4817 atomic_set(&eb->refs, 1); 4818 atomic_set(&eb->io_pages, 0); 4819 4820 /* 4821 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4822 */ 4823 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4824 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4825 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4826 4827 return eb; 4828 } 4829 4830 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4831 { 4832 unsigned long i; 4833 struct page *p; 4834 struct extent_buffer *new; 4835 unsigned long num_pages = num_extent_pages(src->start, src->len); 4836 4837 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4838 if (new == NULL) 4839 return NULL; 4840 4841 for (i = 0; i < num_pages; i++) { 4842 p = alloc_page(GFP_NOFS); 4843 if (!p) { 4844 btrfs_release_extent_buffer(new); 4845 return NULL; 4846 } 4847 attach_extent_buffer_page(new, p); 4848 WARN_ON(PageDirty(p)); 4849 SetPageUptodate(p); 4850 new->pages[i] = p; 4851 copy_page(page_address(p), page_address(src->pages[i])); 4852 } 4853 4854 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4855 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4856 4857 return new; 4858 } 4859 4860 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4861 u64 start, unsigned long len) 4862 { 4863 struct extent_buffer *eb; 4864 unsigned long num_pages; 4865 unsigned long i; 4866 4867 num_pages = num_extent_pages(start, len); 4868 4869 eb = __alloc_extent_buffer(fs_info, start, len); 4870 if (!eb) 4871 return NULL; 4872 4873 for (i = 0; i < num_pages; i++) { 4874 eb->pages[i] = alloc_page(GFP_NOFS); 4875 if (!eb->pages[i]) 4876 goto err; 4877 } 4878 set_extent_buffer_uptodate(eb); 4879 btrfs_set_header_nritems(eb, 0); 4880 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4881 4882 return eb; 4883 err: 4884 for (; i > 0; i--) 4885 __free_page(eb->pages[i - 1]); 4886 __free_extent_buffer(eb); 4887 return NULL; 4888 } 4889 4890 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4891 u64 start) 4892 { 4893 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4894 } 4895 4896 static void check_buffer_tree_ref(struct extent_buffer *eb) 4897 { 4898 int refs; 4899 /* the ref bit is tricky. We have to make sure it is set 4900 * if we have the buffer dirty. 
Otherwise the 4901 * code to free a buffer can end up dropping a dirty 4902 * page 4903 * 4904 * Once the ref bit is set, it won't go away while the 4905 * buffer is dirty or in writeback, and it also won't 4906 * go away while we have the reference count on the 4907 * eb bumped. 4908 * 4909 * We can't just set the ref bit without bumping the 4910 * ref on the eb because free_extent_buffer might 4911 * see the ref bit and try to clear it. If this happens 4912 * free_extent_buffer might end up dropping our original 4913 * ref by mistake and freeing the page before we are able 4914 * to add one more ref. 4915 * 4916 * So bump the ref count first, then set the bit. If someone 4917 * beat us to it, drop the ref we added. 4918 */ 4919 refs = atomic_read(&eb->refs); 4920 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4921 return; 4922 4923 spin_lock(&eb->refs_lock); 4924 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4925 atomic_inc(&eb->refs); 4926 spin_unlock(&eb->refs_lock); 4927 } 4928 4929 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4930 struct page *accessed) 4931 { 4932 unsigned long num_pages, i; 4933 4934 check_buffer_tree_ref(eb); 4935 4936 num_pages = num_extent_pages(eb->start, eb->len); 4937 for (i = 0; i < num_pages; i++) { 4938 struct page *p = eb->pages[i]; 4939 4940 if (p != accessed) 4941 mark_page_accessed(p); 4942 } 4943 } 4944 4945 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4946 u64 start) 4947 { 4948 struct extent_buffer *eb; 4949 4950 rcu_read_lock(); 4951 eb = radix_tree_lookup(&fs_info->buffer_radix, 4952 start >> PAGE_SHIFT); 4953 if (eb && atomic_inc_not_zero(&eb->refs)) { 4954 rcu_read_unlock(); 4955 /* 4956 * Lock our eb's refs_lock to avoid races with 4957 * free_extent_buffer. When we get our eb it might be flagged 4958 * with EXTENT_BUFFER_STALE and another task running 4959 * free_extent_buffer might have seen that flag set, 4960 * eb->refs == 2, that the buffer isn't under IO (dirty and 4961 * writeback flags not set) and it's still in the tree (flag 4962 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4963 * of decrementing the extent buffer's reference count twice. 4964 * So here we could race and increment the eb's reference count, 4965 * clear its stale flag, mark it as dirty and drop our reference 4966 * before the other task finishes executing free_extent_buffer, 4967 * which would later result in an attempt to free an extent 4968 * buffer that is dirty. 
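 * Taking and immediately releasing refs_lock below makes us wait for any such task that is currently inside free_extent_buffer()'s refs_lock-protected section before we hand out the buffer.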
4969 */ 4970 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4971 spin_lock(&eb->refs_lock); 4972 spin_unlock(&eb->refs_lock); 4973 } 4974 mark_extent_buffer_accessed(eb, NULL); 4975 return eb; 4976 } 4977 rcu_read_unlock(); 4978 4979 return NULL; 4980 } 4981 4982 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4983 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4984 u64 start) 4985 { 4986 struct extent_buffer *eb, *exists = NULL; 4987 int ret; 4988 4989 eb = find_extent_buffer(fs_info, start); 4990 if (eb) 4991 return eb; 4992 eb = alloc_dummy_extent_buffer(fs_info, start); 4993 if (!eb) 4994 return NULL; 4995 eb->fs_info = fs_info; 4996 again: 4997 ret = radix_tree_preload(GFP_NOFS); 4998 if (ret) 4999 goto free_eb; 5000 spin_lock(&fs_info->buffer_lock); 5001 ret = radix_tree_insert(&fs_info->buffer_radix, 5002 start >> PAGE_SHIFT, eb); 5003 spin_unlock(&fs_info->buffer_lock); 5004 radix_tree_preload_end(); 5005 if (ret == -EEXIST) { 5006 exists = find_extent_buffer(fs_info, start); 5007 if (exists) 5008 goto free_eb; 5009 else 5010 goto again; 5011 } 5012 check_buffer_tree_ref(eb); 5013 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5014 5015 /* 5016 * We will free dummy extent buffer's if they come into 5017 * free_extent_buffer with a ref count of 2, but if we are using this we 5018 * want the buffers to stay in memory until we're done with them, so 5019 * bump the ref count again. 5020 */ 5021 atomic_inc(&eb->refs); 5022 return eb; 5023 free_eb: 5024 btrfs_release_extent_buffer(eb); 5025 return exists; 5026 } 5027 #endif 5028 5029 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 5030 u64 start) 5031 { 5032 unsigned long len = fs_info->nodesize; 5033 unsigned long num_pages = num_extent_pages(start, len); 5034 unsigned long i; 5035 unsigned long index = start >> PAGE_SHIFT; 5036 struct extent_buffer *eb; 5037 struct extent_buffer *exists = NULL; 5038 struct page *p; 5039 struct address_space *mapping = fs_info->btree_inode->i_mapping; 5040 int uptodate = 1; 5041 int ret; 5042 5043 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 5044 btrfs_err(fs_info, "bad tree block start %llu", start); 5045 return ERR_PTR(-EINVAL); 5046 } 5047 5048 eb = find_extent_buffer(fs_info, start); 5049 if (eb) 5050 return eb; 5051 5052 eb = __alloc_extent_buffer(fs_info, start, len); 5053 if (!eb) 5054 return ERR_PTR(-ENOMEM); 5055 5056 for (i = 0; i < num_pages; i++, index++) { 5057 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 5058 if (!p) { 5059 exists = ERR_PTR(-ENOMEM); 5060 goto free_eb; 5061 } 5062 5063 spin_lock(&mapping->private_lock); 5064 if (PagePrivate(p)) { 5065 /* 5066 * We could have already allocated an eb for this page 5067 * and attached one so lets see if we can get a ref on 5068 * the existing eb, and if we can we know it's good and 5069 * we can just return that one, else we know we can just 5070 * overwrite page->private. 5071 */ 5072 exists = (struct extent_buffer *)p->private; 5073 if (atomic_inc_not_zero(&exists->refs)) { 5074 spin_unlock(&mapping->private_lock); 5075 unlock_page(p); 5076 put_page(p); 5077 mark_extent_buffer_accessed(exists, p); 5078 goto free_eb; 5079 } 5080 exists = NULL; 5081 5082 /* 5083 * Do this so attach doesn't complain and we need to 5084 * drop the ref the old guy had. 
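 * (The put_page() below drops the page reference that was held on behalf of the old extent buffer's page private.)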
5085 */ 5086 ClearPagePrivate(p); 5087 WARN_ON(PageDirty(p)); 5088 put_page(p); 5089 } 5090 attach_extent_buffer_page(eb, p); 5091 spin_unlock(&mapping->private_lock); 5092 WARN_ON(PageDirty(p)); 5093 eb->pages[i] = p; 5094 if (!PageUptodate(p)) 5095 uptodate = 0; 5096 5097 /* 5098 * see below about how we avoid a nasty race with release page 5099 * and why we unlock later 5100 */ 5101 } 5102 if (uptodate) 5103 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5104 again: 5105 ret = radix_tree_preload(GFP_NOFS); 5106 if (ret) { 5107 exists = ERR_PTR(ret); 5108 goto free_eb; 5109 } 5110 5111 spin_lock(&fs_info->buffer_lock); 5112 ret = radix_tree_insert(&fs_info->buffer_radix, 5113 start >> PAGE_SHIFT, eb); 5114 spin_unlock(&fs_info->buffer_lock); 5115 radix_tree_preload_end(); 5116 if (ret == -EEXIST) { 5117 exists = find_extent_buffer(fs_info, start); 5118 if (exists) 5119 goto free_eb; 5120 else 5121 goto again; 5122 } 5123 /* add one reference for the tree */ 5124 check_buffer_tree_ref(eb); 5125 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5126 5127 /* 5128 * there is a race where release page may have 5129 * tried to find this extent buffer in the radix 5130 * but failed. It will tell the VM it is safe to 5131 * reclaim the, and it will clear the page private bit. 5132 * We must make sure to set the page private bit properly 5133 * after the extent buffer is in the radix tree so 5134 * it doesn't get lost 5135 */ 5136 SetPageChecked(eb->pages[0]); 5137 for (i = 1; i < num_pages; i++) { 5138 p = eb->pages[i]; 5139 ClearPageChecked(p); 5140 unlock_page(p); 5141 } 5142 unlock_page(eb->pages[0]); 5143 return eb; 5144 5145 free_eb: 5146 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5147 for (i = 0; i < num_pages; i++) { 5148 if (eb->pages[i]) 5149 unlock_page(eb->pages[i]); 5150 } 5151 5152 btrfs_release_extent_buffer(eb); 5153 return exists; 5154 } 5155 5156 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5157 { 5158 struct extent_buffer *eb = 5159 container_of(head, struct extent_buffer, rcu_head); 5160 5161 __free_extent_buffer(eb); 5162 } 5163 5164 /* Expects to have eb->eb_lock already held */ 5165 static int release_extent_buffer(struct extent_buffer *eb) 5166 { 5167 WARN_ON(atomic_read(&eb->refs) == 0); 5168 if (atomic_dec_and_test(&eb->refs)) { 5169 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5170 struct btrfs_fs_info *fs_info = eb->fs_info; 5171 5172 spin_unlock(&eb->refs_lock); 5173 5174 spin_lock(&fs_info->buffer_lock); 5175 radix_tree_delete(&fs_info->buffer_radix, 5176 eb->start >> PAGE_SHIFT); 5177 spin_unlock(&fs_info->buffer_lock); 5178 } else { 5179 spin_unlock(&eb->refs_lock); 5180 } 5181 5182 /* Should be safe to release our pages at this point */ 5183 btrfs_release_extent_buffer_page(eb); 5184 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5185 if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { 5186 __free_extent_buffer(eb); 5187 return 1; 5188 } 5189 #endif 5190 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5191 return 1; 5192 } 5193 spin_unlock(&eb->refs_lock); 5194 5195 return 0; 5196 } 5197 5198 void free_extent_buffer(struct extent_buffer *eb) 5199 { 5200 int refs; 5201 int old; 5202 if (!eb) 5203 return; 5204 5205 while (1) { 5206 refs = atomic_read(&eb->refs); 5207 if (refs <= 3) 5208 break; 5209 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5210 if (old == refs) 5211 return; 5212 } 5213 5214 spin_lock(&eb->refs_lock); 5215 if (atomic_read(&eb->refs) == 2 && 5216 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 5217 
atomic_dec(&eb->refs); 5218 5219 if (atomic_read(&eb->refs) == 2 && 5220 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 5221 !extent_buffer_under_io(eb) && 5222 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5223 atomic_dec(&eb->refs); 5224 5225 /* 5226 * I know this is terrible, but it's temporary until we stop tracking 5227 * the uptodate bits and such for the extent buffers. 5228 */ 5229 release_extent_buffer(eb); 5230 } 5231 5232 void free_extent_buffer_stale(struct extent_buffer *eb) 5233 { 5234 if (!eb) 5235 return; 5236 5237 spin_lock(&eb->refs_lock); 5238 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5239 5240 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5241 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5242 atomic_dec(&eb->refs); 5243 release_extent_buffer(eb); 5244 } 5245 5246 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5247 { 5248 unsigned long i; 5249 unsigned long num_pages; 5250 struct page *page; 5251 5252 num_pages = num_extent_pages(eb->start, eb->len); 5253 5254 for (i = 0; i < num_pages; i++) { 5255 page = eb->pages[i]; 5256 if (!PageDirty(page)) 5257 continue; 5258 5259 lock_page(page); 5260 WARN_ON(!PagePrivate(page)); 5261 5262 clear_page_dirty_for_io(page); 5263 spin_lock_irq(&page->mapping->tree_lock); 5264 if (!PageDirty(page)) { 5265 radix_tree_tag_clear(&page->mapping->page_tree, 5266 page_index(page), 5267 PAGECACHE_TAG_DIRTY); 5268 } 5269 spin_unlock_irq(&page->mapping->tree_lock); 5270 ClearPageError(page); 5271 unlock_page(page); 5272 } 5273 WARN_ON(atomic_read(&eb->refs) == 0); 5274 } 5275 5276 int set_extent_buffer_dirty(struct extent_buffer *eb) 5277 { 5278 unsigned long i; 5279 unsigned long num_pages; 5280 int was_dirty = 0; 5281 5282 check_buffer_tree_ref(eb); 5283 5284 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5285 5286 num_pages = num_extent_pages(eb->start, eb->len); 5287 WARN_ON(atomic_read(&eb->refs) == 0); 5288 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5289 5290 for (i = 0; i < num_pages; i++) 5291 set_page_dirty(eb->pages[i]); 5292 return was_dirty; 5293 } 5294 5295 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5296 { 5297 unsigned long i; 5298 struct page *page; 5299 unsigned long num_pages; 5300 5301 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5302 num_pages = num_extent_pages(eb->start, eb->len); 5303 for (i = 0; i < num_pages; i++) { 5304 page = eb->pages[i]; 5305 if (page) 5306 ClearPageUptodate(page); 5307 } 5308 } 5309 5310 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5311 { 5312 unsigned long i; 5313 struct page *page; 5314 unsigned long num_pages; 5315 5316 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5317 num_pages = num_extent_pages(eb->start, eb->len); 5318 for (i = 0; i < num_pages; i++) { 5319 page = eb->pages[i]; 5320 SetPageUptodate(page); 5321 } 5322 } 5323 5324 int extent_buffer_uptodate(struct extent_buffer *eb) 5325 { 5326 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5327 } 5328 5329 int read_extent_buffer_pages(struct extent_io_tree *tree, 5330 struct extent_buffer *eb, int wait, 5331 get_extent_t *get_extent, int mirror_num) 5332 { 5333 unsigned long i; 5334 struct page *page; 5335 int err; 5336 int ret = 0; 5337 int locked_pages = 0; 5338 int all_uptodate = 1; 5339 unsigned long num_pages; 5340 unsigned long num_reads = 0; 5341 struct bio *bio = NULL; 5342 unsigned long bio_flags = 0; 5343 5344 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5345 return 0; 5346 5347 num_pages = 
num_extent_pages(eb->start, eb->len); 5348 for (i = 0; i < num_pages; i++) { 5349 page = eb->pages[i]; 5350 if (wait == WAIT_NONE) { 5351 if (!trylock_page(page)) 5352 goto unlock_exit; 5353 } else { 5354 lock_page(page); 5355 } 5356 locked_pages++; 5357 } 5358 /* 5359 * We need to firstly lock all pages to make sure that 5360 * the uptodate bit of our pages won't be affected by 5361 * clear_extent_buffer_uptodate(). 5362 */ 5363 for (i = 0; i < num_pages; i++) { 5364 page = eb->pages[i]; 5365 if (!PageUptodate(page)) { 5366 num_reads++; 5367 all_uptodate = 0; 5368 } 5369 } 5370 5371 if (all_uptodate) { 5372 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5373 goto unlock_exit; 5374 } 5375 5376 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5377 eb->read_mirror = 0; 5378 atomic_set(&eb->io_pages, num_reads); 5379 for (i = 0; i < num_pages; i++) { 5380 page = eb->pages[i]; 5381 5382 if (!PageUptodate(page)) { 5383 if (ret) { 5384 atomic_dec(&eb->io_pages); 5385 unlock_page(page); 5386 continue; 5387 } 5388 5389 ClearPageError(page); 5390 err = __extent_read_full_page(tree, page, 5391 get_extent, &bio, 5392 mirror_num, &bio_flags, 5393 REQ_META); 5394 if (err) { 5395 ret = err; 5396 /* 5397 * We use &bio in above __extent_read_full_page, 5398 * so we ensure that if it returns error, the 5399 * current page fails to add itself to bio and 5400 * it's been unlocked. 5401 * 5402 * We must dec io_pages by ourselves. 5403 */ 5404 atomic_dec(&eb->io_pages); 5405 } 5406 } else { 5407 unlock_page(page); 5408 } 5409 } 5410 5411 if (bio) { 5412 err = submit_one_bio(bio, mirror_num, bio_flags); 5413 if (err) 5414 return err; 5415 } 5416 5417 if (ret || wait != WAIT_COMPLETE) 5418 return ret; 5419 5420 for (i = 0; i < num_pages; i++) { 5421 page = eb->pages[i]; 5422 wait_on_page_locked(page); 5423 if (!PageUptodate(page)) 5424 ret = -EIO; 5425 } 5426 5427 return ret; 5428 5429 unlock_exit: 5430 while (locked_pages > 0) { 5431 locked_pages--; 5432 page = eb->pages[locked_pages]; 5433 unlock_page(page); 5434 } 5435 return ret; 5436 } 5437 5438 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 5439 unsigned long start, 5440 unsigned long len) 5441 { 5442 size_t cur; 5443 size_t offset; 5444 struct page *page; 5445 char *kaddr; 5446 char *dst = (char *)dstv; 5447 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5448 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5449 5450 WARN_ON(start > eb->len); 5451 WARN_ON(start + len > eb->start + eb->len); 5452 5453 offset = (start_offset + start) & (PAGE_SIZE - 1); 5454 5455 while (len > 0) { 5456 page = eb->pages[i]; 5457 5458 cur = min(len, (PAGE_SIZE - offset)); 5459 kaddr = page_address(page); 5460 memcpy(dst, kaddr + offset, cur); 5461 5462 dst += cur; 5463 len -= cur; 5464 offset = 0; 5465 i++; 5466 } 5467 } 5468 5469 int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, 5470 unsigned long start, 5471 unsigned long len) 5472 { 5473 size_t cur; 5474 size_t offset; 5475 struct page *page; 5476 char *kaddr; 5477 char __user *dst = (char __user *)dstv; 5478 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5479 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5480 int ret = 0; 5481 5482 WARN_ON(start > eb->len); 5483 WARN_ON(start + len > eb->start + eb->len); 5484 5485 offset = (start_offset + start) & (PAGE_SIZE - 1); 5486 5487 while (len > 0) { 5488 page = eb->pages[i]; 5489 5490 cur = min(len, (PAGE_SIZE - offset)); 5491 kaddr = page_address(page); 5492 if (copy_to_user(dst, kaddr + offset, cur)) { 
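/* copy_to_user() returns the number of bytes it could not copy, so treat any partial copy as a failure */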
5493 ret = -EFAULT; 5494 break; 5495 } 5496 5497 dst += cur; 5498 len -= cur; 5499 offset = 0; 5500 i++; 5501 } 5502 5503 return ret; 5504 } 5505 5506 /* 5507 * return 0 if the item is found within a page. 5508 * return 1 if the item spans two pages. 5509 * return -EINVAL otherwise. 5510 */ 5511 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5512 unsigned long min_len, char **map, 5513 unsigned long *map_start, 5514 unsigned long *map_len) 5515 { 5516 size_t offset = start & (PAGE_SIZE - 1); 5517 char *kaddr; 5518 struct page *p; 5519 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5520 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5521 unsigned long end_i = (start_offset + start + min_len - 1) >> 5522 PAGE_SHIFT; 5523 5524 if (i != end_i) 5525 return 1; 5526 5527 if (i == 0) { 5528 offset = start_offset; 5529 *map_start = 0; 5530 } else { 5531 offset = 0; 5532 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5533 } 5534 5535 if (start + min_len > eb->len) { 5536 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5537 eb->start, eb->len, start, min_len); 5538 return -EINVAL; 5539 } 5540 5541 p = eb->pages[i]; 5542 kaddr = page_address(p); 5543 *map = kaddr + offset; 5544 *map_len = PAGE_SIZE - offset; 5545 return 0; 5546 } 5547 5548 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 5549 unsigned long start, 5550 unsigned long len) 5551 { 5552 size_t cur; 5553 size_t offset; 5554 struct page *page; 5555 char *kaddr; 5556 char *ptr = (char *)ptrv; 5557 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5558 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5559 int ret = 0; 5560 5561 WARN_ON(start > eb->len); 5562 WARN_ON(start + len > eb->start + eb->len); 5563 5564 offset = (start_offset + start) & (PAGE_SIZE - 1); 5565 5566 while (len > 0) { 5567 page = eb->pages[i]; 5568 5569 cur = min(len, (PAGE_SIZE - offset)); 5570 5571 kaddr = page_address(page); 5572 ret = memcmp(ptr, kaddr + offset, cur); 5573 if (ret) 5574 break; 5575 5576 ptr += cur; 5577 len -= cur; 5578 offset = 0; 5579 i++; 5580 } 5581 return ret; 5582 } 5583 5584 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5585 const void *srcv) 5586 { 5587 char *kaddr; 5588 5589 WARN_ON(!PageUptodate(eb->pages[0])); 5590 kaddr = page_address(eb->pages[0]); 5591 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5592 BTRFS_FSID_SIZE); 5593 } 5594 5595 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5596 { 5597 char *kaddr; 5598 5599 WARN_ON(!PageUptodate(eb->pages[0])); 5600 kaddr = page_address(eb->pages[0]); 5601 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5602 BTRFS_FSID_SIZE); 5603 } 5604 5605 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5606 unsigned long start, unsigned long len) 5607 { 5608 size_t cur; 5609 size_t offset; 5610 struct page *page; 5611 char *kaddr; 5612 char *src = (char *)srcv; 5613 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5614 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5615 5616 WARN_ON(start > eb->len); 5617 WARN_ON(start + len > eb->start + eb->len); 5618 5619 offset = (start_offset + start) & (PAGE_SIZE - 1); 5620 5621 while (len > 0) { 5622 page = eb->pages[i]; 5623 WARN_ON(!PageUptodate(page)); 5624 5625 cur = min(len, PAGE_SIZE - offset); 5626 kaddr = page_address(page); 5627 memcpy(kaddr + offset, src, cur); 5628 5629 src += cur; 5630 len -= cur; 5631 offset = 0; 5632 i++; 
5633 } 5634 } 5635 5636 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5637 unsigned long len) 5638 { 5639 size_t cur; 5640 size_t offset; 5641 struct page *page; 5642 char *kaddr; 5643 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5644 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5645 5646 WARN_ON(start > eb->len); 5647 WARN_ON(start + len > eb->start + eb->len); 5648 5649 offset = (start_offset + start) & (PAGE_SIZE - 1); 5650 5651 while (len > 0) { 5652 page = eb->pages[i]; 5653 WARN_ON(!PageUptodate(page)); 5654 5655 cur = min(len, PAGE_SIZE - offset); 5656 kaddr = page_address(page); 5657 memset(kaddr + offset, 0, cur); 5658 5659 len -= cur; 5660 offset = 0; 5661 i++; 5662 } 5663 } 5664 5665 void copy_extent_buffer_full(struct extent_buffer *dst, 5666 struct extent_buffer *src) 5667 { 5668 int i; 5669 unsigned num_pages; 5670 5671 ASSERT(dst->len == src->len); 5672 5673 num_pages = num_extent_pages(dst->start, dst->len); 5674 for (i = 0; i < num_pages; i++) 5675 copy_page(page_address(dst->pages[i]), 5676 page_address(src->pages[i])); 5677 } 5678 5679 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5680 unsigned long dst_offset, unsigned long src_offset, 5681 unsigned long len) 5682 { 5683 u64 dst_len = dst->len; 5684 size_t cur; 5685 size_t offset; 5686 struct page *page; 5687 char *kaddr; 5688 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5689 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5690 5691 WARN_ON(src->len != dst_len); 5692 5693 offset = (start_offset + dst_offset) & 5694 (PAGE_SIZE - 1); 5695 5696 while (len > 0) { 5697 page = dst->pages[i]; 5698 WARN_ON(!PageUptodate(page)); 5699 5700 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5701 5702 kaddr = page_address(page); 5703 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5704 5705 src_offset += cur; 5706 len -= cur; 5707 offset = 0; 5708 i++; 5709 } 5710 } 5711 5712 void le_bitmap_set(u8 *map, unsigned int start, int len) 5713 { 5714 u8 *p = map + BIT_BYTE(start); 5715 const unsigned int size = start + len; 5716 int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5717 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); 5718 5719 while (len - bits_to_set >= 0) { 5720 *p |= mask_to_set; 5721 len -= bits_to_set; 5722 bits_to_set = BITS_PER_BYTE; 5723 mask_to_set = ~0; 5724 p++; 5725 } 5726 if (len) { 5727 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5728 *p |= mask_to_set; 5729 } 5730 } 5731 5732 void le_bitmap_clear(u8 *map, unsigned int start, int len) 5733 { 5734 u8 *p = map + BIT_BYTE(start); 5735 const unsigned int size = start + len; 5736 int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5737 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); 5738 5739 while (len - bits_to_clear >= 0) { 5740 *p &= ~mask_to_clear; 5741 len -= bits_to_clear; 5742 bits_to_clear = BITS_PER_BYTE; 5743 mask_to_clear = ~0; 5744 p++; 5745 } 5746 if (len) { 5747 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5748 *p &= ~mask_to_clear; 5749 } 5750 } 5751 5752 /* 5753 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5754 * given bit number 5755 * @eb: the extent buffer 5756 * @start: offset of the bitmap item in the extent buffer 5757 * @nr: bit number 5758 * @page_index: return index of the page in the extent buffer that contains the 5759 * given bit number 5760 * @page_offset: return offset into the page given by page_index 5761 * 5762 * This helper hides the ugliness of finding the 
byte in an extent buffer which 5763 * contains a given bit. 5764 */ 5765 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5766 unsigned long start, unsigned long nr, 5767 unsigned long *page_index, 5768 size_t *page_offset) 5769 { 5770 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5771 size_t byte_offset = BIT_BYTE(nr); 5772 size_t offset; 5773 5774 /* 5775 * The byte we want is the offset of the extent buffer + the offset of 5776 * the bitmap item in the extent buffer + the offset of the byte in the 5777 * bitmap item. 5778 */ 5779 offset = start_offset + start + byte_offset; 5780 5781 *page_index = offset >> PAGE_SHIFT; 5782 *page_offset = offset & (PAGE_SIZE - 1); 5783 } 5784 5785 /** 5786 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5787 * @eb: the extent buffer 5788 * @start: offset of the bitmap item in the extent buffer 5789 * @nr: bit number to test 5790 */ 5791 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5792 unsigned long nr) 5793 { 5794 u8 *kaddr; 5795 struct page *page; 5796 unsigned long i; 5797 size_t offset; 5798 5799 eb_bitmap_offset(eb, start, nr, &i, &offset); 5800 page = eb->pages[i]; 5801 WARN_ON(!PageUptodate(page)); 5802 kaddr = page_address(page); 5803 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5804 } 5805 5806 /** 5807 * extent_buffer_bitmap_set - set an area of a bitmap 5808 * @eb: the extent buffer 5809 * @start: offset of the bitmap item in the extent buffer 5810 * @pos: bit number of the first bit 5811 * @len: number of bits to set 5812 */ 5813 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5814 unsigned long pos, unsigned long len) 5815 { 5816 u8 *kaddr; 5817 struct page *page; 5818 unsigned long i; 5819 size_t offset; 5820 const unsigned int size = pos + len; 5821 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5822 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5823 5824 eb_bitmap_offset(eb, start, pos, &i, &offset); 5825 page = eb->pages[i]; 5826 WARN_ON(!PageUptodate(page)); 5827 kaddr = page_address(page); 5828 5829 while (len >= bits_to_set) { 5830 kaddr[offset] |= mask_to_set; 5831 len -= bits_to_set; 5832 bits_to_set = BITS_PER_BYTE; 5833 mask_to_set = ~0; 5834 if (++offset >= PAGE_SIZE && len > 0) { 5835 offset = 0; 5836 page = eb->pages[++i]; 5837 WARN_ON(!PageUptodate(page)); 5838 kaddr = page_address(page); 5839 } 5840 } 5841 if (len) { 5842 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5843 kaddr[offset] |= mask_to_set; 5844 } 5845 } 5846 5847 5848 /** 5849 * extent_buffer_bitmap_clear - clear an area of a bitmap 5850 * @eb: the extent buffer 5851 * @start: offset of the bitmap item in the extent buffer 5852 * @pos: bit number of the first bit 5853 * @len: number of bits to clear 5854 */ 5855 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5856 unsigned long pos, unsigned long len) 5857 { 5858 u8 *kaddr; 5859 struct page *page; 5860 unsigned long i; 5861 size_t offset; 5862 const unsigned int size = pos + len; 5863 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5864 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5865 5866 eb_bitmap_offset(eb, start, pos, &i, &offset); 5867 page = eb->pages[i]; 5868 WARN_ON(!PageUptodate(page)); 5869 kaddr = page_address(page); 5870 5871 while (len >= bits_to_clear) { 5872 kaddr[offset] &= ~mask_to_clear; 5873 len -= bits_to_clear; 5874 bits_to_clear = BITS_PER_BYTE; 5875 mask_to_clear = ~0; 5876 if (++offset >= PAGE_SIZE && len > 0) { 5877 
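/* we crossed a page boundary, move on to the next page of the eb */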
offset = 0; 5878 page = eb->pages[++i]; 5879 WARN_ON(!PageUptodate(page)); 5880 kaddr = page_address(page); 5881 } 5882 } 5883 if (len) { 5884 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5885 kaddr[offset] &= ~mask_to_clear; 5886 } 5887 } 5888 5889 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5890 { 5891 unsigned long distance = (src > dst) ? src - dst : dst - src; 5892 return distance < len; 5893 } 5894 5895 static void copy_pages(struct page *dst_page, struct page *src_page, 5896 unsigned long dst_off, unsigned long src_off, 5897 unsigned long len) 5898 { 5899 char *dst_kaddr = page_address(dst_page); 5900 char *src_kaddr; 5901 int must_memmove = 0; 5902 5903 if (dst_page != src_page) { 5904 src_kaddr = page_address(src_page); 5905 } else { 5906 src_kaddr = dst_kaddr; 5907 if (areas_overlap(src_off, dst_off, len)) 5908 must_memmove = 1; 5909 } 5910 5911 if (must_memmove) 5912 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5913 else 5914 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5915 } 5916 5917 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5918 unsigned long src_offset, unsigned long len) 5919 { 5920 struct btrfs_fs_info *fs_info = dst->fs_info; 5921 size_t cur; 5922 size_t dst_off_in_page; 5923 size_t src_off_in_page; 5924 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5925 unsigned long dst_i; 5926 unsigned long src_i; 5927 5928 if (src_offset + len > dst->len) { 5929 btrfs_err(fs_info, 5930 "memmove bogus src_offset %lu move len %lu dst len %lu", 5931 src_offset, len, dst->len); 5932 BUG_ON(1); 5933 } 5934 if (dst_offset + len > dst->len) { 5935 btrfs_err(fs_info, 5936 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5937 dst_offset, len, dst->len); 5938 BUG_ON(1); 5939 } 5940 5941 while (len > 0) { 5942 dst_off_in_page = (start_offset + dst_offset) & 5943 (PAGE_SIZE - 1); 5944 src_off_in_page = (start_offset + src_offset) & 5945 (PAGE_SIZE - 1); 5946 5947 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5948 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5949 5950 cur = min(len, (unsigned long)(PAGE_SIZE - 5951 src_off_in_page)); 5952 cur = min_t(unsigned long, cur, 5953 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5954 5955 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5956 dst_off_in_page, src_off_in_page, cur); 5957 5958 src_offset += cur; 5959 dst_offset += cur; 5960 len -= cur; 5961 } 5962 } 5963 5964 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5965 unsigned long src_offset, unsigned long len) 5966 { 5967 struct btrfs_fs_info *fs_info = dst->fs_info; 5968 size_t cur; 5969 size_t dst_off_in_page; 5970 size_t src_off_in_page; 5971 unsigned long dst_end = dst_offset + len - 1; 5972 unsigned long src_end = src_offset + len - 1; 5973 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5974 unsigned long dst_i; 5975 unsigned long src_i; 5976 5977 if (src_offset + len > dst->len) { 5978 btrfs_err(fs_info, 5979 "memmove bogus src_offset %lu move len %lu len %lu", 5980 src_offset, len, dst->len); 5981 BUG_ON(1); 5982 } 5983 if (dst_offset + len > dst->len) { 5984 btrfs_err(fs_info, 5985 "memmove bogus dst_offset %lu move len %lu len %lu", 5986 dst_offset, len, dst->len); 5987 BUG_ON(1); 5988 } 5989 if (dst_offset < src_offset) { 5990 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5991 return; 5992 } 5993 while (len > 0) { 5994 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5995 src_i = (start_offset + 
src_end) >> PAGE_SHIFT; 5996 5997 dst_off_in_page = (start_offset + dst_end) & 5998 (PAGE_SIZE - 1); 5999 src_off_in_page = (start_offset + src_end) & 6000 (PAGE_SIZE - 1); 6001 6002 cur = min_t(unsigned long, len, src_off_in_page + 1); 6003 cur = min(cur, dst_off_in_page + 1); 6004 copy_pages(dst->pages[dst_i], dst->pages[src_i], 6005 dst_off_in_page - cur + 1, 6006 src_off_in_page - cur + 1, cur); 6007 6008 dst_end -= cur; 6009 src_end -= cur; 6010 len -= cur; 6011 } 6012 } 6013 6014 int try_release_extent_buffer(struct page *page) 6015 { 6016 struct extent_buffer *eb; 6017 6018 /* 6019 * We need to make sure nobody is attaching this page to an eb right 6020 * now. 6021 */ 6022 spin_lock(&page->mapping->private_lock); 6023 if (!PagePrivate(page)) { 6024 spin_unlock(&page->mapping->private_lock); 6025 return 1; 6026 } 6027 6028 eb = (struct extent_buffer *)page->private; 6029 BUG_ON(!eb); 6030 6031 /* 6032 * This is a little awful but should be ok, we need to make sure that 6033 * the eb doesn't disappear out from under us while we're looking at 6034 * this page. 6035 */ 6036 spin_lock(&eb->refs_lock); 6037 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 6038 spin_unlock(&eb->refs_lock); 6039 spin_unlock(&page->mapping->private_lock); 6040 return 0; 6041 } 6042 spin_unlock(&page->mapping->private_lock); 6043 6044 /* 6045 * If tree ref isn't set then we know the ref on this eb is a real ref, 6046 * so just return, this page will likely be freed soon anyway. 6047 */ 6048 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 6049 spin_unlock(&eb->refs_lock); 6050 return 0; 6051 } 6052 6053 return release_extent_buffer(eb); 6054 } 6055