xref: /linux/fs/btrfs/extent_io.c (revision b43ab901d671e3e3cad425ea5e9a3c74e266dcdd)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21 #include "check-integrity.h"
22 
23 static struct kmem_cache *extent_state_cache;
24 static struct kmem_cache *extent_buffer_cache;
25 
26 static LIST_HEAD(buffers);
27 static LIST_HEAD(states);
28 
29 #define LEAK_DEBUG 0
30 #if LEAK_DEBUG
31 static DEFINE_SPINLOCK(leak_lock);
32 #endif
33 
34 #define BUFFER_LRU_MAX 64
35 
36 struct tree_entry {
37 	u64 start;
38 	u64 end;
39 	struct rb_node rb_node;
40 };
41 
42 struct extent_page_data {
43 	struct bio *bio;
44 	struct extent_io_tree *tree;
45 	get_extent_t *get_extent;
46 
47 	/* tells writepage not to lock the state bits for this range;
48 	 * it still does the unlocking
49 	 */
50 	unsigned int extent_locked:1;
51 
52 	/* tells the submit_bio code to use a WRITE_SYNC */
53 	unsigned int sync_io:1;
54 };
55 
56 int __init extent_io_init(void)
57 {
58 	extent_state_cache = kmem_cache_create("extent_state",
59 			sizeof(struct extent_state), 0,
60 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
61 	if (!extent_state_cache)
62 		return -ENOMEM;
63 
64 	extent_buffer_cache = kmem_cache_create("extent_buffers",
65 			sizeof(struct extent_buffer), 0,
66 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
67 	if (!extent_buffer_cache)
68 		goto free_state_cache;
69 	return 0;
70 
71 free_state_cache:
72 	kmem_cache_destroy(extent_state_cache);
73 	return -ENOMEM;
74 }
75 
76 void extent_io_exit(void)
77 {
78 	struct extent_state *state;
79 	struct extent_buffer *eb;
80 
81 	while (!list_empty(&states)) {
82 		state = list_entry(states.next, struct extent_state, leak_list);
83 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
84 		       "state %lu in tree %p refs %d\n",
85 		       (unsigned long long)state->start,
86 		       (unsigned long long)state->end,
87 		       state->state, state->tree, atomic_read(&state->refs));
88 		list_del(&state->leak_list);
89 		kmem_cache_free(extent_state_cache, state);
90 
91 	}
92 
93 	while (!list_empty(&buffers)) {
94 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
95 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
96 		       "refs %d\n", (unsigned long long)eb->start,
97 		       eb->len, atomic_read(&eb->refs));
98 		list_del(&eb->leak_list);
99 		kmem_cache_free(extent_buffer_cache, eb);
100 	}
101 	if (extent_state_cache)
102 		kmem_cache_destroy(extent_state_cache);
103 	if (extent_buffer_cache)
104 		kmem_cache_destroy(extent_buffer_cache);
105 }
106 
107 void extent_io_tree_init(struct extent_io_tree *tree,
108 			 struct address_space *mapping)
109 {
110 	tree->state = RB_ROOT;
111 	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
112 	tree->ops = NULL;
113 	tree->dirty_bytes = 0;
114 	spin_lock_init(&tree->lock);
115 	spin_lock_init(&tree->buffer_lock);
116 	tree->mapping = mapping;
117 }
118 
119 static struct extent_state *alloc_extent_state(gfp_t mask)
120 {
121 	struct extent_state *state;
122 #if LEAK_DEBUG
123 	unsigned long flags;
124 #endif
125 
126 	state = kmem_cache_alloc(extent_state_cache, mask);
127 	if (!state)
128 		return state;
129 	state->state = 0;
130 	state->private = 0;
131 	state->tree = NULL;
132 #if LEAK_DEBUG
133 	spin_lock_irqsave(&leak_lock, flags);
134 	list_add(&state->leak_list, &states);
135 	spin_unlock_irqrestore(&leak_lock, flags);
136 #endif
137 	atomic_set(&state->refs, 1);
138 	init_waitqueue_head(&state->wq);
139 	return state;
140 }
141 
142 void free_extent_state(struct extent_state *state)
143 {
144 	if (!state)
145 		return;
146 	if (atomic_dec_and_test(&state->refs)) {
147 #if LEAK_DEBUG
148 		unsigned long flags;
149 #endif
150 		WARN_ON(state->tree);
151 #if LEAK_DEBUG
152 		spin_lock_irqsave(&leak_lock, flags);
153 		list_del(&state->leak_list);
154 		spin_unlock_irqrestore(&leak_lock, flags);
155 #endif
156 		kmem_cache_free(extent_state_cache, state);
157 	}
158 }
159 
160 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
161 				   struct rb_node *node)
162 {
163 	struct rb_node **p = &root->rb_node;
164 	struct rb_node *parent = NULL;
165 	struct tree_entry *entry;
166 
167 	while (*p) {
168 		parent = *p;
169 		entry = rb_entry(parent, struct tree_entry, rb_node);
170 
171 		if (offset < entry->start)
172 			p = &(*p)->rb_left;
173 		else if (offset > entry->end)
174 			p = &(*p)->rb_right;
175 		else
176 			return parent;
177 	}
178 
179 	entry = rb_entry(node, struct tree_entry, rb_node);
180 	rb_link_node(node, parent, p);
181 	rb_insert_color(node, root);
182 	return NULL;
183 }
184 
185 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
186 				     struct rb_node **prev_ret,
187 				     struct rb_node **next_ret)
188 {
189 	struct rb_root *root = &tree->state;
190 	struct rb_node *n = root->rb_node;
191 	struct rb_node *prev = NULL;
192 	struct rb_node *orig_prev = NULL;
193 	struct tree_entry *entry;
194 	struct tree_entry *prev_entry = NULL;
195 
196 	while (n) {
197 		entry = rb_entry(n, struct tree_entry, rb_node);
198 		prev = n;
199 		prev_entry = entry;
200 
201 		if (offset < entry->start)
202 			n = n->rb_left;
203 		else if (offset > entry->end)
204 			n = n->rb_right;
205 		else
206 			return n;
207 	}
208 
209 	if (prev_ret) {
210 		orig_prev = prev;
211 		while (prev && offset > prev_entry->end) {
212 			prev = rb_next(prev);
213 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214 		}
215 		*prev_ret = prev;
216 		prev = orig_prev;
217 	}
218 
219 	if (next_ret) {
220 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		while (prev && offset < prev_entry->start) {
222 			prev = rb_prev(prev);
223 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
224 		}
225 		*next_ret = prev;
226 	}
227 	return NULL;
228 }
229 
230 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
231 					  u64 offset)
232 {
233 	struct rb_node *prev = NULL;
234 	struct rb_node *ret;
235 
236 	ret = __etree_search(tree, offset, &prev, NULL);
237 	if (!ret)
238 		return prev;
239 	return ret;
240 }
241 
242 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
243 		     struct extent_state *other)
244 {
245 	if (tree->ops && tree->ops->merge_extent_hook)
246 		tree->ops->merge_extent_hook(tree->mapping->host, new,
247 					     other);
248 }
249 
250 /*
251  * utility function to look for merge candidates inside a given range.
252  * Any extents with matching state are merged together into a single
253  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
254  * are not merged because the end_io handlers need to be able to do
255  * operations on them without sleeping (or doing allocations/splits).
256  *
257  * This should be called with the tree lock held.
258  */
259 static void merge_state(struct extent_io_tree *tree,
260 		        struct extent_state *state)
261 {
262 	struct extent_state *other;
263 	struct rb_node *other_node;
264 
265 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
266 		return;
267 
268 	other_node = rb_prev(&state->rb_node);
269 	if (other_node) {
270 		other = rb_entry(other_node, struct extent_state, rb_node);
271 		if (other->end == state->start - 1 &&
272 		    other->state == state->state) {
273 			merge_cb(tree, state, other);
274 			state->start = other->start;
275 			other->tree = NULL;
276 			rb_erase(&other->rb_node, &tree->state);
277 			free_extent_state(other);
278 		}
279 	}
280 	other_node = rb_next(&state->rb_node);
281 	if (other_node) {
282 		other = rb_entry(other_node, struct extent_state, rb_node);
283 		if (other->start == state->end + 1 &&
284 		    other->state == state->state) {
285 			merge_cb(tree, state, other);
286 			state->end = other->end;
287 			other->tree = NULL;
288 			rb_erase(&other->rb_node, &tree->state);
289 			free_extent_state(other);
290 		}
291 	}
292 }
293 
294 static void set_state_cb(struct extent_io_tree *tree,
295 			 struct extent_state *state, int *bits)
296 {
297 	if (tree->ops && tree->ops->set_bit_hook)
298 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
299 }
300 
301 static void clear_state_cb(struct extent_io_tree *tree,
302 			   struct extent_state *state, int *bits)
303 {
304 	if (tree->ops && tree->ops->clear_bit_hook)
305 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
306 }
307 
308 static void set_state_bits(struct extent_io_tree *tree,
309 			   struct extent_state *state, int *bits);
310 
311 /*
312  * insert an extent_state struct into the tree.  'bits' are set on the
313  * struct before it is inserted.
314  *
315  * This may return -EEXIST if the extent is already there, in which case the
316  * state struct is freed.
317  *
318  * The tree lock is not taken internally.  This is a utility function and
319  * probably isn't what you want to call (see set/clear_extent_bit).
320  */
321 static int insert_state(struct extent_io_tree *tree,
322 			struct extent_state *state, u64 start, u64 end,
323 			int *bits)
324 {
325 	struct rb_node *node;
326 
327 	if (end < start) {
328 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
329 		       (unsigned long long)end,
330 		       (unsigned long long)start);
331 		WARN_ON(1);
332 	}
333 	state->start = start;
334 	state->end = end;
335 
336 	set_state_bits(tree, state, bits);
337 
338 	node = tree_insert(&tree->state, end, &state->rb_node);
339 	if (node) {
340 		struct extent_state *found;
341 		found = rb_entry(node, struct extent_state, rb_node);
342 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
343 		       "%llu %llu\n", (unsigned long long)found->start,
344 		       (unsigned long long)found->end,
345 		       (unsigned long long)start, (unsigned long long)end);
346 		return -EEXIST;
347 	}
348 	state->tree = tree;
349 	merge_state(tree, state);
350 	return 0;
351 }
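
/*
 * Illustrative sketch, not part of the original file: a caller holding
 * tree->lock that wants a fresh DIRTY state for [0, 4095] would do roughly
 * the following (the helper name is hypothetical).
 */
static int __maybe_unused example_insert_dirty(struct extent_io_tree *tree)
{
	int bits = EXTENT_DIRTY;
	struct extent_state *state = alloc_extent_state(GFP_ATOMIC);

	if (!state)
		return -ENOMEM;
	/* on error, 'state' must be dealt with per the comment above */
	return insert_state(tree, state, 0, 4095, &bits);
}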
352 
353 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
354 		     u64 split)
355 {
356 	if (tree->ops && tree->ops->split_extent_hook)
357 		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
358 }
359 
360 /*
361  * split a given extent state struct in two, inserting the preallocated
362  * struct 'prealloc' as the newly created second half.  'split' indicates an
363  * offset inside 'orig' where it should be split.
364  *
365  *
366  * Before calling, the tree has 'orig' at [orig->start, orig->end].
367  * After calling, there are two extent state structs in the tree:
368  * prealloc: [orig->start, split - 1]
369  * orig: [ split, orig->end ]
370  *
371  * The tree locks are not taken by this function. They need to be held
372  * by the caller.
373  */
374 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
375 		       struct extent_state *prealloc, u64 split)
376 {
377 	struct rb_node *node;
378 
379 	split_cb(tree, orig, split);
380 
381 	prealloc->start = orig->start;
382 	prealloc->end = split - 1;
383 	prealloc->state = orig->state;
384 	orig->start = split;
385 
386 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
387 	if (node) {
388 		free_extent_state(prealloc);
389 		return -EEXIST;
390 	}
391 	prealloc->tree = tree;
392 	return 0;
393 }
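
/*
 * Worked example, not part of the original file: if 'orig' covers
 * [0, 8191], then with tree->lock held and a state preallocated,
 *
 *	split_state(tree, orig, prealloc, 4096);
 *
 * leaves prealloc = [0, 4095] and orig = [4096, 8191], both in the tree.
 */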
394 
395 /*
396  * utility function to clear some bits in an extent state struct.
397  * it will optionally wake up anyone waiting on this state (wake == 1).
398  * callers that want the state removed outright clear every bit instead.
399  *
400  * If no bits are set on the state struct after clearing things, the
401  * struct is freed and removed from the tree
402  */
403 static int clear_state_bit(struct extent_io_tree *tree,
404 			    struct extent_state *state,
405 			    int *bits, int wake)
406 {
407 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
408 	int ret = state->state & bits_to_clear;
409 
410 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
411 		u64 range = state->end - state->start + 1;
412 		WARN_ON(range > tree->dirty_bytes);
413 		tree->dirty_bytes -= range;
414 	}
415 	clear_state_cb(tree, state, bits);
416 	state->state &= ~bits_to_clear;
417 	if (wake)
418 		wake_up(&state->wq);
419 	if (state->state == 0) {
420 		if (state->tree) {
421 			rb_erase(&state->rb_node, &tree->state);
422 			state->tree = NULL;
423 			free_extent_state(state);
424 		} else {
425 			WARN_ON(1);
426 		}
427 	} else {
428 		merge_state(tree, state);
429 	}
430 	return ret;
431 }
432 
433 static struct extent_state *
434 alloc_extent_state_atomic(struct extent_state *prealloc)
435 {
436 	if (!prealloc)
437 		prealloc = alloc_extent_state(GFP_ATOMIC);
438 
439 	return prealloc;
440 }
441 
442 /*
443  * clear some bits on a range in the tree.  This may require splitting
444  * or inserting elements in the tree, so the gfp mask is used to
445  * indicate whether allocations and sleeping are allowed.
446  *
447  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
448  * the given range from the tree regardless of state (ie for truncate).
449  *
450  * the range [start, end] is inclusive.
451  *
452  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
453  * bits were already set, or zero if none of the bits were already set.
454  */
455 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
456 		     int bits, int wake, int delete,
457 		     struct extent_state **cached_state,
458 		     gfp_t mask)
459 {
460 	struct extent_state *state;
461 	struct extent_state *cached;
462 	struct extent_state *prealloc = NULL;
463 	struct rb_node *next_node;
464 	struct rb_node *node;
465 	u64 last_end;
466 	int err;
467 	int set = 0;
468 	int clear = 0;
469 
470 	if (delete)
471 		bits |= ~EXTENT_CTLBITS;
472 	bits |= EXTENT_FIRST_DELALLOC;
473 
474 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
475 		clear = 1;
476 again:
477 	if (!prealloc && (mask & __GFP_WAIT)) {
478 		prealloc = alloc_extent_state(mask);
479 		if (!prealloc)
480 			return -ENOMEM;
481 	}
482 
483 	spin_lock(&tree->lock);
484 	if (cached_state) {
485 		cached = *cached_state;
486 
487 		if (clear) {
488 			*cached_state = NULL;
489 			cached_state = NULL;
490 		}
491 
492 		if (cached && cached->tree && cached->start <= start &&
493 		    cached->end > start) {
494 			if (clear)
495 				atomic_dec(&cached->refs);
496 			state = cached;
497 			goto hit_next;
498 		}
499 		if (clear)
500 			free_extent_state(cached);
501 	}
502 	/*
503 	 * this search will find the extents that end after
504 	 * our range starts
505 	 */
506 	node = tree_search(tree, start);
507 	if (!node)
508 		goto out;
509 	state = rb_entry(node, struct extent_state, rb_node);
510 hit_next:
511 	if (state->start > end)
512 		goto out;
513 	WARN_ON(state->end < start);
514 	last_end = state->end;
515 
516 	/*
517 	 *     | ---- desired range ---- |
518 	 *  | state | or
519 	 *  | ------------- state -------------- |
520 	 *
521 	 * We need to split the extent we found, and may flip
522 	 * bits on second half.
523 	 *
524 	 * If the extent we found extends past our range, we
525 	 * just split and search again.  It'll get split again
526 	 * the next time though.
527 	 *
528 	 * If the extent we found is inside our range, we clear
529 	 * the desired bit on it.
530 	 */
531 
532 	if (state->start < start) {
533 		prealloc = alloc_extent_state_atomic(prealloc);
534 		BUG_ON(!prealloc);
535 		err = split_state(tree, state, prealloc, start);
536 		BUG_ON(err == -EEXIST);
537 		prealloc = NULL;
538 		if (err)
539 			goto out;
540 		if (state->end <= end) {
541 			set |= clear_state_bit(tree, state, &bits, wake);
542 			if (last_end == (u64)-1)
543 				goto out;
544 			start = last_end + 1;
545 		}
546 		goto search_again;
547 	}
548 	/*
549 	 * | ---- desired range ---- |
550 	 *                        | state |
551 	 * We need to split the extent, and clear the bit
552 	 * on the first half
553 	 */
554 	if (state->start <= end && state->end > end) {
555 		prealloc = alloc_extent_state_atomic(prealloc);
556 		BUG_ON(!prealloc);
557 		err = split_state(tree, state, prealloc, end + 1);
558 		BUG_ON(err == -EEXIST);
559 		if (wake)
560 			wake_up(&state->wq);
561 
562 		set |= clear_state_bit(tree, prealloc, &bits, wake);
563 
564 		prealloc = NULL;
565 		goto out;
566 	}
567 
568 	if (state->end < end && prealloc && !need_resched())
569 		next_node = rb_next(&state->rb_node);
570 	else
571 		next_node = NULL;
572 
573 	set |= clear_state_bit(tree, state, &bits, wake);
574 	if (last_end == (u64)-1)
575 		goto out;
576 	start = last_end + 1;
577 	if (start <= end && next_node) {
578 		state = rb_entry(next_node, struct extent_state,
579 				 rb_node);
580 		if (state->start == start)
581 			goto hit_next;
582 	}
583 	goto search_again;
584 
585 out:
586 	spin_unlock(&tree->lock);
587 	if (prealloc)
588 		free_extent_state(prealloc);
589 
590 	return set;
591 
592 search_again:
593 	if (start > end)
594 		goto out;
595 	spin_unlock(&tree->lock);
596 	if (mask & __GFP_WAIT)
597 		cond_resched();
598 	goto again;
599 }
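
/*
 * Illustrative sketch, not part of the original file: clearing the dirty
 * and delalloc bits on one page worth of the tree.  The helper name and
 * the GFP_NOFS choice are this sketch's own assumptions.
 */
static int __maybe_unused example_clear_page_bits(struct extent_io_tree *tree,
						  u64 page_start)
{
	/* returns > 0 if any of the bits were actually set on the range */
	return clear_extent_bit(tree, page_start,
				page_start + PAGE_CACHE_SIZE - 1,
				EXTENT_DIRTY | EXTENT_DELALLOC,
				0, 0, NULL, GFP_NOFS);
}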
600 
601 static int wait_on_state(struct extent_io_tree *tree,
602 			 struct extent_state *state)
603 		__releases(tree->lock)
604 		__acquires(tree->lock)
605 {
606 	DEFINE_WAIT(wait);
607 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
608 	spin_unlock(&tree->lock);
609 	schedule();
610 	spin_lock(&tree->lock);
611 	finish_wait(&state->wq, &wait);
612 	return 0;
613 }
614 
615 /*
616  * waits for one or more bits to clear on a range in the state tree.
617  * The range [start, end] is inclusive.
618  * The tree lock is taken by this function
619  */
620 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
621 {
622 	struct extent_state *state;
623 	struct rb_node *node;
624 
625 	spin_lock(&tree->lock);
626 again:
627 	while (1) {
628 		/*
629 		 * this search will find all the extents that end after
630 		 * our range starts
631 		 */
632 		node = tree_search(tree, start);
633 		if (!node)
634 			break;
635 
636 		state = rb_entry(node, struct extent_state, rb_node);
637 
638 		if (state->start > end)
639 			goto out;
640 
641 		if (state->state & bits) {
642 			start = state->start;
643 			atomic_inc(&state->refs);
644 			wait_on_state(tree, state);
645 			free_extent_state(state);
646 			goto again;
647 		}
648 		start = state->end + 1;
649 
650 		if (start > end)
651 			break;
652 
653 		cond_resched_lock(&tree->lock);
654 	}
655 out:
656 	spin_unlock(&tree->lock);
657 	return 0;
658 }
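
/*
 * Illustrative sketch, not part of the original file: blocking until no
 * state in a byte range carries EXTENT_LOCKED any more.
 */
static void __maybe_unused example_wait_unlocked(struct extent_io_tree *tree,
						 u64 start, u64 end)
{
	wait_extent_bit(tree, start, end, EXTENT_LOCKED);
}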
659 
660 static void set_state_bits(struct extent_io_tree *tree,
661 			   struct extent_state *state,
662 			   int *bits)
663 {
664 	int bits_to_set = *bits & ~EXTENT_CTLBITS;
665 
666 	set_state_cb(tree, state, bits);
667 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
668 		u64 range = state->end - state->start + 1;
669 		tree->dirty_bytes += range;
670 	}
671 	state->state |= bits_to_set;
672 }
673 
674 static void cache_state(struct extent_state *state,
675 			struct extent_state **cached_ptr)
676 {
677 	if (cached_ptr && !(*cached_ptr)) {
678 		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
679 			*cached_ptr = state;
680 			atomic_inc(&state->refs);
681 		}
682 	}
683 }
684 
685 static void uncache_state(struct extent_state **cached_ptr)
686 {
687 	if (cached_ptr && (*cached_ptr)) {
688 		struct extent_state *state = *cached_ptr;
689 		*cached_ptr = NULL;
690 		free_extent_state(state);
691 	}
692 }
693 
694 /*
695  * set some bits on a range in the tree.  This may require allocations or
696  * sleeping, so the gfp mask is used to indicate what is allowed.
697  *
698  * If any of the exclusive bits are set, this will fail with -EEXIST if some
699  * part of the range already has the desired bits set.  The start of the
700  * existing range is returned in failed_start in this case.
701  *
702  * [start, end] is inclusive.  This takes the tree lock.
703  */
704 
705 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
706 		   int bits, int exclusive_bits, u64 *failed_start,
707 		   struct extent_state **cached_state, gfp_t mask)
708 {
709 	struct extent_state *state;
710 	struct extent_state *prealloc = NULL;
711 	struct rb_node *node;
712 	int err = 0;
713 	u64 last_start;
714 	u64 last_end;
715 
716 	bits |= EXTENT_FIRST_DELALLOC;
717 again:
718 	if (!prealloc && (mask & __GFP_WAIT)) {
719 		prealloc = alloc_extent_state(mask);
720 		BUG_ON(!prealloc);
721 	}
722 
723 	spin_lock(&tree->lock);
724 	if (cached_state && *cached_state) {
725 		state = *cached_state;
726 		if (state->start <= start && state->end > start &&
727 		    state->tree) {
728 			node = &state->rb_node;
729 			goto hit_next;
730 		}
731 	}
732 	/*
733 	 * this search will find all the extents that end after
734 	 * our range starts.
735 	 */
736 	node = tree_search(tree, start);
737 	if (!node) {
738 		prealloc = alloc_extent_state_atomic(prealloc);
739 		BUG_ON(!prealloc);
740 		err = insert_state(tree, prealloc, start, end, &bits);
741 		prealloc = NULL;
742 		BUG_ON(err == -EEXIST);
743 		goto out;
744 	}
745 	state = rb_entry(node, struct extent_state, rb_node);
746 hit_next:
747 	last_start = state->start;
748 	last_end = state->end;
749 
750 	/*
751 	 * | ---- desired range ---- |
752 	 * | state |
753 	 *
754 	 * Just lock what we found and keep going
755 	 */
756 	if (state->start == start && state->end <= end) {
757 		struct rb_node *next_node;
758 		if (state->state & exclusive_bits) {
759 			*failed_start = state->start;
760 			err = -EEXIST;
761 			goto out;
762 		}
763 
764 		set_state_bits(tree, state, &bits);
765 
766 		cache_state(state, cached_state);
767 		merge_state(tree, state);
768 		if (last_end == (u64)-1)
769 			goto out;
770 
771 		start = last_end + 1;
772 		next_node = rb_next(&state->rb_node);
773 		if (next_node && start < end && prealloc && !need_resched()) {
774 			state = rb_entry(next_node, struct extent_state,
775 					 rb_node);
776 			if (state->start == start)
777 				goto hit_next;
778 		}
779 		goto search_again;
780 	}
781 
782 	/*
783 	 *     | ---- desired range ---- |
784 	 * | state |
785 	 *   or
786 	 * | ------------- state -------------- |
787 	 *
788 	 * We need to split the extent we found, and may flip bits on
789 	 * second half.
790 	 *
791 	 * If the extent we found extends past our
792 	 * range, we just split and search again.  It'll get split
793 	 * again the next time though.
794 	 *
795 	 * If the extent we found is inside our range, we set the
796 	 * desired bit on it.
797 	 */
798 	if (state->start < start) {
799 		if (state->state & exclusive_bits) {
800 			*failed_start = start;
801 			err = -EEXIST;
802 			goto out;
803 		}
804 
805 		prealloc = alloc_extent_state_atomic(prealloc);
806 		BUG_ON(!prealloc);
807 		err = split_state(tree, state, prealloc, start);
808 		BUG_ON(err == -EEXIST);
809 		prealloc = NULL;
810 		if (err)
811 			goto out;
812 		if (state->end <= end) {
813 			set_state_bits(tree, state, &bits);
814 			cache_state(state, cached_state);
815 			merge_state(tree, state);
816 			if (last_end == (u64)-1)
817 				goto out;
818 			start = last_end + 1;
819 		}
820 		goto search_again;
821 	}
822 	/*
823 	 * | ---- desired range ---- |
824 	 *     | state | or               | state |
825 	 *
826 	 * There's a hole, we need to insert something in it and
827 	 * ignore the extent we found.
828 	 */
829 	if (state->start > start) {
830 		u64 this_end;
831 		if (end < last_start)
832 			this_end = end;
833 		else
834 			this_end = last_start - 1;
835 
836 		prealloc = alloc_extent_state_atomic(prealloc);
837 		BUG_ON(!prealloc);
838 
839 		/*
840 		 * Avoid to free 'prealloc' if it can be merged with
841 		 * the later extent.
842 		 */
843 		err = insert_state(tree, prealloc, start, this_end,
844 				   &bits);
845 		BUG_ON(err == -EEXIST);
846 		if (err) {
847 			free_extent_state(prealloc);
848 			prealloc = NULL;
849 			goto out;
850 		}
851 		cache_state(prealloc, cached_state);
852 		prealloc = NULL;
853 		start = this_end + 1;
854 		goto search_again;
855 	}
856 	/*
857 	 * | ---- desired range ---- |
858 	 *                        | state |
859 	 * We need to split the extent, and set the bit
860 	 * on the first half
861 	 */
862 	if (state->start <= end && state->end > end) {
863 		if (state->state & exclusive_bits) {
864 			*failed_start = start;
865 			err = -EEXIST;
866 			goto out;
867 		}
868 
869 		prealloc = alloc_extent_state_atomic(prealloc);
870 		BUG_ON(!prealloc);
871 		err = split_state(tree, state, prealloc, end + 1);
872 		BUG_ON(err == -EEXIST);
873 
874 		set_state_bits(tree, prealloc, &bits);
875 		cache_state(prealloc, cached_state);
876 		merge_state(tree, prealloc);
877 		prealloc = NULL;
878 		goto out;
879 	}
880 
881 	goto search_again;
882 
883 out:
884 	spin_unlock(&tree->lock);
885 	if (prealloc)
886 		free_extent_state(prealloc);
887 
888 	return err;
889 
890 search_again:
891 	if (start > end)
892 		goto out;
893 	spin_unlock(&tree->lock);
894 	if (mask & __GFP_WAIT)
895 		cond_resched();
896 	goto again;
897 }
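
/*
 * Illustrative sketch, not part of the original file: using the exclusive
 * bits to take EXTENT_LOCKED, the same pattern lock_extent_bits() uses
 * below.  On -EEXIST, failed_start reports where the busy range begins.
 */
static int __maybe_unused example_lock_once(struct extent_io_tree *tree,
					    u64 start, u64 end)
{
	u64 failed_start;

	return set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			      &failed_start, NULL, GFP_NOFS);
}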
898 
899 /**
900  * convert_extent_bit - convert all bits in a given range from one bit to another
901  * @tree:	the io tree to search
902  * @start:	the start offset in bytes
903  * @end:	the end offset in bytes (inclusive)
904  * @bits:	the bits to set in this range
905  * @clear_bits:	the bits to clear in this range
906  * @mask:	the allocation mask
907  *
908  * This will go through and set bits for the given range.  If any states exist
909  * already in this range they are set with the given bit and cleared of the
910  * clear_bits.  This is only meant to be used by things that are mergeable, ie
911  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
912  * boundary bits like LOCK.
913  */
914 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
915 		       int bits, int clear_bits, gfp_t mask)
916 {
917 	struct extent_state *state;
918 	struct extent_state *prealloc = NULL;
919 	struct rb_node *node;
920 	int err = 0;
921 	u64 last_start;
922 	u64 last_end;
923 
924 again:
925 	if (!prealloc && (mask & __GFP_WAIT)) {
926 		prealloc = alloc_extent_state(mask);
927 		if (!prealloc)
928 			return -ENOMEM;
929 	}
930 
931 	spin_lock(&tree->lock);
932 	/*
933 	 * this search will find all the extents that end after
934 	 * our range starts.
935 	 */
936 	node = tree_search(tree, start);
937 	if (!node) {
938 		prealloc = alloc_extent_state_atomic(prealloc);
939 		if (!prealloc) {
940 			err = -ENOMEM;
941 			goto out;
942 		}
943 		err = insert_state(tree, prealloc, start, end, &bits);
944 		prealloc = NULL;
945 		BUG_ON(err == -EEXIST);
946 		goto out;
947 	}
948 	state = rb_entry(node, struct extent_state, rb_node);
949 hit_next:
950 	last_start = state->start;
951 	last_end = state->end;
952 
953 	/*
954 	 * | ---- desired range ---- |
955 	 * | state |
956 	 *
957 	 * Just lock what we found and keep going
958 	 */
959 	if (state->start == start && state->end <= end) {
960 		struct rb_node *next_node;
961 
962 		set_state_bits(tree, state, &bits);
963 		clear_state_bit(tree, state, &clear_bits, 0);
964 
965 		merge_state(tree, state);
966 		if (last_end == (u64)-1)
967 			goto out;
968 
969 		start = last_end + 1;
970 		next_node = rb_next(&state->rb_node);
971 		if (next_node && start < end && prealloc && !need_resched()) {
972 			state = rb_entry(next_node, struct extent_state,
973 					 rb_node);
974 			if (state->start == start)
975 				goto hit_next;
976 		}
977 		goto search_again;
978 	}
979 
980 	/*
981 	 *     | ---- desired range ---- |
982 	 * | state |
983 	 *   or
984 	 * | ------------- state -------------- |
985 	 *
986 	 * We need to split the extent we found, and may flip bits on
987 	 * second half.
988 	 *
989 	 * If the extent we found extends past our
990 	 * range, we just split and search again.  It'll get split
991 	 * again the next time though.
992 	 *
993 	 * If the extent we found is inside our range, we set the
994 	 * desired bit on it.
995 	 */
996 	if (state->start < start) {
997 		prealloc = alloc_extent_state_atomic(prealloc);
998 		if (!prealloc) {
999 			err = -ENOMEM;
1000 			goto out;
1001 		}
1002 		err = split_state(tree, state, prealloc, start);
1003 		BUG_ON(err == -EEXIST);
1004 		prealloc = NULL;
1005 		if (err)
1006 			goto out;
1007 		if (state->end <= end) {
1008 			set_state_bits(tree, state, &bits);
1009 			clear_state_bit(tree, state, &clear_bits, 0);
1010 			merge_state(tree, state);
1011 			if (last_end == (u64)-1)
1012 				goto out;
1013 			start = last_end + 1;
1014 		}
1015 		goto search_again;
1016 	}
1017 	/*
1018 	 * | ---- desired range ---- |
1019 	 *     | state | or               | state |
1020 	 *
1021 	 * There's a hole, we need to insert something in it and
1022 	 * ignore the extent we found.
1023 	 */
1024 	if (state->start > start) {
1025 		u64 this_end;
1026 		if (end < last_start)
1027 			this_end = end;
1028 		else
1029 			this_end = last_start - 1;
1030 
1031 		prealloc = alloc_extent_state_atomic(prealloc);
1032 		if (!prealloc) {
1033 			err = -ENOMEM;
1034 			goto out;
1035 		}
1036 
1037 		/*
1038 		 * Avoid to free 'prealloc' if it can be merged with
1039 		 * the later extent.
1040 		 */
1041 		err = insert_state(tree, prealloc, start, this_end,
1042 				   &bits);
1043 		BUG_ON(err == -EEXIST);
1044 		if (err) {
1045 			free_extent_state(prealloc);
1046 			prealloc = NULL;
1047 			goto out;
1048 		}
1049 		prealloc = NULL;
1050 		start = this_end + 1;
1051 		goto search_again;
1052 	}
1053 	/*
1054 	 * | ---- desired range ---- |
1055 	 *                        | state |
1056 	 * We need to split the extent, and set the bit
1057 	 * on the first half
1058 	 */
1059 	if (state->start <= end && state->end > end) {
1060 		prealloc = alloc_extent_state_atomic(prealloc);
1061 		if (!prealloc) {
1062 			err = -ENOMEM;
1063 			goto out;
1064 		}
1065 
1066 		err = split_state(tree, state, prealloc, end + 1);
1067 		BUG_ON(err == -EEXIST);
1068 
1069 		set_state_bits(tree, prealloc, &bits);
1070 		clear_state_bit(tree, prealloc, &clear_bits, 0);
1071 
1072 		merge_state(tree, prealloc);
1073 		prealloc = NULL;
1074 		goto out;
1075 	}
1076 
1077 	goto search_again;
1078 
1079 out:
1080 	spin_unlock(&tree->lock);
1081 	if (prealloc)
1082 		free_extent_state(prealloc);
1083 
1084 	return err;
1085 
1086 search_again:
1087 	if (start > end)
1088 		goto out;
1089 	spin_unlock(&tree->lock);
1090 	if (mask & __GFP_WAIT)
1091 		cond_resched();
1092 	goto again;
1093 }
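
/*
 * Illustrative sketch, not part of the original file: converting a range
 * from delalloc to dirty in a single pass, per the kernel-doc above.
 */
static int __maybe_unused example_convert(struct extent_io_tree *tree,
					  u64 start, u64 end)
{
	return convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				  EXTENT_DELALLOC, GFP_NOFS);
}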
1094 
1095 /* wrappers around set/clear extent bit */
1096 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1097 		     gfp_t mask)
1098 {
1099 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
1100 			      NULL, mask);
1101 }
1102 
1103 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1104 		    int bits, gfp_t mask)
1105 {
1106 	return set_extent_bit(tree, start, end, bits, 0, NULL,
1107 			      NULL, mask);
1108 }
1109 
1110 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1111 		      int bits, gfp_t mask)
1112 {
1113 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1114 }
1115 
1116 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1117 			struct extent_state **cached_state, gfp_t mask)
1118 {
1119 	return set_extent_bit(tree, start, end,
1120 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1121 			      0, NULL, cached_state, mask);
1122 }
1123 
1124 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1125 		       gfp_t mask)
1126 {
1127 	return clear_extent_bit(tree, start, end,
1128 				EXTENT_DIRTY | EXTENT_DELALLOC |
1129 				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1130 }
1131 
1132 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1133 		     gfp_t mask)
1134 {
1135 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
1136 			      NULL, mask);
1137 }
1138 
1139 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1140 			struct extent_state **cached_state, gfp_t mask)
1141 {
1142 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1143 			      NULL, cached_state, mask);
1144 }
1145 
1146 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
1147 				 u64 end, struct extent_state **cached_state,
1148 				 gfp_t mask)
1149 {
1150 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1151 				cached_state, mask);
1152 }
1153 
1154 /*
1155  * either insert or lock a state struct between start and end.  Use mask to
1156  * tell us whether waiting is desired.
1157  */
1158 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1159 		     int bits, struct extent_state **cached_state, gfp_t mask)
1160 {
1161 	int err;
1162 	u64 failed_start;
1163 	while (1) {
1164 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1165 				     EXTENT_LOCKED, &failed_start,
1166 				     cached_state, mask);
1167 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
1168 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1169 			start = failed_start;
1170 		} else {
1171 			break;
1172 		}
1173 		WARN_ON(start > end);
1174 	}
1175 	return err;
1176 }
1177 
1178 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1179 {
1180 	return lock_extent_bits(tree, start, end, 0, NULL, mask);
1181 }
1182 
1183 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1184 		    gfp_t mask)
1185 {
1186 	int err;
1187 	u64 failed_start;
1188 
1189 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1190 			     &failed_start, NULL, mask);
1191 	if (err == -EEXIST) {
1192 		if (failed_start > start)
1193 			clear_extent_bit(tree, start, failed_start - 1,
1194 					 EXTENT_LOCKED, 1, 0, NULL, mask);
1195 		return 0;
1196 	}
1197 	return 1;
1198 }
1199 
1200 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1201 			 struct extent_state **cached, gfp_t mask)
1202 {
1203 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1204 				mask);
1205 }
1206 
1207 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1208 {
1209 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1210 				mask);
1211 }
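
/*
 * Illustrative sketch, not part of the original file: the usual pairing of
 * lock_extent()/unlock_extent() around work on a byte range.
 */
static void __maybe_unused example_locked_region(struct extent_io_tree *tree,
						 u64 start, u64 end)
{
	lock_extent(tree, start, end, GFP_NOFS);
	/* ... operate on [start, end] while it is locked in the tree ... */
	unlock_extent(tree, start, end, GFP_NOFS);
}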
1212 
1213 /*
1214  * helper function to set both pages and extents in the tree writeback
1215  */
1216 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1217 {
1218 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1219 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1220 	struct page *page;
1221 
1222 	while (index <= end_index) {
1223 		page = find_get_page(tree->mapping, index);
1224 		BUG_ON(!page);
1225 		set_page_writeback(page);
1226 		page_cache_release(page);
1227 		index++;
1228 	}
1229 	return 0;
1230 }
1231 
1232 /* find the first state struct with 'bits' set after 'start', and
1233  * return it.  tree->lock must be held.  NULL will be returned if
1234  * nothing was found after 'start'
1235  */
1236 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1237 						 u64 start, int bits)
1238 {
1239 	struct rb_node *node;
1240 	struct extent_state *state;
1241 
1242 	/*
1243 	 * this search will find all the extents that end after
1244 	 * our range starts.
1245 	 */
1246 	node = tree_search(tree, start);
1247 	if (!node)
1248 		goto out;
1249 
1250 	while (1) {
1251 		state = rb_entry(node, struct extent_state, rb_node);
1252 		if (state->end >= start && (state->state & bits))
1253 			return state;
1254 
1255 		node = rb_next(node);
1256 		if (!node)
1257 			break;
1258 	}
1259 out:
1260 	return NULL;
1261 }
1262 
1263 /*
1264  * find the first offset in the io tree with 'bits' set. zero is
1265  * returned if we find something, and *start_ret and *end_ret are
1266  * set to reflect the state struct that was found.
1267  *
1268  * If nothing was found, 1 is returned, < 0 on error
1269  */
1270 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1271 			  u64 *start_ret, u64 *end_ret, int bits)
1272 {
1273 	struct extent_state *state;
1274 	int ret = 1;
1275 
1276 	spin_lock(&tree->lock);
1277 	state = find_first_extent_bit_state(tree, start, bits);
1278 	if (state) {
1279 		*start_ret = state->start;
1280 		*end_ret = state->end;
1281 		ret = 0;
1282 	}
1283 	spin_unlock(&tree->lock);
1284 	return ret;
1285 }
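
/*
 * Illustrative sketch, not part of the original file: walking every range
 * with EXTENT_DIRTY set, using the "1 means nothing found" return value.
 */
static void __maybe_unused example_walk_dirty(struct extent_io_tree *tree)
{
	u64 start = 0, found_start, found_end;

	while (!find_first_extent_bit(tree, start, &found_start,
				      &found_end, EXTENT_DIRTY)) {
		/* [found_start, found_end] is dirty; resume past it */
		if (found_end == (u64)-1)
			break;
		start = found_end + 1;
	}
}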
1286 
1287 /*
1288  * find a contiguous range of bytes in the file marked as delalloc, not
1289  * more than 'max_bytes'.  start and end are used to return the range.
1290  *
1291  * 1 is returned if we find something, 0 if nothing was in the tree
1292  */
1293 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1294 					u64 *start, u64 *end, u64 max_bytes,
1295 					struct extent_state **cached_state)
1296 {
1297 	struct rb_node *node;
1298 	struct extent_state *state;
1299 	u64 cur_start = *start;
1300 	u64 found = 0;
1301 	u64 total_bytes = 0;
1302 
1303 	spin_lock(&tree->lock);
1304 
1305 	/*
1306 	 * this search will find all the extents that end after
1307 	 * our range starts.
1308 	 */
1309 	node = tree_search(tree, cur_start);
1310 	if (!node) {
1311 		if (!found)
1312 			*end = (u64)-1;
1313 		goto out;
1314 	}
1315 
1316 	while (1) {
1317 		state = rb_entry(node, struct extent_state, rb_node);
1318 		if (found && (state->start != cur_start ||
1319 			      (state->state & EXTENT_BOUNDARY))) {
1320 			goto out;
1321 		}
1322 		if (!(state->state & EXTENT_DELALLOC)) {
1323 			if (!found)
1324 				*end = state->end;
1325 			goto out;
1326 		}
1327 		if (!found) {
1328 			*start = state->start;
1329 			*cached_state = state;
1330 			atomic_inc(&state->refs);
1331 		}
1332 		found++;
1333 		*end = state->end;
1334 		cur_start = state->end + 1;
1335 		node = rb_next(node);
1336 		if (!node)
1337 			break;
1338 		total_bytes += state->end - state->start + 1;
1339 		if (total_bytes >= max_bytes)
1340 			break;
1341 	}
1342 out:
1343 	spin_unlock(&tree->lock);
1344 	return found;
1345 }
1346 
1347 static noinline int __unlock_for_delalloc(struct inode *inode,
1348 					  struct page *locked_page,
1349 					  u64 start, u64 end)
1350 {
1351 	int ret;
1352 	struct page *pages[16];
1353 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1354 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1355 	unsigned long nr_pages = end_index - index + 1;
1356 	int i;
1357 
1358 	if (index == locked_page->index && end_index == index)
1359 		return 0;
1360 
1361 	while (nr_pages > 0) {
1362 		ret = find_get_pages_contig(inode->i_mapping, index,
1363 				     min_t(unsigned long, nr_pages,
1364 				     ARRAY_SIZE(pages)), pages);
1365 		for (i = 0; i < ret; i++) {
1366 			if (pages[i] != locked_page)
1367 				unlock_page(pages[i]);
1368 			page_cache_release(pages[i]);
1369 		}
1370 		nr_pages -= ret;
1371 		index += ret;
1372 		cond_resched();
1373 	}
1374 	return 0;
1375 }
1376 
1377 static noinline int lock_delalloc_pages(struct inode *inode,
1378 					struct page *locked_page,
1379 					u64 delalloc_start,
1380 					u64 delalloc_end)
1381 {
1382 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1383 	unsigned long start_index = index;
1384 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1385 	unsigned long pages_locked = 0;
1386 	struct page *pages[16];
1387 	unsigned long nrpages;
1388 	int ret;
1389 	int i;
1390 
1391 	/* the caller is responsible for locking the start index */
1392 	if (index == locked_page->index && index == end_index)
1393 		return 0;
1394 
1395 	/* skip the page at the start index */
1396 	nrpages = end_index - index + 1;
1397 	while (nrpages > 0) {
1398 		ret = find_get_pages_contig(inode->i_mapping, index,
1399 				     min_t(unsigned long,
1400 				     nrpages, ARRAY_SIZE(pages)), pages);
1401 		if (ret == 0) {
1402 			ret = -EAGAIN;
1403 			goto done;
1404 		}
1405 		/* now we have an array of pages, lock them all */
1406 		for (i = 0; i < ret; i++) {
1407 			/*
1408 			 * the caller is taking responsibility for
1409 			 * locked_page
1410 			 */
1411 			if (pages[i] != locked_page) {
1412 				lock_page(pages[i]);
1413 				if (!PageDirty(pages[i]) ||
1414 				    pages[i]->mapping != inode->i_mapping) {
1415 					ret = -EAGAIN;
1416 					unlock_page(pages[i]);
1417 					page_cache_release(pages[i]);
1418 					goto done;
1419 				}
1420 			}
1421 			page_cache_release(pages[i]);
1422 			pages_locked++;
1423 		}
1424 		nrpages -= ret;
1425 		index += ret;
1426 		cond_resched();
1427 	}
1428 	ret = 0;
1429 done:
1430 	if (ret && pages_locked) {
1431 		__unlock_for_delalloc(inode, locked_page,
1432 			      delalloc_start,
1433 			      ((u64)(start_index + pages_locked - 1)) <<
1434 			      PAGE_CACHE_SHIFT);
1435 	}
1436 	return ret;
1437 }
1438 
1439 /*
1440  * find a contiguous range of bytes in the file marked as delalloc, not
1441  * more than 'max_bytes'.  start and end are used to return the range.
1442  *
1443  * 1 is returned if we find something, 0 if nothing was in the tree
1444  */
1445 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1446 					     struct extent_io_tree *tree,
1447 					     struct page *locked_page,
1448 					     u64 *start, u64 *end,
1449 					     u64 max_bytes)
1450 {
1451 	u64 delalloc_start;
1452 	u64 delalloc_end;
1453 	u64 found;
1454 	struct extent_state *cached_state = NULL;
1455 	int ret;
1456 	int loops = 0;
1457 
1458 again:
1459 	/* step one, find a bunch of delalloc bytes starting at start */
1460 	delalloc_start = *start;
1461 	delalloc_end = 0;
1462 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1463 				    max_bytes, &cached_state);
1464 	if (!found || delalloc_end <= *start) {
1465 		*start = delalloc_start;
1466 		*end = delalloc_end;
1467 		free_extent_state(cached_state);
1468 		return found;
1469 	}
1470 
1471 	/*
1472 	 * start comes from the offset of locked_page.  We have to lock
1473 	 * pages in order, so we can't process delalloc bytes before
1474 	 * locked_page
1475 	 */
1476 	if (delalloc_start < *start)
1477 		delalloc_start = *start;
1478 
1479 	/*
1480 	 * make sure to limit the number of pages we try to lock down
1481 	 * if we're looping.
1482 	 */
1483 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1484 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1485 
1486 	/* step two, lock all the pages after the page that has start */
1487 	ret = lock_delalloc_pages(inode, locked_page,
1488 				  delalloc_start, delalloc_end);
1489 	if (ret == -EAGAIN) {
1490 		/* some of the pages are gone, lets avoid looping by
1491 		 * shortening the size of the delalloc range we're searching
1492 		 */
1493 		free_extent_state(cached_state);
1494 		if (!loops) {
1495 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1496 			max_bytes = PAGE_CACHE_SIZE - offset;
1497 			loops = 1;
1498 			goto again;
1499 		} else {
1500 			found = 0;
1501 			goto out_failed;
1502 		}
1503 	}
1504 	BUG_ON(ret);
1505 
1506 	/* step three, lock the state bits for the whole range */
1507 	lock_extent_bits(tree, delalloc_start, delalloc_end,
1508 			 0, &cached_state, GFP_NOFS);
1509 
1510 	/* then test to make sure it is all still delalloc */
1511 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1512 			     EXTENT_DELALLOC, 1, cached_state);
1513 	if (!ret) {
1514 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1515 				     &cached_state, GFP_NOFS);
1516 		__unlock_for_delalloc(inode, locked_page,
1517 			      delalloc_start, delalloc_end);
1518 		cond_resched();
1519 		goto again;
1520 	}
1521 	free_extent_state(cached_state);
1522 	*start = delalloc_start;
1523 	*end = delalloc_end;
1524 out_failed:
1525 	return found;
1526 }
1527 
1528 int extent_clear_unlock_delalloc(struct inode *inode,
1529 				struct extent_io_tree *tree,
1530 				u64 start, u64 end, struct page *locked_page,
1531 				unsigned long op)
1532 {
1533 	int ret;
1534 	struct page *pages[16];
1535 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1536 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1537 	unsigned long nr_pages = end_index - index + 1;
1538 	int i;
1539 	int clear_bits = 0;
1540 
1541 	if (op & EXTENT_CLEAR_UNLOCK)
1542 		clear_bits |= EXTENT_LOCKED;
1543 	if (op & EXTENT_CLEAR_DIRTY)
1544 		clear_bits |= EXTENT_DIRTY;
1545 
1546 	if (op & EXTENT_CLEAR_DELALLOC)
1547 		clear_bits |= EXTENT_DELALLOC;
1548 
1549 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1550 	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1551 		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1552 		    EXTENT_SET_PRIVATE2)))
1553 		return 0;
1554 
1555 	while (nr_pages > 0) {
1556 		ret = find_get_pages_contig(inode->i_mapping, index,
1557 				     min_t(unsigned long,
1558 				     nr_pages, ARRAY_SIZE(pages)), pages);
1559 		for (i = 0; i < ret; i++) {
1560 
1561 			if (op & EXTENT_SET_PRIVATE2)
1562 				SetPagePrivate2(pages[i]);
1563 
1564 			if (pages[i] == locked_page) {
1565 				page_cache_release(pages[i]);
1566 				continue;
1567 			}
1568 			if (op & EXTENT_CLEAR_DIRTY)
1569 				clear_page_dirty_for_io(pages[i]);
1570 			if (op & EXTENT_SET_WRITEBACK)
1571 				set_page_writeback(pages[i]);
1572 			if (op & EXTENT_END_WRITEBACK)
1573 				end_page_writeback(pages[i]);
1574 			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1575 				unlock_page(pages[i]);
1576 			page_cache_release(pages[i]);
1577 		}
1578 		nr_pages -= ret;
1579 		index += ret;
1580 		cond_resched();
1581 	}
1582 	return 0;
1583 }
1584 
1585 /*
1586  * count the number of bytes in the tree that have a given bit(s)
1587  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1588  * cached.  The total number found is returned.
1589  */
1590 u64 count_range_bits(struct extent_io_tree *tree,
1591 		     u64 *start, u64 search_end, u64 max_bytes,
1592 		     unsigned long bits, int contig)
1593 {
1594 	struct rb_node *node;
1595 	struct extent_state *state;
1596 	u64 cur_start = *start;
1597 	u64 total_bytes = 0;
1598 	u64 last = 0;
1599 	int found = 0;
1600 
1601 	if (search_end <= cur_start) {
1602 		WARN_ON(1);
1603 		return 0;
1604 	}
1605 
1606 	spin_lock(&tree->lock);
1607 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1608 		total_bytes = tree->dirty_bytes;
1609 		goto out;
1610 	}
1611 	/*
1612 	 * this search will find all the extents that end after
1613 	 * our range starts.
1614 	 */
1615 	node = tree_search(tree, cur_start);
1616 	if (!node)
1617 		goto out;
1618 
1619 	while (1) {
1620 		state = rb_entry(node, struct extent_state, rb_node);
1621 		if (state->start > search_end)
1622 			break;
1623 		if (contig && found && state->start > last + 1)
1624 			break;
1625 		if (state->end >= cur_start && (state->state & bits) == bits) {
1626 			total_bytes += min(search_end, state->end) + 1 -
1627 				       max(cur_start, state->start);
1628 			if (total_bytes >= max_bytes)
1629 				break;
1630 			if (!found) {
1631 				*start = max(cur_start, state->start);
1632 				found = 1;
1633 			}
1634 			last = state->end;
1635 		} else if (contig && found) {
1636 			break;
1637 		}
1638 		node = rb_next(node);
1639 		if (!node)
1640 			break;
1641 	}
1642 out:
1643 	spin_unlock(&tree->lock);
1644 	return total_bytes;
1645 }
1646 
1647 /*
1648  * set the private field for a given byte offset in the tree.  If there isn't
1649  * an extent_state starting at that offset already, this returns -ENOENT.
1650  */
1651 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1652 {
1653 	struct rb_node *node;
1654 	struct extent_state *state;
1655 	int ret = 0;
1656 
1657 	spin_lock(&tree->lock);
1658 	/*
1659 	 * this search will find all the extents that end after
1660 	 * our range starts.
1661 	 */
1662 	node = tree_search(tree, start);
1663 	if (!node) {
1664 		ret = -ENOENT;
1665 		goto out;
1666 	}
1667 	state = rb_entry(node, struct extent_state, rb_node);
1668 	if (state->start != start) {
1669 		ret = -ENOENT;
1670 		goto out;
1671 	}
1672 	state->private = private;
1673 out:
1674 	spin_unlock(&tree->lock);
1675 	return ret;
1676 }
1677 
1678 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1679 {
1680 	struct rb_node *node;
1681 	struct extent_state *state;
1682 	int ret = 0;
1683 
1684 	spin_lock(&tree->lock);
1685 	/*
1686 	 * this search will find all the extents that end after
1687 	 * our range starts.
1688 	 */
1689 	node = tree_search(tree, start);
1690 	if (!node) {
1691 		ret = -ENOENT;
1692 		goto out;
1693 	}
1694 	state = rb_entry(node, struct extent_state, rb_node);
1695 	if (state->start != start) {
1696 		ret = -ENOENT;
1697 		goto out;
1698 	}
1699 	*private = state->private;
1700 out:
1701 	spin_unlock(&tree->lock);
1702 	return ret;
1703 }
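
/*
 * Illustrative sketch, not part of the original file: the private field is
 * an opaque u64, and later code in this file stores a pointer to an
 * io_failure_record in it by casting through unsigned long.
 */
static int __maybe_unused example_stash_pointer(struct extent_io_tree *tree,
						u64 start, void *ptr)
{
	/* fails with -ENOENT unless a state begins exactly at 'start' */
	return set_state_private(tree, start, (u64)(unsigned long)ptr);
}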
1704 
1705 /*
1706  * searches a range in the state tree for a given mask.
1707  * If 'filled' == 1, this returns 1 only if every extent in the range
1708  * has the bits set.  Otherwise, 1 is returned if any bit in the
1709  * range is found set.
1710  */
1711 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1712 		   int bits, int filled, struct extent_state *cached)
1713 {
1714 	struct extent_state *state = NULL;
1715 	struct rb_node *node;
1716 	int bitset = 0;
1717 
1718 	spin_lock(&tree->lock);
1719 	if (cached && cached->tree && cached->start <= start &&
1720 	    cached->end > start)
1721 		node = &cached->rb_node;
1722 	else
1723 		node = tree_search(tree, start);
1724 	while (node && start <= end) {
1725 		state = rb_entry(node, struct extent_state, rb_node);
1726 
1727 		if (filled && state->start > start) {
1728 			bitset = 0;
1729 			break;
1730 		}
1731 
1732 		if (state->start > end)
1733 			break;
1734 
1735 		if (state->state & bits) {
1736 			bitset = 1;
1737 			if (!filled)
1738 				break;
1739 		} else if (filled) {
1740 			bitset = 0;
1741 			break;
1742 		}
1743 
1744 		if (state->end == (u64)-1)
1745 			break;
1746 
1747 		start = state->end + 1;
1748 		if (start > end)
1749 			break;
1750 		node = rb_next(node);
1751 		if (!node) {
1752 			if (filled)
1753 				bitset = 0;
1754 			break;
1755 		}
1756 	}
1757 	spin_unlock(&tree->lock);
1758 	return bitset;
1759 }
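
/*
 * Illustrative sketch, not part of the original file: the two meanings of
 * 'filled', mirroring check_page_uptodate() and check_page_locked() below.
 */
static int __maybe_unused example_range_checks(struct extent_io_tree *tree,
					       u64 start, u64 end)
{
	/* 1 only if *every* byte in [start, end] is uptodate */
	int all_uptodate = test_range_bit(tree, start, end,
					  EXTENT_UPTODATE, 1, NULL);
	/* 1 if *any* part of [start, end] is still locked */
	int any_locked = test_range_bit(tree, start, end,
					EXTENT_LOCKED, 0, NULL);

	return all_uptodate && !any_locked;
}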
1760 
1761 /*
1762  * helper function to set a given page up to date if all the
1763  * extents in the tree for that page are up to date
1764  */
1765 static int check_page_uptodate(struct extent_io_tree *tree,
1766 			       struct page *page)
1767 {
1768 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1769 	u64 end = start + PAGE_CACHE_SIZE - 1;
1770 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1771 		SetPageUptodate(page);
1772 	return 0;
1773 }
1774 
1775 /*
1776  * helper function to unlock a page if all the extents in the tree
1777  * for that page are unlocked
1778  */
1779 static int check_page_locked(struct extent_io_tree *tree,
1780 			     struct page *page)
1781 {
1782 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1783 	u64 end = start + PAGE_CACHE_SIZE - 1;
1784 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1785 		unlock_page(page);
1786 	return 0;
1787 }
1788 
1789 /*
1790  * helper function to end page writeback if all the extents
1791  * in the tree for that page are done with writeback
1792  */
1793 static int check_page_writeback(struct extent_io_tree *tree,
1794 			     struct page *page)
1795 {
1796 	end_page_writeback(page);
1797 	return 0;
1798 }
1799 
1800 /*
1801  * When IO fails, either with EIO or csum verification fails, we
1802  * try other mirrors that might have a good copy of the data.  This
1803  * io_failure_record is used to record state as we go through all the
1804  * mirrors.  If another mirror has good data, the page is set up to date
1805  * and things continue.  If a good mirror can't be found, the original
1806  * bio end_io callback is called to indicate things have failed.
1807  */
1808 struct io_failure_record {
1809 	struct page *page;
1810 	u64 start;
1811 	u64 len;
1812 	u64 logical;
1813 	unsigned long bio_flags;
1814 	int this_mirror;
1815 	int failed_mirror;
1816 	int in_validation;
1817 };
1818 
1819 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1820 				int did_repair)
1821 {
1822 	int ret;
1823 	int err = 0;
1824 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1825 
1826 	set_state_private(failure_tree, rec->start, 0);
1827 	ret = clear_extent_bits(failure_tree, rec->start,
1828 				rec->start + rec->len - 1,
1829 				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1830 	if (ret)
1831 		err = ret;
1832 
1833 	if (did_repair) {
1834 		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1835 					rec->start + rec->len - 1,
1836 					EXTENT_DAMAGED, GFP_NOFS);
1837 		if (ret && !err)
1838 			err = ret;
1839 	}
1840 
1841 	kfree(rec);
1842 	return err;
1843 }
1844 
1845 static void repair_io_failure_callback(struct bio *bio, int err)
1846 {
1847 	complete(bio->bi_private);
1848 }
1849 
1850 /*
1851  * this bypasses the standard btrfs submit functions deliberately, as
1852  * the standard behavior is to write all copies in a raid setup. here we only
1853  * want to write the one bad copy. so we do the mapping for ourselves and issue
1854  * submit_bio directly.
1855  * to avoid any synchronization issues, wait for the data after writing, which
1856  * actually prevents the read that triggered the error from finishing.
1857  * currently, there can be no more than two copies of every data bit. thus,
1858  * exactly one rewrite is required.
1859  */
1860 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1861 			u64 length, u64 logical, struct page *page,
1862 			int mirror_num)
1863 {
1864 	struct bio *bio;
1865 	struct btrfs_device *dev;
1866 	DECLARE_COMPLETION_ONSTACK(compl);
1867 	u64 map_length = 0;
1868 	u64 sector;
1869 	struct btrfs_bio *bbio = NULL;
1870 	int ret;
1871 
1872 	BUG_ON(!mirror_num);
1873 
1874 	bio = bio_alloc(GFP_NOFS, 1);
1875 	if (!bio)
1876 		return -EIO;
1877 	bio->bi_private = &compl;
1878 	bio->bi_end_io = repair_io_failure_callback;
1879 	bio->bi_size = 0;
1880 	map_length = length;
1881 
1882 	ret = btrfs_map_block(map_tree, WRITE, logical,
1883 			      &map_length, &bbio, mirror_num);
1884 	if (ret) {
1885 		bio_put(bio);
1886 		return -EIO;
1887 	}
1888 	BUG_ON(mirror_num != bbio->mirror_num);
1889 	sector = bbio->stripes[mirror_num-1].physical >> 9;
1890 	bio->bi_sector = sector;
1891 	dev = bbio->stripes[mirror_num-1].dev;
1892 	kfree(bbio);
1893 	if (!dev || !dev->bdev || !dev->writeable) {
1894 		bio_put(bio);
1895 		return -EIO;
1896 	}
1897 	bio->bi_bdev = dev->bdev;
1898 	bio_add_page(bio, page, length, start-page_offset(page));
1899 	btrfsic_submit_bio(WRITE_SYNC, bio);
1900 	wait_for_completion(&compl);
1901 
1902 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1903 		/* try to remap that extent elsewhere? */
1904 		bio_put(bio);
1905 		return -EIO;
1906 	}
1907 
1908 	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1909 			"sector %llu)\n", page->mapping->host->i_ino, start,
1910 			dev->name, sector);
1911 
1912 	bio_put(bio);
1913 	return 0;
1914 }
1915 
1916 /*
1917  * each time an IO finishes, we do a fast check in the IO failure tree
1918  * to see if we need to process or clean up an io_failure_record
1919  */
1920 static int clean_io_failure(u64 start, struct page *page)
1921 {
1922 	u64 private;
1923 	u64 private_failure;
1924 	struct io_failure_record *failrec;
1925 	struct btrfs_mapping_tree *map_tree;
1926 	struct extent_state *state;
1927 	int num_copies;
1928 	int did_repair = 0;
1929 	int ret;
1930 	struct inode *inode = page->mapping->host;
1931 
1932 	private = 0;
1933 	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1934 				(u64)-1, 1, EXTENT_DIRTY, 0);
1935 	if (!ret)
1936 		return 0;
1937 
1938 	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1939 				&private_failure);
1940 	if (ret)
1941 		return 0;
1942 
1943 	failrec = (struct io_failure_record *)(unsigned long) private_failure;
1944 	BUG_ON(!failrec->this_mirror);
1945 
1946 	if (failrec->in_validation) {
1947 		/* there was no real error, just free the record */
1948 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1949 			 failrec->start);
1950 		did_repair = 1;
1951 		goto out;
1952 	}
1953 
1954 	spin_lock(&BTRFS_I(inode)->io_tree.lock);
1955 	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1956 					    failrec->start,
1957 					    EXTENT_LOCKED);
1958 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1959 
1960 	if (state && state->start == failrec->start) {
1961 		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1962 		num_copies = btrfs_num_copies(map_tree, failrec->logical,
1963 						failrec->len);
1964 		if (num_copies > 1)  {
1965 			ret = repair_io_failure(map_tree, start, failrec->len,
1966 						failrec->logical, page,
1967 						failrec->failed_mirror);
1968 			did_repair = !ret;
1969 		}
1970 	}
1971 
1972 out:
1973 	if (!ret)
1974 		ret = free_io_failure(inode, failrec, did_repair);
1975 
1976 	return ret;
1977 }
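
/*
 * Illustrative sketch (not part of the original file): io_failure_record
 * pointers are stashed in the failure tree as u64 "private" values, the
 * same trick clean_io_failure() above and bio_readpage_error() below
 * rely on.
 */
static __maybe_unused struct io_failure_record *example_lookup_failrec(
		struct inode *inode, u64 start)
{
	u64 private;

	/* get_state_private returns 0 when a record exists at 'start' */
	if (get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
			      &private))
		return NULL;

	return (struct io_failure_record *)(unsigned long)private;
}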
1978 
1979 /*
1980  * this is a generic handler for readpage errors (default
1981  * readpage_io_failed_hook). if other copies exist, read those and write back
1982  * good data to the failed position. it does not try to remap the failed
1983  * extent elsewhere, hoping the device will be smart enough to do this as
1984  * needed
1985  */
1986 
1987 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1988 				u64 start, u64 end, int failed_mirror,
1989 				struct extent_state *state)
1990 {
1991 	struct io_failure_record *failrec = NULL;
1992 	u64 private;
1993 	struct extent_map *em;
1994 	struct inode *inode = page->mapping->host;
1995 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1996 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1997 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1998 	struct bio *bio;
1999 	int num_copies;
2000 	int ret;
2001 	int read_mode;
2002 	u64 logical;
2003 
2004 	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2005 
2006 	ret = get_state_private(failure_tree, start, &private);
2007 	if (ret) {
2008 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2009 		if (!failrec)
2010 			return -ENOMEM;
2011 		failrec->start = start;
2012 		failrec->len = end - start + 1;
2013 		failrec->this_mirror = 0;
2014 		failrec->bio_flags = 0;
2015 		failrec->in_validation = 0;
2016 
2017 		read_lock(&em_tree->lock);
2018 		em = lookup_extent_mapping(em_tree, start, failrec->len);
2019 		if (!em) {
2020 			read_unlock(&em_tree->lock);
2021 			kfree(failrec);
2022 			return -EIO;
2023 		}
2024 
2025 		if (em->start > start || em->start + em->len < start) {
2026 			free_extent_map(em);
2027 			em = NULL;
2028 		}
2029 		read_unlock(&em_tree->lock);
2030 
2031 		if (!em || IS_ERR(em)) {
2032 			kfree(failrec);
2033 			return -EIO;
2034 		}
2035 		logical = start - em->start;
2036 		logical = em->block_start + logical;
2037 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2038 			logical = em->block_start;
2039 			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2040 			extent_set_compress_type(&failrec->bio_flags,
2041 						 em->compress_type);
2042 		}
2043 		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2044 			 "len=%llu\n", logical, start, failrec->len);
2045 		failrec->logical = logical;
2046 		free_extent_map(em);
2047 
2048 		/* set the bits in the private failure tree */
2049 		ret = set_extent_bits(failure_tree, start, end,
2050 					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2051 		if (ret >= 0)
2052 			ret = set_state_private(failure_tree, start,
2053 						(u64)(unsigned long)failrec);
2054 		/* set the bits in the inode's tree */
2055 		if (ret >= 0)
2056 			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2057 						GFP_NOFS);
2058 		if (ret < 0) {
2059 			kfree(failrec);
2060 			return ret;
2061 		}
2062 	} else {
2063 		failrec = (struct io_failure_record *)(unsigned long)private;
2064 		pr_debug("bio_readpage_error: (found) logical=%llu, "
2065 			 "start=%llu, len=%llu, validation=%d\n",
2066 			 failrec->logical, failrec->start, failrec->len,
2067 			 failrec->in_validation);
2068 		/*
2069 		 * when data can exist on disk in more than two copies, add to failrec here
2070 		 * (e.g. with a list for failed_mirror) to make
2071 		 * clean_io_failure() clean all those errors at once.
2072 		 */
2073 	}
2074 	num_copies = btrfs_num_copies(
2075 			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
2076 			      failrec->logical, failrec->len);
2077 	if (num_copies == 1) {
2078 		/*
2079 		 * we only have a single copy of the data, so don't bother with
2080 		 * all the retry and error correction code that follows. no
2081 		 * matter what the error is, it is very likely to persist.
2082 		 */
2083 		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2084 			 "state=%p, num_copies=%d, next_mirror %d, "
2085 			 "failed_mirror %d\n", state, num_copies,
2086 			 failrec->this_mirror, failed_mirror);
2087 		free_io_failure(inode, failrec, 0);
2088 		return -EIO;
2089 	}
2090 
2091 	if (!state) {
2092 		spin_lock(&tree->lock);
2093 		state = find_first_extent_bit_state(tree, failrec->start,
2094 						    EXTENT_LOCKED);
2095 		if (state && state->start != failrec->start)
2096 			state = NULL;
2097 		spin_unlock(&tree->lock);
2098 	}
2099 
2100 	/*
2101 	 * there are two premises:
2102 	 *	a) deliver good data to the caller
2103 	 *	b) correct the bad sectors on disk
2104 	 */
2105 	if (failed_bio->bi_vcnt > 1) {
2106 		/*
2107 		 * to fulfill b), we need to know the exact failing sectors, as
2108 		 * we don't want to rewrite any more than the failed ones. thus,
2109 		 * we need separate read requests for the failed bio
2110 		 *
2111 		 * if the following BUG_ON triggers, our validation request got
2112 		 * merged. we need separate requests for our algorithm to work.
2113 		 */
2114 		BUG_ON(failrec->in_validation);
2115 		failrec->in_validation = 1;
2116 		failrec->this_mirror = failed_mirror;
2117 		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2118 	} else {
2119 		/*
2120 		 * we're ready to fulfill a) and b) at the same time. get a
2121 		 * good copy of the failed sector, and if we succeed, we have
2122 		 * set up everything for repair_io_failure to do the rest for us.
2123 		 */
2124 		if (failrec->in_validation) {
2125 			BUG_ON(failrec->this_mirror != failed_mirror);
2126 			failrec->in_validation = 0;
2127 			failrec->this_mirror = 0;
2128 		}
2129 		failrec->failed_mirror = failed_mirror;
2130 		failrec->this_mirror++;
2131 		if (failrec->this_mirror == failed_mirror)
2132 			failrec->this_mirror++;
2133 		read_mode = READ_SYNC;
2134 	}
2135 
2136 	if (!state || failrec->this_mirror > num_copies) {
2137 		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2138 			 "next_mirror %d, failed_mirror %d\n", state,
2139 			 num_copies, failrec->this_mirror, failed_mirror);
2140 		free_io_failure(inode, failrec, 0);
2141 		return -EIO;
2142 	}
2143 
2144 	bio = bio_alloc(GFP_NOFS, 1);
2145 	bio->bi_private = state;
2146 	bio->bi_end_io = failed_bio->bi_end_io;
2147 	bio->bi_sector = failrec->logical >> 9;
2148 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2149 	bio->bi_size = 0;
2150 
2151 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2152 
2153 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2154 		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2155 		 failrec->this_mirror, num_copies, failrec->in_validation);
2156 
2157 	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2158 					failrec->bio_flags, 0);
2159 	return 0;
2160 }
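
/*
 * Illustrative sketch (not part of the original file): the mirror
 * rotation performed by bio_readpage_error() above, as a standalone
 * helper.  Advance past the mirror we just tried, skip the one that
 * originally failed, and return 0 once every copy has been exhausted.
 */
static int __maybe_unused example_next_mirror(int this_mirror,
					      int failed_mirror,
					      int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	if (this_mirror > num_copies)
		return 0;	/* nothing left to try */

	return this_mirror;
}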
2161 
2162 /* lots and lots of room for performance fixes in the end_bio funcs */
2163 
2164 /*
2165  * after a writepage IO is done, we need to:
2166  * clear the uptodate bits on error
2167  * clear the writeback bits in the extent tree for this IO
2168  * end_page_writeback if the page has no more pending IO
2169  *
2170  * Scheduling is not allowed, so the extent state tree is expected
2171  * to have one and only one object corresponding to this IO.
2172  */
2173 static void end_bio_extent_writepage(struct bio *bio, int err)
2174 {
2175 	int uptodate = err == 0;
2176 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2177 	struct extent_io_tree *tree;
2178 	u64 start;
2179 	u64 end;
2180 	int whole_page;
2181 	int ret;
2182 
2183 	do {
2184 		struct page *page = bvec->bv_page;
2185 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2186 
2187 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2188 			 bvec->bv_offset;
2189 		end = start + bvec->bv_len - 1;
2190 
2191 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2192 			whole_page = 1;
2193 		else
2194 			whole_page = 0;
2195 
2196 		if (--bvec >= bio->bi_io_vec)
2197 			prefetchw(&bvec->bv_page->flags);
2198 		if (tree->ops && tree->ops->writepage_end_io_hook) {
2199 			ret = tree->ops->writepage_end_io_hook(page, start,
2200 						       end, NULL, uptodate);
2201 			if (ret)
2202 				uptodate = 0;
2203 		}
2204 
2205 		if (!uptodate && tree->ops &&
2206 		    tree->ops->writepage_io_failed_hook) {
2207 			ret = tree->ops->writepage_io_failed_hook(bio, page,
2208 							 start, end, NULL);
2209 			if (ret == 0) {
2210 				uptodate = (err == 0);
2211 				continue;
2212 			}
2213 		}
2214 
2215 		if (!uptodate) {
2216 			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2217 			ClearPageUptodate(page);
2218 			SetPageError(page);
2219 		}
2220 
2221 		if (whole_page)
2222 			end_page_writeback(page);
2223 		else
2224 			check_page_writeback(tree, page);
2225 	} while (bvec >= bio->bi_io_vec);
2226 
2227 	bio_put(bio);
2228 }
2229 
2230 /*
2231  * after a readpage IO is done, we need to:
2232  * clear the uptodate bits on error
2233  * set the uptodate bits if things worked
2234  * set the page up to date if all extents in the tree are uptodate
2235  * clear the lock bit in the extent tree
2236  * unlock the page if there are no other extents locked for it
2237  *
2238  * Scheduling is not allowed, so the extent state tree is expected
2239  * to have one and only one object corresponding to this IO.
2240  */
2241 static void end_bio_extent_readpage(struct bio *bio, int err)
2242 {
2243 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2244 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2245 	struct bio_vec *bvec = bio->bi_io_vec;
2246 	struct extent_io_tree *tree;
2247 	u64 start;
2248 	u64 end;
2249 	int whole_page;
2250 	int ret;
2251 
2252 	if (err)
2253 		uptodate = 0;
2254 
2255 	do {
2256 		struct page *page = bvec->bv_page;
2257 		struct extent_state *cached = NULL;
2258 		struct extent_state *state;
2259 
2260 		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2261 			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2262 			 (long int)bio->bi_bdev);
2263 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2264 
2265 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2266 			bvec->bv_offset;
2267 		end = start + bvec->bv_len - 1;
2268 
2269 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2270 			whole_page = 1;
2271 		else
2272 			whole_page = 0;
2273 
2274 		if (++bvec <= bvec_end)
2275 			prefetchw(&bvec->bv_page->flags);
2276 
2277 		spin_lock(&tree->lock);
2278 		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2279 		if (state && state->start == start) {
2280 			/*
2281 			 * take a reference on the state, unlock will drop
2282 			 * the ref
2283 			 */
2284 			cache_state(state, &cached);
2285 		}
2286 		spin_unlock(&tree->lock);
2287 
2288 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2289 			ret = tree->ops->readpage_end_io_hook(page, start, end,
2290 							      state);
2291 			if (ret)
2292 				uptodate = 0;
2293 			else
2294 				clean_io_failure(start, page);
2295 		}
2296 		if (!uptodate) {
2297 			int failed_mirror;
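			/*
			 * the btrfs bio submission path stores the mirror
			 * number that serviced this read in bi_bdev before
			 * completion runs (see the pr_debug above); recover
			 * it here
			 */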
2298 			failed_mirror = (int)(unsigned long)bio->bi_bdev;
2299 			/*
2300 			 * The generic bio_readpage_error handles errors the
2301 			 * following way: If possible, new read requests are
2302 			 * created and submitted and will end up in
2303 			 * end_bio_extent_readpage as well (if we're lucky, not
2304 			 * in the !uptodate case). In that case it returns 0 and
2305 			 * we just go on with the next page in our bio. If it
2306 			 * can't handle the error it will return -EIO and we
2307 			 * remain responsible for that page.
2308 			 */
2309 			ret = bio_readpage_error(bio, page, start, end,
2310 							failed_mirror, NULL);
2311 			if (ret == 0) {
2312 error_handled:
2313 				uptodate =
2314 					test_bit(BIO_UPTODATE, &bio->bi_flags);
2315 				if (err)
2316 					uptodate = 0;
2317 				uncache_state(&cached);
2318 				continue;
2319 			}
2320 			if (tree->ops && tree->ops->readpage_io_failed_hook) {
2321 				ret = tree->ops->readpage_io_failed_hook(
2322 							bio, page, start, end,
2323 							failed_mirror, state);
2324 				if (ret == 0)
2325 					goto error_handled;
2326 			}
2327 		}
2328 
2329 		if (uptodate) {
2330 			set_extent_uptodate(tree, start, end, &cached,
2331 					    GFP_ATOMIC);
2332 		}
2333 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2334 
2335 		if (whole_page) {
2336 			if (uptodate) {
2337 				SetPageUptodate(page);
2338 			} else {
2339 				ClearPageUptodate(page);
2340 				SetPageError(page);
2341 			}
2342 			unlock_page(page);
2343 		} else {
2344 			if (uptodate) {
2345 				check_page_uptodate(tree, page);
2346 			} else {
2347 				ClearPageUptodate(page);
2348 				SetPageError(page);
2349 			}
2350 			check_page_locked(tree, page);
2351 		}
2352 	} while (bvec <= bvec_end);
2353 
2354 	bio_put(bio);
2355 }
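
/*
 * Illustrative sketch (not part of the original file): the per-bio_vec
 * arithmetic that both end_io handlers above repeat.  A bvec covering a
 * whole page lets the handler unlock the page or end writeback directly
 * instead of checking the extent tree for other pending ranges.
 */
static void __maybe_unused example_bvec_range(struct bio_vec *bvec,
					      u64 *start, u64 *end,
					      int *whole_page)
{
	*start = ((u64)bvec->bv_page->index << PAGE_CACHE_SHIFT) +
		 bvec->bv_offset;
	*end = *start + bvec->bv_len - 1;
	*whole_page = (bvec->bv_offset == 0 &&
		       bvec->bv_len == PAGE_CACHE_SIZE);
}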
2356 
2357 struct bio *
2358 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2359 		gfp_t gfp_flags)
2360 {
2361 	struct bio *bio;
2362 
2363 	bio = bio_alloc(gfp_flags, nr_vecs);
2364 
2365 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2366 		while (!bio && (nr_vecs /= 2))
2367 			bio = bio_alloc(gfp_flags, nr_vecs);
2368 	}
2369 
2370 	if (bio) {
2371 		bio->bi_size = 0;
2372 		bio->bi_bdev = bdev;
2373 		bio->bi_sector = first_sector;
2374 	}
2375 	return bio;
2376 }
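
/*
 * Illustrative usage sketch (not part of the original file): building a
 * single-page read bio with btrfs_bio_alloc().  nr_vecs only caps how
 * many pages may later be added with bio_add_page(); under memory
 * pressure (PF_MEMALLOC) the helper above silently retries with fewer
 * vecs.
 */
static __maybe_unused struct bio *example_alloc_one_page_bio(
		struct block_device *bdev, sector_t sector, struct page *page)
{
	struct bio *bio = btrfs_bio_alloc(bdev, sector, 1, GFP_NOFS);

	if (!bio)
		return NULL;
	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
		bio_put(bio);
		return NULL;
	}
	return bio;
}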
2377 
2378 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2379 			  unsigned long bio_flags)
2380 {
2381 	int ret = 0;
2382 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2383 	struct page *page = bvec->bv_page;
2384 	struct extent_io_tree *tree = bio->bi_private;
2385 	u64 start;
2386 
2387 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2388 
2389 	bio->bi_private = NULL;
2390 
2391 	bio_get(bio);
2392 
2393 	if (tree->ops && tree->ops->submit_bio_hook)
2394 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2395 					   mirror_num, bio_flags, start);
2396 	else
2397 		btrfsic_submit_bio(rw, bio);
2398 
2399 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2400 		ret = -EOPNOTSUPP;
2401 	bio_put(bio);
2402 	return ret;
2403 }
2404 
2405 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2406 			      struct page *page, sector_t sector,
2407 			      size_t size, unsigned long offset,
2408 			      struct block_device *bdev,
2409 			      struct bio **bio_ret,
2410 			      unsigned long max_pages,
2411 			      bio_end_io_t end_io_func,
2412 			      int mirror_num,
2413 			      unsigned long prev_bio_flags,
2414 			      unsigned long bio_flags)
2415 {
2416 	int ret = 0;
2417 	struct bio *bio;
2418 	int nr;
2419 	int contig = 0;
2420 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2421 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2422 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2423 
2424 	if (bio_ret && *bio_ret) {
2425 		bio = *bio_ret;
2426 		if (old_compressed)
2427 			contig = bio->bi_sector == sector;
2428 		else
2429 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
2430 				sector;
2431 
2432 		if (prev_bio_flags != bio_flags || !contig ||
2433 		    (tree->ops && tree->ops->merge_bio_hook &&
2434 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
2435 					       bio_flags)) ||
2436 		    bio_add_page(bio, page, page_size, offset) < page_size) {
2437 			ret = submit_one_bio(rw, bio, mirror_num,
2438 					     prev_bio_flags);
2439 			bio = NULL;
2440 		} else {
2441 			return 0;
2442 		}
2443 	}
2444 	if (this_compressed)
2445 		nr = BIO_MAX_PAGES;
2446 	else
2447 		nr = bio_get_nr_vecs(bdev);
2448 
2449 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2450 	if (!bio)
2451 		return -ENOMEM;
2452 
2453 	bio_add_page(bio, page, page_size, offset);
2454 	bio->bi_end_io = end_io_func;
2455 	bio->bi_private = tree;
2456 
2457 	if (bio_ret)
2458 		*bio_ret = bio;
2459 	else
2460 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2461 
2462 	return ret;
2463 }
2464 
2465 void set_page_extent_mapped(struct page *page)
2466 {
2467 	if (!PagePrivate(page)) {
2468 		SetPagePrivate(page);
2469 		page_cache_get(page);
2470 		set_page_private(page, EXTENT_PAGE_PRIVATE);
2471 	}
2472 }
2473 
2474 static void set_page_extent_head(struct page *page, unsigned long len)
2475 {
2476 	WARN_ON(!PagePrivate(page));
2477 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2478 }
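
/*
 * Illustrative sketch (not part of the original file): recovering the
 * length that set_page_extent_head() packed into page->private.  This
 * assumes the EXTENT_PAGE_PRIVATE* flags occupy the low two bits, which
 * is why the length is shifted left by 2 above.
 */
static unsigned long __maybe_unused example_extent_head_len(struct page *page)
{
	/* drop the private flag bits, keep the stored length */
	return page_private(page) >> 2;
}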
2479 
2480 /*
2481  * basic readpage implementation.  Locked extent state structs are inserted
2482  * into the tree and are removed when the IO is done (by the end_io
2483  * handlers)
2484  */
2485 static int __extent_read_full_page(struct extent_io_tree *tree,
2486 				   struct page *page,
2487 				   get_extent_t *get_extent,
2488 				   struct bio **bio, int mirror_num,
2489 				   unsigned long *bio_flags)
2490 {
2491 	struct inode *inode = page->mapping->host;
2492 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2493 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2494 	u64 end;
2495 	u64 cur = start;
2496 	u64 extent_offset;
2497 	u64 last_byte = i_size_read(inode);
2498 	u64 block_start;
2499 	u64 cur_end;
2500 	sector_t sector;
2501 	struct extent_map *em;
2502 	struct block_device *bdev;
2503 	struct btrfs_ordered_extent *ordered;
2504 	int ret;
2505 	int nr = 0;
2506 	size_t pg_offset = 0;
2507 	size_t iosize;
2508 	size_t disk_io_size;
2509 	size_t blocksize = inode->i_sb->s_blocksize;
2510 	unsigned long this_bio_flag = 0;
2511 
2512 	set_page_extent_mapped(page);
2513 
2514 	if (!PageUptodate(page)) {
2515 		if (cleancache_get_page(page) == 0) {
2516 			BUG_ON(blocksize != PAGE_SIZE);
2517 			goto out;
2518 		}
2519 	}
2520 
2521 	end = page_end;
2522 	while (1) {
2523 		lock_extent(tree, start, end, GFP_NOFS);
2524 		ordered = btrfs_lookup_ordered_extent(inode, start);
2525 		if (!ordered)
2526 			break;
2527 		unlock_extent(tree, start, end, GFP_NOFS);
2528 		btrfs_start_ordered_extent(inode, ordered, 1);
2529 		btrfs_put_ordered_extent(ordered);
2530 	}
2531 
2532 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2533 		char *userpage;
2534 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2535 
2536 		if (zero_offset) {
2537 			iosize = PAGE_CACHE_SIZE - zero_offset;
2538 			userpage = kmap_atomic(page, KM_USER0);
2539 			memset(userpage + zero_offset, 0, iosize);
2540 			flush_dcache_page(page);
2541 			kunmap_atomic(userpage, KM_USER0);
2542 		}
2543 	}
2544 	while (cur <= end) {
2545 		if (cur >= last_byte) {
2546 			char *userpage;
2547 			struct extent_state *cached = NULL;
2548 
2549 			iosize = PAGE_CACHE_SIZE - pg_offset;
2550 			userpage = kmap_atomic(page, KM_USER0);
2551 			memset(userpage + pg_offset, 0, iosize);
2552 			flush_dcache_page(page);
2553 			kunmap_atomic(userpage, KM_USER0);
2554 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2555 					    &cached, GFP_NOFS);
2556 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2557 					     &cached, GFP_NOFS);
2558 			break;
2559 		}
2560 		em = get_extent(inode, page, pg_offset, cur,
2561 				end - cur + 1, 0);
2562 		if (IS_ERR_OR_NULL(em)) {
2563 			SetPageError(page);
2564 			unlock_extent(tree, cur, end, GFP_NOFS);
2565 			break;
2566 		}
2567 		extent_offset = cur - em->start;
2568 		BUG_ON(extent_map_end(em) <= cur);
2569 		BUG_ON(end < cur);
2570 
2571 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2572 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2573 			extent_set_compress_type(&this_bio_flag,
2574 						 em->compress_type);
2575 		}
2576 
2577 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2578 		cur_end = min(extent_map_end(em) - 1, end);
2579 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2580 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2581 			disk_io_size = em->block_len;
2582 			sector = em->block_start >> 9;
2583 		} else {
2584 			sector = (em->block_start + extent_offset) >> 9;
2585 			disk_io_size = iosize;
2586 		}
2587 		bdev = em->bdev;
2588 		block_start = em->block_start;
2589 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2590 			block_start = EXTENT_MAP_HOLE;
2591 		free_extent_map(em);
2592 		em = NULL;
2593 
2594 		/* we've found a hole, just zero and go on */
2595 		if (block_start == EXTENT_MAP_HOLE) {
2596 			char *userpage;
2597 			struct extent_state *cached = NULL;
2598 
2599 			userpage = kmap_atomic(page, KM_USER0);
2600 			memset(userpage + pg_offset, 0, iosize);
2601 			flush_dcache_page(page);
2602 			kunmap_atomic(userpage, KM_USER0);
2603 
2604 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2605 					    &cached, GFP_NOFS);
2606 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2607 					     &cached, GFP_NOFS);
2608 			cur = cur + iosize;
2609 			pg_offset += iosize;
2610 			continue;
2611 		}
2612 		/* the get_extent function already copied into the page */
2613 		if (test_range_bit(tree, cur, cur_end,
2614 				   EXTENT_UPTODATE, 1, NULL)) {
2615 			check_page_uptodate(tree, page);
2616 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2617 			cur = cur + iosize;
2618 			pg_offset += iosize;
2619 			continue;
2620 		}
2621 		/* we have an inline extent but it didn't get marked up
2622 		 * to date.  Error out
2623 		 */
2624 		if (block_start == EXTENT_MAP_INLINE) {
2625 			SetPageError(page);
2626 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2627 			cur = cur + iosize;
2628 			pg_offset += iosize;
2629 			continue;
2630 		}
2631 
2632 		ret = 0;
2633 		if (tree->ops && tree->ops->readpage_io_hook) {
2634 			ret = tree->ops->readpage_io_hook(page, cur,
2635 							  cur + iosize - 1);
2636 		}
2637 		if (!ret) {
2638 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2639 			pnr -= page->index;
2640 			ret = submit_extent_page(READ, tree, page,
2641 					 sector, disk_io_size, pg_offset,
2642 					 bdev, bio, pnr,
2643 					 end_bio_extent_readpage, mirror_num,
2644 					 *bio_flags,
2645 					 this_bio_flag);
2646 			nr++;
2647 			*bio_flags = this_bio_flag;
2648 		}
2649 		if (ret)
2650 			SetPageError(page);
2651 		cur = cur + iosize;
2652 		pg_offset += iosize;
2653 	}
2654 out:
2655 	if (!nr) {
2656 		if (!PageError(page))
2657 			SetPageUptodate(page);
2658 		unlock_page(page);
2659 	}
2660 	return 0;
2661 }
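
/*
 * Illustrative sketch (not part of the original file): the block-size
 * round-up the read loop above (and the write loop below) applies to
 * iosize.  blocksize must be a power of two for the mask trick to match
 * a division round-up.
 */
static u64 __maybe_unused example_round_up_to_block(u64 bytes, u64 blocksize)
{
	/* e.g. bytes = 5000, blocksize = 4096 gives 8192 */
	return (bytes + blocksize - 1) & ~(blocksize - 1);
}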
2662 
2663 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2664 			    get_extent_t *get_extent, int mirror_num)
2665 {
2666 	struct bio *bio = NULL;
2667 	unsigned long bio_flags = 0;
2668 	int ret;
2669 
2670 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2671 				      &bio_flags);
2672 	if (bio)
2673 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2674 	return ret;
2675 }
2676 
2677 static noinline void update_nr_written(struct page *page,
2678 				      struct writeback_control *wbc,
2679 				      unsigned long nr_written)
2680 {
2681 	wbc->nr_to_write -= nr_written;
2682 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2683 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2684 		page->mapping->writeback_index = page->index + nr_written;
2685 }
2686 
2687 /*
2688  * the writepage semantics are similar to regular writepage.  extent
2689  * records are inserted to lock ranges in the tree, and as dirty areas
2690  * are found, they are marked writeback.  Then the lock bits are removed
2691  * and the end_io handler clears the writeback ranges
2692  */
2693 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2694 			      void *data)
2695 {
2696 	struct inode *inode = page->mapping->host;
2697 	struct extent_page_data *epd = data;
2698 	struct extent_io_tree *tree = epd->tree;
2699 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2700 	u64 delalloc_start;
2701 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2702 	u64 end;
2703 	u64 cur = start;
2704 	u64 extent_offset;
2705 	u64 last_byte = i_size_read(inode);
2706 	u64 block_start;
2707 	u64 iosize;
2708 	sector_t sector;
2709 	struct extent_state *cached_state = NULL;
2710 	struct extent_map *em;
2711 	struct block_device *bdev;
2712 	int ret;
2713 	int nr = 0;
2714 	size_t pg_offset = 0;
2715 	size_t blocksize;
2716 	loff_t i_size = i_size_read(inode);
2717 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2718 	u64 nr_delalloc;
2719 	u64 delalloc_end;
2720 	int page_started;
2721 	int compressed;
2722 	int write_flags;
2723 	unsigned long nr_written = 0;
2724 	bool fill_delalloc = true;
2725 
2726 	if (wbc->sync_mode == WB_SYNC_ALL)
2727 		write_flags = WRITE_SYNC;
2728 	else
2729 		write_flags = WRITE;
2730 
2731 	trace___extent_writepage(page, inode, wbc);
2732 
2733 	WARN_ON(!PageLocked(page));
2734 
2735 	ClearPageError(page);
2736 
2737 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2738 	if (page->index > end_index ||
2739 	   (page->index == end_index && !pg_offset)) {
2740 		page->mapping->a_ops->invalidatepage(page, 0);
2741 		unlock_page(page);
2742 		return 0;
2743 	}
2744 
2745 	if (page->index == end_index) {
2746 		char *userpage;
2747 
2748 		userpage = kmap_atomic(page, KM_USER0);
2749 		memset(userpage + pg_offset, 0,
2750 		       PAGE_CACHE_SIZE - pg_offset);
2751 		kunmap_atomic(userpage, KM_USER0);
2752 		flush_dcache_page(page);
2753 	}
2754 	pg_offset = 0;
2755 
2756 	set_page_extent_mapped(page);
2757 
2758 	if (!tree->ops || !tree->ops->fill_delalloc)
2759 		fill_delalloc = false;
2760 
2761 	delalloc_start = start;
2762 	delalloc_end = 0;
2763 	page_started = 0;
2764 	if (!epd->extent_locked && fill_delalloc) {
2765 		u64 delalloc_to_write = 0;
2766 		/*
2767 		 * make sure the wbc mapping index is at least updated
2768 		 * to this page.
2769 		 */
2770 		update_nr_written(page, wbc, 0);
2771 
2772 		while (delalloc_end < page_end) {
2773 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2774 						       page,
2775 						       &delalloc_start,
2776 						       &delalloc_end,
2777 						       128 * 1024 * 1024);
2778 			if (nr_delalloc == 0) {
2779 				delalloc_start = delalloc_end + 1;
2780 				continue;
2781 			}
2782 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2783 						 delalloc_end, &page_started,
2784 						 &nr_written);
2785 			/*
2786 			 * delalloc_end is already one less than the total
2787 			 * length, so we don't subtract one from
2788 			 * PAGE_CACHE_SIZE
2789 			 */
2790 			delalloc_to_write += (delalloc_end - delalloc_start +
2791 					      PAGE_CACHE_SIZE) >>
2792 					      PAGE_CACHE_SHIFT;
2793 			delalloc_start = delalloc_end + 1;
2794 		}
2795 		if (wbc->nr_to_write < delalloc_to_write) {
2796 			int thresh = 8192;
2797 
2798 			if (delalloc_to_write < thresh * 2)
2799 				thresh = delalloc_to_write;
2800 			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2801 						 thresh);
2802 		}
2803 
2804 		/* did the fill delalloc function already unlock and start
2805 		 * the IO?
2806 		 */
2807 		if (page_started) {
2808 			ret = 0;
2809 			/*
2810 			 * we've unlocked the page, so we can't update
2811 			 * the mapping's writeback index, just update
2812 			 * nr_to_write.
2813 			 */
2814 			wbc->nr_to_write -= nr_written;
2815 			goto done_unlocked;
2816 		}
2817 	}
2818 	if (tree->ops && tree->ops->writepage_start_hook) {
2819 		ret = tree->ops->writepage_start_hook(page, start,
2820 						      page_end);
2821 		if (ret == -EAGAIN) {
2822 			redirty_page_for_writepage(wbc, page);
2823 			update_nr_written(page, wbc, nr_written);
2824 			unlock_page(page);
2825 			ret = 0;
2826 			goto done_unlocked;
2827 		}
2828 	}
2829 
2830 	/*
2831 	 * we don't want to touch the inode after unlocking the page,
2832 	 * so we update the mapping writeback index now
2833 	 */
2834 	update_nr_written(page, wbc, nr_written + 1);
2835 
2836 	end = page_end;
2837 	if (last_byte <= start) {
2838 		if (tree->ops && tree->ops->writepage_end_io_hook)
2839 			tree->ops->writepage_end_io_hook(page, start,
2840 							 page_end, NULL, 1);
2841 		goto done;
2842 	}
2843 
2844 	blocksize = inode->i_sb->s_blocksize;
2845 
2846 	while (cur <= end) {
2847 		if (cur >= last_byte) {
2848 			if (tree->ops && tree->ops->writepage_end_io_hook)
2849 				tree->ops->writepage_end_io_hook(page, cur,
2850 							 page_end, NULL, 1);
2851 			break;
2852 		}
2853 		em = epd->get_extent(inode, page, pg_offset, cur,
2854 				     end - cur + 1, 1);
2855 		if (IS_ERR_OR_NULL(em)) {
2856 			SetPageError(page);
2857 			break;
2858 		}
2859 
2860 		extent_offset = cur - em->start;
2861 		BUG_ON(extent_map_end(em) <= cur);
2862 		BUG_ON(end < cur);
2863 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2864 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2865 		sector = (em->block_start + extent_offset) >> 9;
2866 		bdev = em->bdev;
2867 		block_start = em->block_start;
2868 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2869 		free_extent_map(em);
2870 		em = NULL;
2871 
2872 		/*
2873 		 * compressed and inline extents are written through other
2874 		 * paths in the FS
2875 		 */
2876 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2877 		    block_start == EXTENT_MAP_INLINE) {
2878 			/*
2879 			 * end_io notification does not happen here for
2880 			 * compressed extents
2881 			 */
2882 			if (!compressed && tree->ops &&
2883 			    tree->ops->writepage_end_io_hook)
2884 				tree->ops->writepage_end_io_hook(page, cur,
2885 							 cur + iosize - 1,
2886 							 NULL, 1);
2887 			else if (compressed) {
2888 				/* we don't want to end_page_writeback on
2889 				 * a compressed extent.  this happens
2890 				 * elsewhere
2891 				 */
2892 				nr++;
2893 			}
2894 
2895 			cur += iosize;
2896 			pg_offset += iosize;
2897 			continue;
2898 		}
2899 		/* leave this out until we have a page_mkwrite call */
2900 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2901 				   EXTENT_DIRTY, 0, NULL)) {
2902 			cur = cur + iosize;
2903 			pg_offset += iosize;
2904 			continue;
2905 		}
2906 
2907 		if (tree->ops && tree->ops->writepage_io_hook) {
2908 			ret = tree->ops->writepage_io_hook(page, cur,
2909 						cur + iosize - 1);
2910 		} else {
2911 			ret = 0;
2912 		}
2913 		if (ret) {
2914 			SetPageError(page);
2915 		} else {
2916 			unsigned long max_nr = end_index + 1;
2917 
2918 			set_range_writeback(tree, cur, cur + iosize - 1);
2919 			if (!PageWriteback(page)) {
2920 				printk(KERN_ERR "btrfs warning page %lu not "
2921 				       "writeback, cur %llu end %llu\n",
2922 				       page->index, (unsigned long long)cur,
2923 				       (unsigned long long)end);
2924 			}
2925 
2926 			ret = submit_extent_page(write_flags, tree, page,
2927 						 sector, iosize, pg_offset,
2928 						 bdev, &epd->bio, max_nr,
2929 						 end_bio_extent_writepage,
2930 						 0, 0, 0);
2931 			if (ret)
2932 				SetPageError(page);
2933 		}
2934 		cur = cur + iosize;
2935 		pg_offset += iosize;
2936 		nr++;
2937 	}
2938 done:
2939 	if (nr == 0) {
2940 		/* make sure the mapping tag for page dirty gets cleared */
2941 		set_page_writeback(page);
2942 		end_page_writeback(page);
2943 	}
2944 	unlock_page(page);
2945 
2946 done_unlocked:
2947 
2948 	/* drop our reference on any cached states */
2949 	free_extent_state(cached_state);
2950 	return 0;
2951 }
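
/*
 * Worked example (not part of the original file) for the delalloc page
 * accounting in __extent_writepage() above: delalloc_end is inclusive,
 * so with 4K pages a range of delalloc_start = 0, delalloc_end = 8191
 * yields (8191 - 0 + 4096) >> 12 = 2 pages, which is why PAGE_CACHE_SIZE
 * is added without the usual "- 1".
 */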
2952 
2953 /**
2954  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2955  * @mapping: address space structure to write
2956  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2957  * @writepage: function called for each page
2958  * @data: data passed to writepage function
2959  *
2960  * If a page is already under I/O, extent_write_cache_pages() skips it, even
2961  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2962  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2963  * and msync() need to guarantee that all the data which was dirty at the time
2964  * the call was made get new I/O started against them.  If wbc->sync_mode is
2965  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2966  * existing IO to complete.
2967  */
2968 static int extent_write_cache_pages(struct extent_io_tree *tree,
2969 			     struct address_space *mapping,
2970 			     struct writeback_control *wbc,
2971 			     writepage_t writepage, void *data,
2972 			     void (*flush_fn)(void *))
2973 {
2974 	int ret = 0;
2975 	int done = 0;
2976 	int nr_to_write_done = 0;
2977 	struct pagevec pvec;
2978 	int nr_pages;
2979 	pgoff_t index;
2980 	pgoff_t end;		/* Inclusive */
2981 	int scanned = 0;
2982 	int tag;
2983 
2984 	pagevec_init(&pvec, 0);
2985 	if (wbc->range_cyclic) {
2986 		index = mapping->writeback_index; /* Start from prev offset */
2987 		end = -1;
2988 	} else {
2989 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2990 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2991 		scanned = 1;
2992 	}
2993 	if (wbc->sync_mode == WB_SYNC_ALL)
2994 		tag = PAGECACHE_TAG_TOWRITE;
2995 	else
2996 		tag = PAGECACHE_TAG_DIRTY;
2997 retry:
2998 	if (wbc->sync_mode == WB_SYNC_ALL)
2999 		tag_pages_for_writeback(mapping, index, end);
3000 	while (!done && !nr_to_write_done && (index <= end) &&
3001 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3002 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3003 		unsigned i;
3004 
3005 		scanned = 1;
3006 		for (i = 0; i < nr_pages; i++) {
3007 			struct page *page = pvec.pages[i];
3008 
3009 			/*
3010 			 * At this point we hold neither mapping->tree_lock nor
3011 			 * lock on the page itself: the page may be truncated or
3012 			 * invalidated (changing page->mapping to NULL), or even
3013 			 * swizzled back from swapper_space to tmpfs file
3014 			 * mapping
3015 			 */
3016 			if (tree->ops &&
3017 			    tree->ops->write_cache_pages_lock_hook) {
3018 				tree->ops->write_cache_pages_lock_hook(page,
3019 							       data, flush_fn);
3020 			} else {
3021 				if (!trylock_page(page)) {
3022 					flush_fn(data);
3023 					lock_page(page);
3024 				}
3025 			}
3026 
3027 			if (unlikely(page->mapping != mapping)) {
3028 				unlock_page(page);
3029 				continue;
3030 			}
3031 
3032 			if (!wbc->range_cyclic && page->index > end) {
3033 				done = 1;
3034 				unlock_page(page);
3035 				continue;
3036 			}
3037 
3038 			if (wbc->sync_mode != WB_SYNC_NONE) {
3039 				if (PageWriteback(page))
3040 					flush_fn(data);
3041 				wait_on_page_writeback(page);
3042 			}
3043 
3044 			if (PageWriteback(page) ||
3045 			    !clear_page_dirty_for_io(page)) {
3046 				unlock_page(page);
3047 				continue;
3048 			}
3049 
3050 			ret = (*writepage)(page, wbc, data);
3051 
3052 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3053 				unlock_page(page);
3054 				ret = 0;
3055 			}
3056 			if (ret)
3057 				done = 1;
3058 
3059 			/*
3060 			 * the filesystem may choose to bump up nr_to_write.
3061 			 * We have to make sure to honor the new nr_to_write
3062 			 * at any time
3063 			 */
3064 			nr_to_write_done = wbc->nr_to_write <= 0;
3065 		}
3066 		pagevec_release(&pvec);
3067 		cond_resched();
3068 	}
3069 	if (!scanned && !done) {
3070 		/*
3071 		 * We hit the last page and there is more work to be done: wrap
3072 		 * back to the start of the file
3073 		 */
3074 		scanned = 1;
3075 		index = 0;
3076 		goto retry;
3077 	}
3078 	return ret;
3079 }
3080 
3081 static void flush_epd_write_bio(struct extent_page_data *epd)
3082 {
3083 	if (epd->bio) {
3084 		if (epd->sync_io)
3085 			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
3086 		else
3087 			submit_one_bio(WRITE, epd->bio, 0, 0);
3088 		epd->bio = NULL;
3089 	}
3090 }
3091 
3092 static noinline void flush_write_bio(void *data)
3093 {
3094 	struct extent_page_data *epd = data;
3095 	flush_epd_write_bio(epd);
3096 }
3097 
3098 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3099 			  get_extent_t *get_extent,
3100 			  struct writeback_control *wbc)
3101 {
3102 	int ret;
3103 	struct extent_page_data epd = {
3104 		.bio = NULL,
3105 		.tree = tree,
3106 		.get_extent = get_extent,
3107 		.extent_locked = 0,
3108 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3109 	};
3110 
3111 	ret = __extent_writepage(page, wbc, &epd);
3112 
3113 	flush_epd_write_bio(&epd);
3114 	return ret;
3115 }
3116 
3117 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3118 			      u64 start, u64 end, get_extent_t *get_extent,
3119 			      int mode)
3120 {
3121 	int ret = 0;
3122 	struct address_space *mapping = inode->i_mapping;
3123 	struct page *page;
3124 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3125 		PAGE_CACHE_SHIFT;
3126 
3127 	struct extent_page_data epd = {
3128 		.bio = NULL,
3129 		.tree = tree,
3130 		.get_extent = get_extent,
3131 		.extent_locked = 1,
3132 		.sync_io = mode == WB_SYNC_ALL,
3133 	};
3134 	struct writeback_control wbc_writepages = {
3135 		.sync_mode	= mode,
3136 		.nr_to_write	= nr_pages * 2,
3137 		.range_start	= start,
3138 		.range_end	= end + 1,
3139 	};
3140 
3141 	while (start <= end) {
3142 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3143 		if (clear_page_dirty_for_io(page))
3144 			ret = __extent_writepage(page, &wbc_writepages, &epd);
3145 		else {
3146 			if (tree->ops && tree->ops->writepage_end_io_hook)
3147 				tree->ops->writepage_end_io_hook(page, start,
3148 						 start + PAGE_CACHE_SIZE - 1,
3149 						 NULL, 1);
3150 			unlock_page(page);
3151 		}
3152 		page_cache_release(page);
3153 		start += PAGE_CACHE_SIZE;
3154 	}
3155 
3156 	flush_epd_write_bio(&epd);
3157 	return ret;
3158 }
3159 
3160 int extent_writepages(struct extent_io_tree *tree,
3161 		      struct address_space *mapping,
3162 		      get_extent_t *get_extent,
3163 		      struct writeback_control *wbc)
3164 {
3165 	int ret = 0;
3166 	struct extent_page_data epd = {
3167 		.bio = NULL,
3168 		.tree = tree,
3169 		.get_extent = get_extent,
3170 		.extent_locked = 0,
3171 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3172 	};
3173 
3174 	ret = extent_write_cache_pages(tree, mapping, wbc,
3175 				       __extent_writepage, &epd,
3176 				       flush_write_bio);
3177 	flush_epd_write_bio(&epd);
3178 	return ret;
3179 }
3180 
3181 int extent_readpages(struct extent_io_tree *tree,
3182 		     struct address_space *mapping,
3183 		     struct list_head *pages, unsigned nr_pages,
3184 		     get_extent_t get_extent)
3185 {
3186 	struct bio *bio = NULL;
3187 	unsigned page_idx;
3188 	unsigned long bio_flags = 0;
3189 
3190 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3191 		struct page *page = list_entry(pages->prev, struct page, lru);
3192 
3193 		prefetchw(&page->flags);
3194 		list_del(&page->lru);
3195 		if (!add_to_page_cache_lru(page, mapping,
3196 					page->index, GFP_NOFS)) {
3197 			__extent_read_full_page(tree, page, get_extent,
3198 						&bio, 0, &bio_flags);
3199 		}
3200 		page_cache_release(page);
3201 	}
3202 	BUG_ON(!list_empty(pages));
3203 	if (bio)
3204 		submit_one_bio(READ, bio, 0, bio_flags);
3205 	return 0;
3206 }
3207 
3208 /*
3209  * basic invalidatepage code, this waits on any locked or writeback
3210  * ranges corresponding to the page, and then deletes any extent state
3211  * records from the tree
3212  */
3213 int extent_invalidatepage(struct extent_io_tree *tree,
3214 			  struct page *page, unsigned long offset)
3215 {
3216 	struct extent_state *cached_state = NULL;
3217 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3218 	u64 end = start + PAGE_CACHE_SIZE - 1;
3219 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3220 
3221 	start += (offset + blocksize - 1) & ~(blocksize - 1);
3222 	if (start > end)
3223 		return 0;
3224 
3225 	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
3226 	wait_on_page_writeback(page);
3227 	clear_extent_bit(tree, start, end,
3228 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3229 			 EXTENT_DO_ACCOUNTING,
3230 			 1, 1, &cached_state, GFP_NOFS);
3231 	return 0;
3232 }
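
/*
 * Worked example (not part of the original file) for the alignment in
 * extent_invalidatepage() above, assuming a 4096-byte blocksize:
 * invalidating from offset 1 rounds start up by one full block, so a
 * partially invalidated block keeps its state and, on a 4K page, start
 * moves past end and nothing is cleared; offset 0 leaves start at the
 * page start and the whole range is cleared.
 */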
3233 
3234 /*
3235  * a helper for releasepage, this tests for areas of the page that
3236  * are locked or under IO and drops the related state bits if it is safe
3237  * to drop the page.
3238  */
3239 int try_release_extent_state(struct extent_map_tree *map,
3240 			     struct extent_io_tree *tree, struct page *page,
3241 			     gfp_t mask)
3242 {
3243 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3244 	u64 end = start + PAGE_CACHE_SIZE - 1;
3245 	int ret = 1;
3246 
3247 	if (test_range_bit(tree, start, end,
3248 			   EXTENT_IOBITS, 0, NULL))
3249 		ret = 0;
3250 	else {
3251 		if ((mask & GFP_NOFS) == GFP_NOFS)
3252 			mask = GFP_NOFS;
3253 		/*
3254 		 * at this point we can safely clear everything except the
3255 		 * locked bit and the nodatasum bit
3256 		 */
3257 		ret = clear_extent_bit(tree, start, end,
3258 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3259 				 0, 0, NULL, mask);
3260 
3261 		/* if clear_extent_bit failed for ENOMEM reasons,
3262 		 * we can't allow the release to continue.
3263 		 */
3264 		if (ret < 0)
3265 			ret = 0;
3266 		else
3267 			ret = 1;
3268 	}
3269 	return ret;
3270 }
3271 
3272 /*
3273  * a helper for releasepage.  As long as there are no locked extents
3274  * in the range corresponding to the page, both state records and extent
3275  * map records are removed
3276  */
3277 int try_release_extent_mapping(struct extent_map_tree *map,
3278 			       struct extent_io_tree *tree, struct page *page,
3279 			       gfp_t mask)
3280 {
3281 	struct extent_map *em;
3282 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3283 	u64 end = start + PAGE_CACHE_SIZE - 1;
3284 
3285 	if ((mask & __GFP_WAIT) &&
3286 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3287 		u64 len;
3288 		while (start <= end) {
3289 			len = end - start + 1;
3290 			write_lock(&map->lock);
3291 			em = lookup_extent_mapping(map, start, len);
3292 			if (IS_ERR_OR_NULL(em)) {
3293 				write_unlock(&map->lock);
3294 				break;
3295 			}
3296 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3297 			    em->start != start) {
3298 				write_unlock(&map->lock);
3299 				free_extent_map(em);
3300 				break;
3301 			}
3302 			if (!test_range_bit(tree, em->start,
3303 					    extent_map_end(em) - 1,
3304 					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3305 					    0, NULL)) {
3306 				remove_extent_mapping(map, em);
3307 				/* once for the rb tree */
3308 				free_extent_map(em);
3309 			}
3310 			start = extent_map_end(em);
3311 			write_unlock(&map->lock);
3312 
3313 			/* once for us */
3314 			free_extent_map(em);
3315 		}
3316 	}
3317 	return try_release_extent_state(map, tree, page, mask);
3318 }
3319 
3320 /*
3321  * helper function for fiemap, which doesn't want to see any holes.
3322  * This maps until we find something past 'last'
3323  */
3324 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3325 						u64 offset,
3326 						u64 last,
3327 						get_extent_t *get_extent)
3328 {
3329 	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3330 	struct extent_map *em;
3331 	u64 len;
3332 
3333 	if (offset >= last)
3334 		return NULL;
3335 
3336 	while (1) {
3337 		len = last - offset;
3338 		if (len == 0)
3339 			break;
3340 		len = (len + sectorsize - 1) & ~(sectorsize - 1);
3341 		em = get_extent(inode, NULL, 0, offset, len, 0);
3342 		if (IS_ERR_OR_NULL(em))
3343 			return em;
3344 
3345 		/* if this isn't a hole return it */
3346 		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3347 		    em->block_start != EXTENT_MAP_HOLE) {
3348 			return em;
3349 		}
3350 
3351 		/* this is a hole, advance to the next extent */
3352 		offset = extent_map_end(em);
3353 		free_extent_map(em);
3354 		if (offset >= last)
3355 			break;
3356 	}
3357 	return NULL;
3358 }
3359 
3360 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3361 		__u64 start, __u64 len, get_extent_t *get_extent)
3362 {
3363 	int ret = 0;
3364 	u64 off = start;
3365 	u64 max = start + len;
3366 	u32 flags = 0;
3367 	u32 found_type;
3368 	u64 last;
3369 	u64 last_for_get_extent = 0;
3370 	u64 disko = 0;
3371 	u64 isize = i_size_read(inode);
3372 	struct btrfs_key found_key;
3373 	struct extent_map *em = NULL;
3374 	struct extent_state *cached_state = NULL;
3375 	struct btrfs_path *path;
3376 	struct btrfs_file_extent_item *item;
3377 	int end = 0;
3378 	u64 em_start = 0;
3379 	u64 em_len = 0;
3380 	u64 em_end = 0;
3381 	unsigned long emflags;
3382 
3383 	if (len == 0)
3384 		return -EINVAL;
3385 
3386 	path = btrfs_alloc_path();
3387 	if (!path)
3388 		return -ENOMEM;
3389 	path->leave_spinning = 1;
3390 
3391 	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3392 	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3393 
3394 	/*
3395 	 * lookup the last file extent.  We're not using i_size here
3396 	 * because there might be preallocation past i_size
3397 	 */
3398 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3399 				       path, btrfs_ino(inode), -1, 0);
3400 	if (ret < 0) {
3401 		btrfs_free_path(path);
3402 		return ret;
3403 	}
3404 	WARN_ON(!ret);
3405 	path->slots[0]--;
3406 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3407 			      struct btrfs_file_extent_item);
3408 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3409 	found_type = btrfs_key_type(&found_key);
3410 
3411 	/* No extents, but there might be delalloc bits */
3412 	if (found_key.objectid != btrfs_ino(inode) ||
3413 	    found_type != BTRFS_EXTENT_DATA_KEY) {
3414 		/* have to trust i_size as the end */
3415 		last = (u64)-1;
3416 		last_for_get_extent = isize;
3417 	} else {
3418 		/*
3419 		 * remember the start of the last extent.  There are a
3420 		 * bunch of different factors that go into the length of the
3421 		 * extent, so it's much less complex to remember where it started
3422 		 */
3423 		last = found_key.offset;
3424 		last_for_get_extent = last + 1;
3425 	}
3426 	btrfs_free_path(path);
3427 
3428 	/*
3429 	 * we might have some extents allocated but more delalloc past those
3430 	 * extents.  so, we trust isize unless the start of the last extent is
3431 	 * beyond isize
3432 	 */
3433 	if (last < isize) {
3434 		last = (u64)-1;
3435 		last_for_get_extent = isize;
3436 	}
3437 
3438 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3439 			 &cached_state, GFP_NOFS);
3440 
3441 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
3442 				   get_extent);
3443 	if (!em)
3444 		goto out;
3445 	if (IS_ERR(em)) {
3446 		ret = PTR_ERR(em);
3447 		goto out;
3448 	}
3449 
3450 	while (!end) {
3451 		u64 offset_in_extent;
3452 
3453 		/* break if the extent we found is outside the range */
3454 		if (em->start >= max || extent_map_end(em) < off)
3455 			break;
3456 
3457 		/*
3458 		 * get_extent may return an extent that starts before our
3459 		 * requested range.  We have to make sure the ranges
3460 		 * we return to fiemap always move forward and don't
3461 		 * overlap, so adjust the offsets here
3462 		 */
3463 		em_start = max(em->start, off);
3464 
3465 		/*
3466 		 * record the offset from the start of the extent
3467 		 * for adjusting the disk offset below
3468 		 */
3469 		offset_in_extent = em_start - em->start;
3470 		em_end = extent_map_end(em);
3471 		em_len = em_end - em_start;
3472 		emflags = em->flags;
3473 		disko = 0;
3474 		flags = 0;
3475 
3476 		/*
3477 		 * bump off for our next call to get_extent
3478 		 */
3479 		off = extent_map_end(em);
3480 		if (off >= max)
3481 			end = 1;
3482 
3483 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3484 			end = 1;
3485 			flags |= FIEMAP_EXTENT_LAST;
3486 		} else if (em->block_start == EXTENT_MAP_INLINE) {
3487 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
3488 				  FIEMAP_EXTENT_NOT_ALIGNED);
3489 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
3490 			flags |= (FIEMAP_EXTENT_DELALLOC |
3491 				  FIEMAP_EXTENT_UNKNOWN);
3492 		} else {
3493 			disko = em->block_start + offset_in_extent;
3494 		}
3495 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3496 			flags |= FIEMAP_EXTENT_ENCODED;
3497 
3498 		free_extent_map(em);
3499 		em = NULL;
3500 		if ((em_start >= last) || em_len == (u64)-1 ||
3501 		   (last == (u64)-1 && isize <= em_end)) {
3502 			flags |= FIEMAP_EXTENT_LAST;
3503 			end = 1;
3504 		}
3505 
3506 		/* now scan forward to see if this is really the last extent. */
3507 		em = get_extent_skip_holes(inode, off, last_for_get_extent,
3508 					   get_extent);
3509 		if (IS_ERR(em)) {
3510 			ret = PTR_ERR(em);
3511 			goto out;
3512 		}
3513 		if (!em) {
3514 			flags |= FIEMAP_EXTENT_LAST;
3515 			end = 1;
3516 		}
3517 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3518 					      em_len, flags);
3519 		if (ret)
3520 			goto out_free;
3521 	}
3522 out_free:
3523 	free_extent_map(em);
3524 out:
3525 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3526 			     &cached_state, GFP_NOFS);
3527 	return ret;
3528 }
3529 
3530 inline struct page *extent_buffer_page(struct extent_buffer *eb,
3531 					      unsigned long i)
3532 {
3533 	struct page *p;
3534 	struct address_space *mapping;
3535 
3536 	if (i == 0)
3537 		return eb->first_page;
3538 	i += eb->start >> PAGE_CACHE_SHIFT;
3539 	mapping = eb->first_page->mapping;
3540 	if (!mapping)
3541 		return NULL;
3542 
3543 	/*
3544 	 * extent_buffer_page is only called after pinning the page
3545 	 * by increasing the reference count.  So we know the page must
3546 	 * be in the radix tree.
3547 	 */
3548 	rcu_read_lock();
3549 	p = radix_tree_lookup(&mapping->page_tree, i);
3550 	rcu_read_unlock();
3551 
3552 	return p;
3553 }
3554 
3555 inline unsigned long num_extent_pages(u64 start, u64 len)
3556 {
3557 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3558 		(start >> PAGE_CACHE_SHIFT);
3559 }
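
/*
 * Worked example (not part of the original file) for num_extent_pages(),
 * assuming 4K pages: start = 2048, len = 4096 gives
 * ((2048 + 4096 + 4095) >> 12) - (2048 >> 12) = 2 - 0 = 2,
 * i.e. an unaligned one-page buffer still straddles two pages.
 */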
3560 
3561 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3562 						   u64 start,
3563 						   unsigned long len,
3564 						   gfp_t mask)
3565 {
3566 	struct extent_buffer *eb = NULL;
3567 #if LEAK_DEBUG
3568 	unsigned long flags;
3569 #endif
3570 
3571 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3572 	if (eb == NULL)
3573 		return NULL;
3574 	eb->start = start;
3575 	eb->len = len;
3576 	rwlock_init(&eb->lock);
3577 	atomic_set(&eb->write_locks, 0);
3578 	atomic_set(&eb->read_locks, 0);
3579 	atomic_set(&eb->blocking_readers, 0);
3580 	atomic_set(&eb->blocking_writers, 0);
3581 	atomic_set(&eb->spinning_readers, 0);
3582 	atomic_set(&eb->spinning_writers, 0);
3583 	eb->lock_nested = 0;
3584 	init_waitqueue_head(&eb->write_lock_wq);
3585 	init_waitqueue_head(&eb->read_lock_wq);
3586 
3587 #if LEAK_DEBUG
3588 	spin_lock_irqsave(&leak_lock, flags);
3589 	list_add(&eb->leak_list, &buffers);
3590 	spin_unlock_irqrestore(&leak_lock, flags);
3591 #endif
3592 	atomic_set(&eb->refs, 1);
3593 
3594 	return eb;
3595 }
3596 
3597 static void __free_extent_buffer(struct extent_buffer *eb)
3598 {
3599 #if LEAK_DEBUG
3600 	unsigned long flags;
3601 	spin_lock_irqsave(&leak_lock, flags);
3602 	list_del(&eb->leak_list);
3603 	spin_unlock_irqrestore(&leak_lock, flags);
3604 #endif
3605 	kmem_cache_free(extent_buffer_cache, eb);
3606 }
3607 
3608 /*
3609  * Helper for releasing an extent buffer's pages from start_idx onwards.
3610  */
3611 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3612 						unsigned long start_idx)
3613 {
3614 	unsigned long index;
3615 	struct page *page;
3616 
3617 	if (!eb->first_page)
3618 		return;
3619 
3620 	index = num_extent_pages(eb->start, eb->len);
3621 	if (start_idx >= index)
3622 		return;
3623 
3624 	do {
3625 		index--;
3626 		page = extent_buffer_page(eb, index);
3627 		if (page)
3628 			page_cache_release(page);
3629 	} while (index != start_idx);
3630 }
3631 
3632 /*
3633  * Helper for releasing the extent buffer.
3634  */
3635 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3636 {
3637 	btrfs_release_extent_buffer_page(eb, 0);
3638 	__free_extent_buffer(eb);
3639 }
3640 
3641 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3642 					  u64 start, unsigned long len,
3643 					  struct page *page0)
3644 {
3645 	unsigned long num_pages = num_extent_pages(start, len);
3646 	unsigned long i;
3647 	unsigned long index = start >> PAGE_CACHE_SHIFT;
3648 	struct extent_buffer *eb;
3649 	struct extent_buffer *exists = NULL;
3650 	struct page *p;
3651 	struct address_space *mapping = tree->mapping;
3652 	int uptodate = 1;
3653 	int ret;
3654 
3655 	rcu_read_lock();
3656 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3657 	if (eb && atomic_inc_not_zero(&eb->refs)) {
3658 		rcu_read_unlock();
3659 		mark_page_accessed(eb->first_page);
3660 		return eb;
3661 	}
3662 	rcu_read_unlock();
3663 
3664 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3665 	if (!eb)
3666 		return NULL;
3667 
3668 	if (page0) {
3669 		eb->first_page = page0;
3670 		i = 1;
3671 		index++;
3672 		page_cache_get(page0);
3673 		mark_page_accessed(page0);
3674 		set_page_extent_mapped(page0);
3675 		set_page_extent_head(page0, len);
3676 		uptodate = PageUptodate(page0);
3677 	} else {
3678 		i = 0;
3679 	}
3680 	for (; i < num_pages; i++, index++) {
3681 		p = find_or_create_page(mapping, index, GFP_NOFS);
3682 		if (!p) {
3683 			WARN_ON(1);
3684 			goto free_eb;
3685 		}
3686 		set_page_extent_mapped(p);
3687 		mark_page_accessed(p);
3688 		if (i == 0) {
3689 			eb->first_page = p;
3690 			set_page_extent_head(p, len);
3691 		} else {
3692 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3693 		}
3694 		if (!PageUptodate(p))
3695 			uptodate = 0;
3696 
3697 		/*
3698 		 * see below about how we avoid a nasty race with release page
3699 		 * and why we unlock later
3700 		 */
3701 		if (i != 0)
3702 			unlock_page(p);
3703 	}
3704 	if (uptodate)
3705 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3706 
3707 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3708 	if (ret)
3709 		goto free_eb;
3710 
3711 	spin_lock(&tree->buffer_lock);
3712 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3713 	if (ret == -EEXIST) {
3714 		exists = radix_tree_lookup(&tree->buffer,
3715 						start >> PAGE_CACHE_SHIFT);
3716 		/* add one reference for the caller */
3717 		atomic_inc(&exists->refs);
3718 		spin_unlock(&tree->buffer_lock);
3719 		radix_tree_preload_end();
3720 		goto free_eb;
3721 	}
3722 	/* add one reference for the tree */
3723 	atomic_inc(&eb->refs);
3724 	spin_unlock(&tree->buffer_lock);
3725 	radix_tree_preload_end();
3726 
3727 	/*
3728 	 * there is a race where release page may have
3729 	 * tried to find this extent buffer in the radix
3730 	 * but failed.  It will tell the VM it is safe to
3731 	 * reclaim the page, and it will clear the page private bit.
3732 	 * We must make sure to set the page private bit properly
3733 	 * after the extent buffer is in the radix tree so
3734 	 * it doesn't get lost
3735 	 */
3736 	set_page_extent_mapped(eb->first_page);
3737 	set_page_extent_head(eb->first_page, eb->len);
3738 	if (!page0)
3739 		unlock_page(eb->first_page);
3740 	return eb;
3741 
3742 free_eb:
3743 	if (eb->first_page && !page0)
3744 		unlock_page(eb->first_page);
3745 
3746 	if (!atomic_dec_and_test(&eb->refs))
3747 		return exists;
3748 	btrfs_release_extent_buffer(eb);
3749 	return exists;
3750 }
3751 
3752 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3753 					 u64 start, unsigned long len)
3754 {
3755 	struct extent_buffer *eb;
3756 
3757 	rcu_read_lock();
3758 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3759 	if (eb && atomic_inc_not_zero(&eb->refs)) {
3760 		rcu_read_unlock();
3761 		mark_page_accessed(eb->first_page);
3762 		return eb;
3763 	}
3764 	rcu_read_unlock();
3765 
3766 	return NULL;
3767 }
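
/*
 * Illustrative sketch (not part of the original file): the RCU lookup
 * pattern shared by alloc_extent_buffer() and find_extent_buffer()
 * above.  atomic_inc_not_zero() refuses to take a reference once the
 * count has already dropped to zero, so a buffer racing with its own
 * release is never handed out half-freed.
 */
static __maybe_unused struct extent_buffer *example_eb_lookup(
		struct extent_io_tree *tree, u64 start)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && !atomic_inc_not_zero(&eb->refs))
		eb = NULL;
	rcu_read_unlock();

	return eb;
}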
3768 
3769 void free_extent_buffer(struct extent_buffer *eb)
3770 {
3771 	if (!eb)
3772 		return;
3773 
3774 	if (!atomic_dec_and_test(&eb->refs))
3775 		return;
3776 
3777 	WARN_ON(1);
3778 }
3779 
3780 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3781 			      struct extent_buffer *eb)
3782 {
3783 	unsigned long i;
3784 	unsigned long num_pages;
3785 	struct page *page;
3786 
3787 	num_pages = num_extent_pages(eb->start, eb->len);
3788 
3789 	for (i = 0; i < num_pages; i++) {
3790 		page = extent_buffer_page(eb, i);
3791 		if (!PageDirty(page))
3792 			continue;
3793 
3794 		lock_page(page);
3795 		WARN_ON(!PagePrivate(page));
3796 
3797 		set_page_extent_mapped(page);
3798 		if (i == 0)
3799 			set_page_extent_head(page, eb->len);
3800 
3801 		clear_page_dirty_for_io(page);
3802 		spin_lock_irq(&page->mapping->tree_lock);
3803 		if (!PageDirty(page)) {
3804 			radix_tree_tag_clear(&page->mapping->page_tree,
3805 						page_index(page),
3806 						PAGECACHE_TAG_DIRTY);
3807 		}
3808 		spin_unlock_irq(&page->mapping->tree_lock);
3809 		ClearPageError(page);
3810 		unlock_page(page);
3811 	}
3812 	return 0;
3813 }
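
/*
 * Editor's note: clear_page_dirty_for_io() clears PG_dirty but leaves
 * PAGECACHE_TAG_DIRTY set in the mapping's radix tree; that tag is
 * normally cleared later by the writeback path.  Since these pages are
 * not being handed to writeback here, the tag is cleared by hand under
 * mapping->tree_lock so that write_cache_pages()-style tagged walks
 * stop seeing the pages as dirty.
 */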
3814 
3815 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3816 			     struct extent_buffer *eb)
3817 {
3818 	unsigned long i;
3819 	unsigned long num_pages;
3820 	int was_dirty = 0;
3821 
3822 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3823 	num_pages = num_extent_pages(eb->start, eb->len);
3824 	for (i = 0; i < num_pages; i++)
3825 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3826 	return was_dirty;
3827 }
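
/*
 * Editor's note: the return value reports whether the buffer was
 * already dirty.  test_and_set_bit() on EXTENT_BUFFER_DIRTY makes the
 * transition atomic, so callers can account newly dirtied metadata
 * exactly once even if the buffer is redirtied concurrently.
 */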
3828 
3829 static int __eb_straddles_pages(u64 start, u64 len)
3830 {
3831 	if (len < PAGE_CACHE_SIZE)
3832 		return 1;
3833 	if (start & (PAGE_CACHE_SIZE - 1))
3834 		return 1;
3835 	if ((start + len) & (PAGE_CACHE_SIZE - 1))
3836 		return 1;
3837 	return 0;
3838 }
3839 
3840 static int eb_straddles_pages(struct extent_buffer *eb)
3841 {
3842 	return __eb_straddles_pages(eb->start, eb->len);
3843 }
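
/*
 * Editor's example (assuming 4K pages): a 4K buffer at start 0x1000 is
 * exactly one page, so __eb_straddles_pages(0x1000, 0x1000) returns 0
 * and the per-page uptodate bits are authoritative.  A 2K buffer, or a
 * 4K buffer at start 0x1800, shares pages with neighbouring data, so
 * the function returns 1 and the EXTENT_UPTODATE state must also be
 * tracked per-range in the io tree.
 */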
3844 
3845 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3846 				struct extent_buffer *eb,
3847 				struct extent_state **cached_state)
3848 {
3849 	unsigned long i;
3850 	struct page *page;
3851 	unsigned long num_pages;
3852 
3853 	num_pages = num_extent_pages(eb->start, eb->len);
3854 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3855 
3856 	if (eb_straddles_pages(eb)) {
3857 		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3858 				      cached_state, GFP_NOFS);
3859 	}
3860 	for (i = 0; i < num_pages; i++) {
3861 		page = extent_buffer_page(eb, i);
3862 		if (page)
3863 			ClearPageUptodate(page);
3864 	}
3865 	return 0;
3866 }
3867 
3868 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3869 				struct extent_buffer *eb)
3870 {
3871 	unsigned long i;
3872 	struct page *page;
3873 	unsigned long num_pages;
3874 
3875 	num_pages = num_extent_pages(eb->start, eb->len);
3876 
3877 	if (eb_straddles_pages(eb)) {
3878 		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3879 				    NULL, GFP_NOFS);
3880 	}
3881 	for (i = 0; i < num_pages; i++) {
3882 		page = extent_buffer_page(eb, i);
3883 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3884 		    ((i == num_pages - 1) &&
3885 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3886 			check_page_uptodate(tree, page);
3887 			continue;
3888 		}
3889 		SetPageUptodate(page);
3890 	}
3891 	return 0;
3892 }
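
/*
 * Editor's note: the first and last pages get special treatment when
 * the buffer is not page aligned, because they may also hold bytes
 * belonging to neighbouring data.  Blindly calling SetPageUptodate()
 * there could claim data that was never read, so check_page_uptodate()
 * consults the io tree and only marks the page once the whole page
 * range is known to be uptodate.
 */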
3893 
3894 int extent_range_uptodate(struct extent_io_tree *tree,
3895 			  u64 start, u64 end)
3896 {
3897 	struct page *page;
3898 	int ret;
3899 	int pg_uptodate = 1;
3900 	int uptodate;
3901 	unsigned long index;
3902 
3903 	if (__eb_straddles_pages(start, end - start + 1)) {
3904 		ret = test_range_bit(tree, start, end,
3905 				     EXTENT_UPTODATE, 1, NULL);
3906 		if (ret)
3907 			return 1;
3908 	}
3909 	while (start <= end) {
3910 		index = start >> PAGE_CACHE_SHIFT;
3911 		page = find_get_page(tree->mapping, index);
3912 		uptodate = page && PageUptodate(page);
3913 		if (page)
			page_cache_release(page);
3914 		if (!uptodate) {
3915 			pg_uptodate = 0;
3916 			break;
3917 		}
3918 		start += PAGE_CACHE_SIZE;
3919 	}
3920 	return pg_uptodate;
3921 }
3922 
3923 int extent_buffer_uptodate(struct extent_io_tree *tree,
3924 			   struct extent_buffer *eb,
3925 			   struct extent_state *cached_state)
3926 {
3927 	int ret = 0;
3928 	unsigned long num_pages;
3929 	unsigned long i;
3930 	struct page *page;
3931 	int pg_uptodate = 1;
3932 
3933 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3934 		return 1;
3935 
3936 	if (eb_straddles_pages(eb)) {
3937 		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3938 				   EXTENT_UPTODATE, 1, cached_state);
3939 		if (ret)
3940 			return ret;
3941 	}
3942 
3943 	num_pages = num_extent_pages(eb->start, eb->len);
3944 	for (i = 0; i < num_pages; i++) {
3945 		page = extent_buffer_page(eb, i);
3946 		if (!PageUptodate(page)) {
3947 			pg_uptodate = 0;
3948 			break;
3949 		}
3950 	}
3951 	return pg_uptodate;
3952 }
3953 
3954 int read_extent_buffer_pages(struct extent_io_tree *tree,
3955 			     struct extent_buffer *eb, u64 start, int wait,
3956 			     get_extent_t *get_extent, int mirror_num)
3957 {
3958 	unsigned long i;
3959 	unsigned long start_i;
3960 	struct page *page;
3961 	int err;
3962 	int ret = 0;
3963 	int locked_pages = 0;
3964 	int all_uptodate = 1;
3965 	int inc_all_pages = 0;
3966 	unsigned long num_pages;
3967 	struct bio *bio = NULL;
3968 	unsigned long bio_flags = 0;
3969 
3970 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3971 		return 0;
3972 
3973 	if (eb_straddles_pages(eb)) {
3974 		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3975 				   EXTENT_UPTODATE, 1, NULL)) {
3976 			return 0;
3977 		}
3978 	}
3979 
3980 	if (start) {
3981 		WARN_ON(start < eb->start);
3982 		start_i = (start >> PAGE_CACHE_SHIFT) -
3983 			(eb->start >> PAGE_CACHE_SHIFT);
3984 	} else {
3985 		start_i = 0;
3986 	}
3987 
3988 	num_pages = num_extent_pages(eb->start, eb->len);
3989 	for (i = start_i; i < num_pages; i++) {
3990 		page = extent_buffer_page(eb, i);
3991 		if (wait == WAIT_NONE) {
3992 			if (!trylock_page(page))
3993 				goto unlock_exit;
3994 		} else {
3995 			lock_page(page);
3996 		}
3997 		locked_pages++;
3998 		if (!PageUptodate(page))
3999 			all_uptodate = 0;
4000 	}
4001 	if (all_uptodate) {
4002 		if (start_i == 0)
4003 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4004 		goto unlock_exit;
4005 	}
4006 
4007 	for (i = start_i; i < num_pages; i++) {
4008 		page = extent_buffer_page(eb, i);
4009 
4010 		WARN_ON(!PagePrivate(page));
4011 
4012 		set_page_extent_mapped(page);
4013 		if (i == 0)
4014 			set_page_extent_head(page, eb->len);
4015 
4016 		if (inc_all_pages)
4017 			page_cache_get(page);
4018 		if (!PageUptodate(page)) {
4019 			if (start_i == 0)
4020 				inc_all_pages = 1;
4021 			ClearPageError(page);
4022 			err = __extent_read_full_page(tree, page,
4023 						      get_extent, &bio,
4024 						      mirror_num, &bio_flags);
4025 			if (err)
4026 				ret = err;
4027 		} else {
4028 			unlock_page(page);
4029 		}
4030 	}
4031 
4032 	if (bio)
4033 		submit_one_bio(READ, bio, mirror_num, bio_flags);
4034 
4035 	if (ret || wait != WAIT_COMPLETE)
4036 		return ret;
4037 
4038 	for (i = start_i; i < num_pages; i++) {
4039 		page = extent_buffer_page(eb, i);
4040 		wait_on_page_locked(page);
4041 		if (!PageUptodate(page))
4042 			ret = -EIO;
4043 	}
4044 
4045 	if (!ret)
4046 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4047 	return ret;
4048 
4049 unlock_exit:
4050 	i = start_i;
4051 	while (locked_pages > 0) {
4052 		page = extent_buffer_page(eb, i);
4053 		i++;
4054 		unlock_page(page);
4055 		locked_pages--;
4056 	}
4057 	return ret;
4058 }
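
/*
 * Editor's usage sketch (hypothetical caller, not from this file):
 *
 *	ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
 *				       btree_get_extent, mirror_num);
 *	if (!ret)
 *		... eb contents are now uptodate ...
 *
 * With WAIT_NONE the reads are only kicked off: trylock_page()
 * failures bail out early so the caller never blocks.  With
 * WAIT_COMPLETE the final loop waits on every page and turns any
 * !PageUptodate page into -EIO.
 */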
4059 
4060 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4061 			unsigned long start,
4062 			unsigned long len)
4063 {
4064 	size_t cur;
4065 	size_t offset;
4066 	struct page *page;
4067 	char *kaddr;
4068 	char *dst = (char *)dstv;
4069 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4070 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4071 
4072 	WARN_ON(start > eb->len);
4073 	WARN_ON(start + len > eb->len);
4074 
4075 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4076 
4077 	while (len > 0) {
4078 		page = extent_buffer_page(eb, i);
4079 
4080 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4081 		kaddr = page_address(page);
4082 		memcpy(dst, kaddr + offset, cur);
4083 
4084 		dst += cur;
4085 		len -= cur;
4086 		offset = 0;
4087 		i++;
4088 	}
4089 }
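
/*
 * Editor's worked example (assuming 4K pages): for an eb at start
 * 0x1800, start_offset is 0x800.  Reading 0x1000 bytes at offset 0x600
 * starts in page i = (0x800 + 0x600) >> 12 = 0 at in-page offset
 * 0xe00, so the first memcpy covers 0x200 bytes and the remaining
 * 0xe00 bytes come from the beginning of page 1.
 */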
4090 
4091 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4092 			       unsigned long min_len, char **map,
4093 			       unsigned long *map_start,
4094 			       unsigned long *map_len)
4095 {
4096 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4097 	char *kaddr;
4098 	struct page *p;
4099 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4100 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4101 	unsigned long end_i = (start_offset + start + min_len - 1) >>
4102 		PAGE_CACHE_SHIFT;
4103 
4104 	if (i != end_i)
4105 		return -EINVAL;
4106 
4107 	if (i == 0) {
4108 		offset = start_offset;
4109 		*map_start = 0;
4110 	} else {
4111 		offset = 0;
4112 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4113 	}
4114 
4115 	if (start + min_len > eb->len) {
4116 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4117 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
4118 		       eb->len, start, min_len);
4119 		WARN_ON(1);
4120 		return -EINVAL;
4121 	}
4122 
4123 	p = extent_buffer_page(eb, i);
4124 	kaddr = page_address(p);
4125 	*map = kaddr + offset;
4126 	*map_len = PAGE_CACHE_SIZE - offset;
4127 	return 0;
4128 }
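
/*
 * Editor's example (assuming 4K pages): a mapping is only handed out
 * when the requested range fits inside one page.  For a page-aligned
 * eb, start 0xff0 with min_len 0x20 gives i = 0 but end_i = 1, so the
 * call returns -EINVAL and the caller must fall back to copying the
 * bytes out with read_extent_buffer() instead of using a direct
 * kernel address.
 */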
4129 
4130 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4131 			  unsigned long start,
4132 			  unsigned long len)
4133 {
4134 	size_t cur;
4135 	size_t offset;
4136 	struct page *page;
4137 	char *kaddr;
4138 	char *ptr = (char *)ptrv;
4139 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4140 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4141 	int ret = 0;
4142 
4143 	WARN_ON(start > eb->len);
4144 	WARN_ON(start + len > eb->len);
4145 
4146 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4147 
4148 	while (len > 0) {
4149 		page = extent_buffer_page(eb, i);
4150 
4151 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4152 
4153 		kaddr = page_address(page);
4154 		ret = memcmp(ptr, kaddr + offset, cur);
4155 		if (ret)
4156 			break;
4157 
4158 		ptr += cur;
4159 		len -= cur;
4160 		offset = 0;
4161 		i++;
4162 	}
4163 	return ret;
4164 }
4165 
4166 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4167 			 unsigned long start, unsigned long len)
4168 {
4169 	size_t cur;
4170 	size_t offset;
4171 	struct page *page;
4172 	char *kaddr;
4173 	char *src = (char *)srcv;
4174 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4175 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4176 
4177 	WARN_ON(start > eb->len);
4178 	WARN_ON(start + len > eb->len);
4179 
4180 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4181 
4182 	while (len > 0) {
4183 		page = extent_buffer_page(eb, i);
4184 		WARN_ON(!PageUptodate(page));
4185 
4186 		cur = min(len, PAGE_CACHE_SIZE - offset);
4187 		kaddr = page_address(page);
4188 		memcpy(kaddr + offset, src, cur);
4189 
4190 		src += cur;
4191 		len -= cur;
4192 		offset = 0;
4193 		i++;
4194 	}
4195 }
4196 
4197 void memset_extent_buffer(struct extent_buffer *eb, char c,
4198 			  unsigned long start, unsigned long len)
4199 {
4200 	size_t cur;
4201 	size_t offset;
4202 	struct page *page;
4203 	char *kaddr;
4204 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4205 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4206 
4207 	WARN_ON(start > eb->len);
4208 	WARN_ON(start + len > eb->len);
4209 
4210 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4211 
4212 	while (len > 0) {
4213 		page = extent_buffer_page(eb, i);
4214 		WARN_ON(!PageUptodate(page));
4215 
4216 		cur = min(len, PAGE_CACHE_SIZE - offset);
4217 		kaddr = page_address(page);
4218 		memset(kaddr + offset, c, cur);
4219 
4220 		len -= cur;
4221 		offset = 0;
4222 		i++;
4223 	}
4224 }
4225 
4226 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4227 			unsigned long dst_offset, unsigned long src_offset,
4228 			unsigned long len)
4229 {
4230 	u64 dst_len = dst->len;
4231 	size_t cur;
4232 	size_t offset;
4233 	struct page *page;
4234 	char *kaddr;
4235 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4236 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4237 
4238 	WARN_ON(src->len != dst_len);
4239 
4240 	offset = (start_offset + dst_offset) &
4241 		((unsigned long)PAGE_CACHE_SIZE - 1);
4242 
4243 	while (len > 0) {
4244 		page = extent_buffer_page(dst, i);
4245 		WARN_ON(!PageUptodate(page));
4246 
4247 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4248 
4249 		kaddr = page_address(page);
4250 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
4251 
4252 		src_offset += cur;
4253 		len -= cur;
4254 		offset = 0;
4255 		i++;
4256 	}
4257 }
4258 
4259 static void move_pages(struct page *dst_page, struct page *src_page,
4260 		       unsigned long dst_off, unsigned long src_off,
4261 		       unsigned long len)
4262 {
4263 	char *dst_kaddr = page_address(dst_page);
4264 	if (dst_page == src_page) {
4265 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4266 	} else {
4267 		char *src_kaddr = page_address(src_page);
4268 		char *p = dst_kaddr + dst_off + len;
4269 		char *s = src_kaddr + src_off + len;
4270 
4271 		while (len--)
4272 			*--p = *--s;
4273 	}
4274 }
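
/*
 * Editor's note: the byte-by-byte backward copy matches the tail-first
 * chunk walk in memmove_extent_buffer() below.  Within one page,
 * memmove() handles the overlap; across two distinct pages the regions
 * cannot alias in memory and a memcpy() would also work, but copying
 * highest address first is uniformly safe for the overlapping case.
 */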
4275 
4276 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4277 {
4278 	unsigned long distance = (src > dst) ? src - dst : dst - src;
4279 	return distance < len;
4280 }
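
/*
 * Editor's example: with src 100, dst 150 and len 80, the distance is
 * 50 < 80, so [100,180) and [150,230) overlap and a plain forward
 * memcpy() would read bytes it had already overwritten; with len 50
 * the ranges just touch and do not overlap.
 */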
4281 
4282 static void copy_pages(struct page *dst_page, struct page *src_page,
4283 		       unsigned long dst_off, unsigned long src_off,
4284 		       unsigned long len)
4285 {
4286 	char *dst_kaddr = page_address(dst_page);
4287 	char *src_kaddr;
4288 
4289 	if (dst_page != src_page) {
4290 		src_kaddr = page_address(src_page);
4291 	} else {
4292 		src_kaddr = dst_kaddr;
4293 		BUG_ON(areas_overlap(src_off, dst_off, len));
4294 	}
4295 
4296 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4297 }
4298 
4299 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4300 			   unsigned long src_offset, unsigned long len)
4301 {
4302 	size_t cur;
4303 	size_t dst_off_in_page;
4304 	size_t src_off_in_page;
4305 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4306 	unsigned long dst_i;
4307 	unsigned long src_i;
4308 
4309 	if (src_offset + len > dst->len) {
4310 		printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
4311 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4312 		BUG_ON(1);
4313 	}
4314 	if (dst_offset + len > dst->len) {
4315 		printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
4316 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4317 		BUG_ON(1);
4318 	}
4319 
4320 	while (len > 0) {
4321 		dst_off_in_page = (start_offset + dst_offset) &
4322 			((unsigned long)PAGE_CACHE_SIZE - 1);
4323 		src_off_in_page = (start_offset + src_offset) &
4324 			((unsigned long)PAGE_CACHE_SIZE - 1);
4325 
4326 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4327 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4328 
4329 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4330 					       src_off_in_page));
4331 		cur = min_t(unsigned long, cur,
4332 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4333 
4334 		copy_pages(extent_buffer_page(dst, dst_i),
4335 			   extent_buffer_page(dst, src_i),
4336 			   dst_off_in_page, src_off_in_page, cur);
4337 
4338 		src_offset += cur;
4339 		dst_offset += cur;
4340 		len -= cur;
4341 	}
4342 }
4343 
4344 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4345 			   unsigned long src_offset, unsigned long len)
4346 {
4347 	size_t cur;
4348 	size_t dst_off_in_page;
4349 	size_t src_off_in_page;
4350 	unsigned long dst_end = dst_offset + len - 1;
4351 	unsigned long src_end = src_offset + len - 1;
4352 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4353 	unsigned long dst_i;
4354 	unsigned long src_i;
4355 
4356 	if (src_offset + len > dst->len) {
4357 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4358 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4359 		BUG_ON(1);
4360 	}
4361 	if (dst_offset + len > dst->len) {
4362 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4363 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4364 		BUG_ON(1);
4365 	}
4366 	if (!areas_overlap(src_offset, dst_offset, len)) {
4367 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4368 		return;
4369 	}
4370 	while (len > 0) {
4371 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4372 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4373 
4374 		dst_off_in_page = (start_offset + dst_end) &
4375 			((unsigned long)PAGE_CACHE_SIZE - 1);
4376 		src_off_in_page = (start_offset + src_end) &
4377 			((unsigned long)PAGE_CACHE_SIZE - 1);
4378 
4379 		cur = min_t(unsigned long, len, src_off_in_page + 1);
4380 		cur = min(cur, dst_off_in_page + 1);
4381 		move_pages(extent_buffer_page(dst, dst_i),
4382 			   extent_buffer_page(dst, src_i),
4383 			   dst_off_in_page - cur + 1,
4384 			   src_off_in_page - cur + 1, cur);
4385 
4386 		dst_end -= cur;
4387 		src_end -= cur;
4388 		len -= cur;
4389 	}
4390 }
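
/*
 * Editor's worked example (assuming 4K pages, page-aligned eb): moving
 * 0x1800 bytes from offset 0x400 to 0xc00 overlaps (distance 0x800 <
 * 0x1800), so the walk runs tail first.  The first pass has dst_end
 * 0x23ff and src_end 0x1bff, and cur = min(len, src_off_in_page + 1,
 * dst_off_in_page + 1) = min(0x1800, 0xc00, 0x400) = 0x400 bytes; the
 * loop then steps both ends back and continues toward the front in
 * page-bounded chunks.
 */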
4391 
4392 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4393 {
4394 	struct extent_buffer *eb =
4395 			container_of(head, struct extent_buffer, rcu_head);
4396 
4397 	btrfs_release_extent_buffer(eb);
4398 }
4399 
4400 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4401 {
4402 	u64 start = page_offset(page);
4403 	struct extent_buffer *eb;
4404 	int ret = 1;
4405 
4406 	spin_lock(&tree->buffer_lock);
4407 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4408 	if (!eb) {
4409 		spin_unlock(&tree->buffer_lock);
4410 		return ret;
4411 	}
4412 
4413 	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4414 		ret = 0;
4415 		goto out;
4416 	}
4417 
4418 	/*
4419 	 * Set @eb->refs to 0 if it is currently 1 (the tree's ref is
4420 	 * the only one left) and release @eb; otherwise bail out.
4421 	 */
4422 	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
4423 		ret = 0;
4424 		goto out;
4425 	}
4426 
4427 	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4428 out:
4429 	spin_unlock(&tree->buffer_lock);
4430 
4431 	/* at this point we can safely release the extent buffer */
4432 	if (atomic_read(&eb->refs) == 0)
4433 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4434 	return ret;
4435 }
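
/*
 * Editor's note: atomic_cmpxchg(&eb->refs, 1, 0) only succeeds when
 * the radix tree holds the sole remaining reference, so a buffer any
 * reader still references keeps its pages (ret stays 0 and the VM
 * backs off).  Freeing through call_rcu() pairs with the
 * rcu_read_lock() lookup in find_extent_buffer(): concurrent lookers
 * either see the buffer with a zero refcount and fail
 * atomic_inc_not_zero(), or miss it entirely after radix_tree_delete().
 */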
4436