xref: /linux/fs/btrfs/extent_io.c (revision b7967db75a38df4891b22efe1b0969b9357eb946)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 
20 static struct kmem_cache *extent_state_cache;
21 static struct kmem_cache *extent_buffer_cache;
22 
23 static LIST_HEAD(buffers);
24 static LIST_HEAD(states);
25 
26 #define LEAK_DEBUG 0
27 #if LEAK_DEBUG
28 static DEFINE_SPINLOCK(leak_lock);
29 #endif
30 
31 #define BUFFER_LRU_MAX 64
32 
33 struct tree_entry {
34 	u64 start;
35 	u64 end;
36 	struct rb_node rb_node;
37 };
38 
39 struct extent_page_data {
40 	struct bio *bio;
41 	struct extent_io_tree *tree;
42 	get_extent_t *get_extent;
43 
44 	/* tells writepage not to lock the state bits for this range;
45 	 * it still does the unlocking
46 	 */
47 	unsigned int extent_locked:1;
48 
49 	/* tells the submit_bio code to use a WRITE_SYNC */
50 	unsigned int sync_io:1;
51 };
52 
53 int __init extent_io_init(void)
54 {
55 	extent_state_cache = kmem_cache_create("extent_state",
56 			sizeof(struct extent_state), 0,
57 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
58 	if (!extent_state_cache)
59 		return -ENOMEM;
60 
61 	extent_buffer_cache = kmem_cache_create("extent_buffers",
62 			sizeof(struct extent_buffer), 0,
63 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
64 	if (!extent_buffer_cache)
65 		goto free_state_cache;
66 	return 0;
67 
68 free_state_cache:
69 	kmem_cache_destroy(extent_state_cache);
70 	return -ENOMEM;
71 }
72 
73 void extent_io_exit(void)
74 {
75 	struct extent_state *state;
76 	struct extent_buffer *eb;
77 
78 	while (!list_empty(&states)) {
79 		state = list_entry(states.next, struct extent_state, leak_list);
80 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
81 		       "state %lu in tree %p refs %d\n",
82 		       (unsigned long long)state->start,
83 		       (unsigned long long)state->end,
84 		       state->state, state->tree, atomic_read(&state->refs));
85 		list_del(&state->leak_list);
86 		kmem_cache_free(extent_state_cache, state);
87 
88 	}
89 
90 	while (!list_empty(&buffers)) {
91 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
92 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
93 		       "refs %d\n", (unsigned long long)eb->start,
94 		       eb->len, atomic_read(&eb->refs));
95 		list_del(&eb->leak_list);
96 		kmem_cache_free(extent_buffer_cache, eb);
97 	}
98 	if (extent_state_cache)
99 		kmem_cache_destroy(extent_state_cache);
100 	if (extent_buffer_cache)
101 		kmem_cache_destroy(extent_buffer_cache);
102 }
103 
104 void extent_io_tree_init(struct extent_io_tree *tree,
105 			  struct address_space *mapping, gfp_t mask)
106 {
107 	tree->state.rb_node = NULL;
108 	tree->buffer.rb_node = NULL;
109 	tree->ops = NULL;
110 	tree->dirty_bytes = 0;
111 	spin_lock_init(&tree->lock);
112 	spin_lock_init(&tree->buffer_lock);
113 	tree->mapping = mapping;
114 }
115 
116 static struct extent_state *alloc_extent_state(gfp_t mask)
117 {
118 	struct extent_state *state;
119 #if LEAK_DEBUG
120 	unsigned long flags;
121 #endif
122 
123 	state = kmem_cache_alloc(extent_state_cache, mask);
124 	if (!state)
125 		return state;
126 	state->state = 0;
127 	state->private = 0;
128 	state->tree = NULL;
129 #if LEAK_DEBUG
130 	spin_lock_irqsave(&leak_lock, flags);
131 	list_add(&state->leak_list, &states);
132 	spin_unlock_irqrestore(&leak_lock, flags);
133 #endif
134 	atomic_set(&state->refs, 1);
135 	init_waitqueue_head(&state->wq);
136 	return state;
137 }
138 
139 static void free_extent_state(struct extent_state *state)
140 {
141 	if (!state)
142 		return;
143 	if (atomic_dec_and_test(&state->refs)) {
144 #if LEAK_DEBUG
145 		unsigned long flags;
146 #endif
147 		WARN_ON(state->tree);
148 #if LEAK_DEBUG
149 		spin_lock_irqsave(&leak_lock, flags);
150 		list_del(&state->leak_list);
151 		spin_unlock_irqrestore(&leak_lock, flags);
152 #endif
153 		kmem_cache_free(extent_state_cache, state);
154 	}
155 }
156 
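/*
 * insert 'node' into the rbtree, keyed by the [start, end] range stored in
 * the surrounding tree_entry.  If an existing entry already covers 'offset',
 * that entry's rb_node is returned and nothing is inserted; NULL means the
 * insert succeeded.
 */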
157 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
158 				   struct rb_node *node)
159 {
160 	struct rb_node **p = &root->rb_node;
161 	struct rb_node *parent = NULL;
162 	struct tree_entry *entry;
163 
164 	while (*p) {
165 		parent = *p;
166 		entry = rb_entry(parent, struct tree_entry, rb_node);
167 
168 		if (offset < entry->start)
169 			p = &(*p)->rb_left;
170 		else if (offset > entry->end)
171 			p = &(*p)->rb_right;
172 		else
173 			return parent;
174 	}
175 
176 	entry = rb_entry(node, struct tree_entry, rb_node);
177 	rb_link_node(node, parent, p);
178 	rb_insert_color(node, root);
179 	return NULL;
180 }
181 
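/*
 * search the state tree for the entry containing 'offset'.  If no entry
 * contains it, NULL is returned and the neighbours of the gap are handed
 * back instead: *prev_ret is set to the first entry ending at or after
 * 'offset' and *next_ret to the closest entry starting before it (either
 * may be NULL at the edges of the tree).
 */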
182 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
183 				     struct rb_node **prev_ret,
184 				     struct rb_node **next_ret)
185 {
186 	struct rb_root *root = &tree->state;
187 	struct rb_node *n = root->rb_node;
188 	struct rb_node *prev = NULL;
189 	struct rb_node *orig_prev = NULL;
190 	struct tree_entry *entry;
191 	struct tree_entry *prev_entry = NULL;
192 
193 	while (n) {
194 		entry = rb_entry(n, struct tree_entry, rb_node);
195 		prev = n;
196 		prev_entry = entry;
197 
198 		if (offset < entry->start)
199 			n = n->rb_left;
200 		else if (offset > entry->end)
201 			n = n->rb_right;
202 		else
203 			return n;
204 	}
205 
206 	if (prev_ret) {
207 		orig_prev = prev;
208 		while (prev && offset > prev_entry->end) {
209 			prev = rb_next(prev);
210 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211 		}
212 		*prev_ret = prev;
213 		prev = orig_prev;
214 	}
215 
216 	if (next_ret) {
217 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
218 		while (prev && offset < prev_entry->start) {
219 			prev = rb_prev(prev);
220 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		}
222 		*next_ret = prev;
223 	}
224 	return NULL;
225 }
226 
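/*
 * return the entry that contains 'offset', or, when there is none, the
 * next entry after it (the "first extent that ends after our range starts"
 * that the callers below rely on).  NULL means the tree is empty or
 * everything in it ends before 'offset'.
 */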
227 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
228 					  u64 offset)
229 {
230 	struct rb_node *prev = NULL;
231 	struct rb_node *ret;
232 
233 	ret = __etree_search(tree, offset, &prev, NULL);
234 	if (!ret)
235 		return prev;
236 	return ret;
237 }
238 
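/*
 * insert an extent_buffer into the per-tree buffer rbtree, indexed by the
 * buffer's start offset.  If a buffer is already cached at 'offset' it is
 * returned instead and nothing is inserted.
 */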
239 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
240 					  u64 offset, struct rb_node *node)
241 {
242 	struct rb_root *root = &tree->buffer;
243 	struct rb_node **p = &root->rb_node;
244 	struct rb_node *parent = NULL;
245 	struct extent_buffer *eb;
246 
247 	while (*p) {
248 		parent = *p;
249 		eb = rb_entry(parent, struct extent_buffer, rb_node);
250 
251 		if (offset < eb->start)
252 			p = &(*p)->rb_left;
253 		else if (offset > eb->start)
254 			p = &(*p)->rb_right;
255 		else
256 			return eb;
257 	}
258 
259 	rb_link_node(node, parent, p);
260 	rb_insert_color(node, root);
261 	return NULL;
262 }
263 
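/*
 * find the extent_buffer whose ->start is exactly 'offset', or NULL if
 * none is cached in the buffer rbtree.
 */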
264 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
265 					   u64 offset)
266 {
267 	struct rb_root *root = &tree->buffer;
268 	struct rb_node *n = root->rb_node;
269 	struct extent_buffer *eb;
270 
271 	while (n) {
272 		eb = rb_entry(n, struct extent_buffer, rb_node);
273 		if (offset < eb->start)
274 			n = n->rb_left;
275 		else if (offset > eb->start)
276 			n = n->rb_right;
277 		else
278 			return eb;
279 	}
280 	return NULL;
281 }
282 
283 /*
284  * utility function to look for merge candidates inside a given range.
285  * Any extents with matching state are merged together into a single
286  * extent in the tree.  Extents with EXTENT_IOBITS in their state field
287  * are not merged because the end_io handlers need to be able to do
288  * operations on them without sleeping (or doing allocations/splits).
289  *
290  * This should be called with the tree lock held.
291  */
292 static int merge_state(struct extent_io_tree *tree,
293 		       struct extent_state *state)
294 {
295 	struct extent_state *other;
296 	struct rb_node *other_node;
297 
298 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
299 		return 0;
300 
301 	other_node = rb_prev(&state->rb_node);
302 	if (other_node) {
303 		other = rb_entry(other_node, struct extent_state, rb_node);
304 		if (other->end == state->start - 1 &&
305 		    other->state == state->state) {
306 			state->start = other->start;
307 			other->tree = NULL;
308 			rb_erase(&other->rb_node, &tree->state);
309 			free_extent_state(other);
310 		}
311 	}
312 	other_node = rb_next(&state->rb_node);
313 	if (other_node) {
314 		other = rb_entry(other_node, struct extent_state, rb_node);
315 		if (other->start == state->end + 1 &&
316 		    other->state == state->state) {
317 			other->start = state->start;
318 			state->tree = NULL;
319 			rb_erase(&state->rb_node, &tree->state);
320 			free_extent_state(state);
321 		}
322 	}
323 	return 0;
324 }
325 
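/*
 * set_state_cb/clear_state_cb give the tree owner a chance to react before
 * bits are changed on a state struct, via the optional set_bit_hook and
 * clear_bit_hook operations (btrfs uses these hooks for things like
 * delalloc accounting).
 */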
326 static void set_state_cb(struct extent_io_tree *tree,
327 			 struct extent_state *state,
328 			 unsigned long bits)
329 {
330 	if (tree->ops && tree->ops->set_bit_hook) {
331 		tree->ops->set_bit_hook(tree->mapping->host, state->start,
332 					state->end, state->state, bits);
333 	}
334 }
335 
336 static void clear_state_cb(struct extent_io_tree *tree,
337 			   struct extent_state *state,
338 			   unsigned long bits)
339 {
340 	if (tree->ops && tree->ops->clear_bit_hook) {
341 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
342 					  state->end, state->state, bits);
343 	}
344 }
345 
346 /*
347  * insert an extent_state struct into the tree.  'bits' are set on the
348  * struct before it is inserted.
349  *
350  * This may return -EEXIST if the extent is already there, in which case the
351  * state struct is freed.
352  *
353  * The tree lock is not taken internally.  This is a utility function and
354  * probably isn't what you want to call (see set/clear_extent_bit).
355  */
356 static int insert_state(struct extent_io_tree *tree,
357 			struct extent_state *state, u64 start, u64 end,
358 			int bits)
359 {
360 	struct rb_node *node;
361 
362 	if (end < start) {
363 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
364 		       (unsigned long long)end,
365 		       (unsigned long long)start);
366 		WARN_ON(1);
367 	}
368 	if (bits & EXTENT_DIRTY)
369 		tree->dirty_bytes += end - start + 1;
370 	set_state_cb(tree, state, bits);
371 	state->state |= bits;
372 	state->start = start;
373 	state->end = end;
374 	node = tree_insert(&tree->state, end, &state->rb_node);
375 	if (node) {
376 		struct extent_state *found;
377 		found = rb_entry(node, struct extent_state, rb_node);
378 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
379 		       "%llu %llu\n", (unsigned long long)found->start,
380 		       (unsigned long long)found->end,
381 		       (unsigned long long)start, (unsigned long long)end);
382 		free_extent_state(state);
383 		return -EEXIST;
384 	}
385 	state->tree = tree;
386 	merge_state(tree, state);
387 	return 0;
388 }
389 
390 /*
391  * split a given extent state struct in two, inserting the preallocated
392  * struct 'prealloc' as the newly created second half.  'split' indicates an
393  * offset inside 'orig' where it should be split.
394  *
395  * Before calling,
396  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
397  * are two extent state structs in the tree:
398  * prealloc: [orig->start, split - 1]
399  * orig: [ split, orig->end ]
400  *
401  * The tree locks are not taken by this function. They need to be held
402  * by the caller.
403  */
404 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 		       struct extent_state *prealloc, u64 split)
406 {
407 	struct rb_node *node;
408 	prealloc->start = orig->start;
409 	prealloc->end = split - 1;
410 	prealloc->state = orig->state;
411 	orig->start = split;
412 
413 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
414 	if (node) {
415 		free_extent_state(prealloc);
416 		return -EEXIST;
417 	}
418 	prealloc->tree = tree;
419 	return 0;
420 }
421 
422 /*
423  * utility function to clear some bits in an extent state struct.
424  * it will optionally wake up any one waiting on this state (wake == 1), or
425  * forcibly remove the state from the tree (delete == 1).
426  *
427  * If no bits are set on the state struct after clearing things, the
428  * struct is freed and removed from the tree
429  */
430 static int clear_state_bit(struct extent_io_tree *tree,
431 			    struct extent_state *state, int bits, int wake,
432 			    int delete)
433 {
434 	int ret = state->state & bits;
435 
436 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437 		u64 range = state->end - state->start + 1;
438 		WARN_ON(range > tree->dirty_bytes);
439 		tree->dirty_bytes -= range;
440 	}
441 	clear_state_cb(tree, state, bits);
442 	state->state &= ~bits;
443 	if (wake)
444 		wake_up(&state->wq);
445 	if (delete || state->state == 0) {
446 		if (state->tree) {
447 			clear_state_cb(tree, state, state->state);
448 			rb_erase(&state->rb_node, &tree->state);
449 			state->tree = NULL;
450 			free_extent_state(state);
451 		} else {
452 			WARN_ON(1);
453 		}
454 	} else {
455 		merge_state(tree, state);
456 	}
457 	return ret;
458 }
459 
460 /*
461  * clear some bits on a range in the tree.  This may require splitting
462  * or inserting elements in the tree, so the gfp mask is used to
463  * indicate which allocations or sleeping are allowed.
464  *
465  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
466  * the given range from the tree regardless of state (ie for truncate).
467  *
468  * the range [start, end] is inclusive.
469  *
470  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
471  * bits were already set, or zero if none of the bits were already set.
472  */
473 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
474 		     int bits, int wake, int delete, gfp_t mask)
475 {
476 	struct extent_state *state;
477 	struct extent_state *prealloc = NULL;
478 	struct rb_node *node;
479 	int err;
480 	int set = 0;
481 
482 again:
483 	if (!prealloc && (mask & __GFP_WAIT)) {
484 		prealloc = alloc_extent_state(mask);
485 		if (!prealloc)
486 			return -ENOMEM;
487 	}
488 
489 	spin_lock(&tree->lock);
490 	/*
491 	 * this search will find the extents that end after
492 	 * our range starts
493 	 */
494 	node = tree_search(tree, start);
495 	if (!node)
496 		goto out;
497 	state = rb_entry(node, struct extent_state, rb_node);
498 	if (state->start > end)
499 		goto out;
500 	WARN_ON(state->end < start);
501 
502 	/*
503 	 *     | ---- desired range ---- |
504 	 *  | state | or
505 	 *  | ------------- state -------------- |
506 	 *
507 	 * We need to split the extent we found, and may flip
508 	 * bits on second half.
509 	 *
510 	 * If the extent we found extends past our range, we
511 	 * just split and search again.  It'll get split again
512 	 * the next time though.
513 	 *
514 	 * If the extent we found is inside our range, we clear
515 	 * the desired bit on it.
516 	 */
517 
518 	if (state->start < start) {
519 		if (!prealloc)
520 			prealloc = alloc_extent_state(GFP_ATOMIC);
521 		err = split_state(tree, state, prealloc, start);
522 		BUG_ON(err == -EEXIST);
523 		prealloc = NULL;
524 		if (err)
525 			goto out;
526 		if (state->end <= end) {
527 			start = state->end + 1;
528 			set |= clear_state_bit(tree, state, bits,
529 					wake, delete);
530 		} else {
531 			start = state->start;
532 		}
533 		goto search_again;
534 	}
535 	/*
536 	 * | ---- desired range ---- |
537 	 *                        | state |
538 	 * We need to split the extent, and clear the bit
539 	 * on the first half
540 	 */
541 	if (state->start <= end && state->end > end) {
542 		if (!prealloc)
543 			prealloc = alloc_extent_state(GFP_ATOMIC);
544 		err = split_state(tree, state, prealloc, end + 1);
545 		BUG_ON(err == -EEXIST);
546 
547 		if (wake)
548 			wake_up(&state->wq);
549 		set |= clear_state_bit(tree, prealloc, bits,
550 				       wake, delete);
551 		prealloc = NULL;
552 		goto out;
553 	}
554 
555 	start = state->end + 1;
556 	set |= clear_state_bit(tree, state, bits, wake, delete);
557 	goto search_again;
558 
559 out:
560 	spin_unlock(&tree->lock);
561 	if (prealloc)
562 		free_extent_state(prealloc);
563 
564 	return set;
565 
566 search_again:
567 	if (start > end)
568 		goto out;
569 	spin_unlock(&tree->lock);
570 	if (mask & __GFP_WAIT)
571 		cond_resched();
572 	goto again;
573 }
574 
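/*
 * sleep until someone wakes state->wq (the clear side does this via
 * wake_up on the same queue).  The tree lock is dropped while we sleep
 * and re-taken before returning, as the __releases/__acquires
 * annotations indicate.
 */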
575 static int wait_on_state(struct extent_io_tree *tree,
576 			 struct extent_state *state)
577 		__releases(tree->lock)
578 		__acquires(tree->lock)
579 {
580 	DEFINE_WAIT(wait);
581 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
582 	spin_unlock(&tree->lock);
583 	schedule();
584 	spin_lock(&tree->lock);
585 	finish_wait(&state->wq, &wait);
586 	return 0;
587 }
588 
589 /*
590  * waits for one or more bits to clear on a range in the state tree.
591  * The range [start, end] is inclusive.
592  * The tree lock is taken by this function
593  */
594 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
595 {
596 	struct extent_state *state;
597 	struct rb_node *node;
598 
599 	spin_lock(&tree->lock);
600 again:
601 	while (1) {
602 		/*
603 		 * this search will find all the extents that end after
604 		 * our range starts
605 		 */
606 		node = tree_search(tree, start);
607 		if (!node)
608 			break;
609 
610 		state = rb_entry(node, struct extent_state, rb_node);
611 
612 		if (state->start > end)
613 			goto out;
614 
615 		if (state->state & bits) {
616 			start = state->start;
617 			atomic_inc(&state->refs);
618 			wait_on_state(tree, state);
619 			free_extent_state(state);
620 			goto again;
621 		}
622 		start = state->end + 1;
623 
624 		if (start > end)
625 			break;
626 
627 		if (need_resched()) {
628 			spin_unlock(&tree->lock);
629 			cond_resched();
630 			spin_lock(&tree->lock);
631 		}
632 	}
633 out:
634 	spin_unlock(&tree->lock);
635 	return 0;
636 }
637 
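/*
 * set bits on a single state struct and keep the tree's dirty_bytes
 * accounting in sync; the counterpart of clear_state_bit above.
 */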
638 static void set_state_bits(struct extent_io_tree *tree,
639 			   struct extent_state *state,
640 			   int bits)
641 {
642 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
643 		u64 range = state->end - state->start + 1;
644 		tree->dirty_bytes += range;
645 	}
646 	set_state_cb(tree, state, bits);
647 	state->state |= bits;
648 }
649 
650 /*
651  * set some bits on a range in the tree.  This may require allocations
652  * or sleeping, so the gfp mask is used to indicate what is allowed.
653  *
654  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
655  * range already has the desired bits set.  The start of the existing
656  * range is returned in failed_start in this case.
657  *
658  * [start, end] is inclusive
659  * This takes the tree lock.
660  */
661 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
662 			  int bits, int exclusive, u64 *failed_start,
663 			  gfp_t mask)
664 {
665 	struct extent_state *state;
666 	struct extent_state *prealloc = NULL;
667 	struct rb_node *node;
668 	int err = 0;
669 	int set;
670 	u64 last_start;
671 	u64 last_end;
672 again:
673 	if (!prealloc && (mask & __GFP_WAIT)) {
674 		prealloc = alloc_extent_state(mask);
675 		if (!prealloc)
676 			return -ENOMEM;
677 	}
678 
679 	spin_lock(&tree->lock);
680 	/*
681 	 * this search will find all the extents that end after
682 	 * our range starts.
683 	 */
684 	node = tree_search(tree, start);
685 	if (!node) {
686 		err = insert_state(tree, prealloc, start, end, bits);
687 		prealloc = NULL;
688 		BUG_ON(err == -EEXIST);
689 		goto out;
690 	}
691 
692 	state = rb_entry(node, struct extent_state, rb_node);
693 	last_start = state->start;
694 	last_end = state->end;
695 
696 	/*
697 	 * | ---- desired range ---- |
698 	 * | state |
699 	 *
700 	 * Just lock what we found and keep going
701 	 */
702 	if (state->start == start && state->end <= end) {
703 		set = state->state & bits;
704 		if (set && exclusive) {
705 			*failed_start = state->start;
706 			err = -EEXIST;
707 			goto out;
708 		}
709 		set_state_bits(tree, state, bits);
710 		start = state->end + 1;
711 		merge_state(tree, state);
712 		goto search_again;
713 	}
714 
715 	/*
716 	 *     | ---- desired range ---- |
717 	 * | state |
718 	 *   or
719 	 * | ------------- state -------------- |
720 	 *
721 	 * We need to split the extent we found, and may flip bits on
722 	 * second half.
723 	 *
724 	 * If the extent we found extends past our
725 	 * range, we just split and search again.  It'll get split
726 	 * again the next time though.
727 	 *
728 	 * If the extent we found is inside our range, we set the
729 	 * desired bit on it.
730 	 */
731 	if (state->start < start) {
732 		set = state->state & bits;
733 		if (exclusive && set) {
734 			*failed_start = start;
735 			err = -EEXIST;
736 			goto out;
737 		}
738 		err = split_state(tree, state, prealloc, start);
739 		BUG_ON(err == -EEXIST);
740 		prealloc = NULL;
741 		if (err)
742 			goto out;
743 		if (state->end <= end) {
744 			set_state_bits(tree, state, bits);
745 			start = state->end + 1;
746 			merge_state(tree, state);
747 		} else {
748 			start = state->start;
749 		}
750 		goto search_again;
751 	}
752 	/*
753 	 * | ---- desired range ---- |
754 	 *     | state | or               | state |
755 	 *
756 	 * There's a hole, we need to insert something in it and
757 	 * ignore the extent we found.
758 	 */
759 	if (state->start > start) {
760 		u64 this_end;
761 		if (end < last_start)
762 			this_end = end;
763 		else
764 			this_end = last_start - 1;
765 		err = insert_state(tree, prealloc, start, this_end,
766 				   bits);
767 		prealloc = NULL;
768 		BUG_ON(err == -EEXIST);
769 		if (err)
770 			goto out;
771 		start = this_end + 1;
772 		goto search_again;
773 	}
774 	/*
775 	 * | ---- desired range ---- |
776 	 *                        | state |
777 	 * We need to split the extent, and set the bit
778 	 * on the first half
779 	 */
780 	if (state->start <= end && state->end > end) {
781 		set = state->state & bits;
782 		if (exclusive && set) {
783 			*failed_start = start;
784 			err = -EEXIST;
785 			goto out;
786 		}
787 		err = split_state(tree, state, prealloc, end + 1);
788 		BUG_ON(err == -EEXIST);
789 
790 		set_state_bits(tree, prealloc, bits);
791 		merge_state(tree, prealloc);
792 		prealloc = NULL;
793 		goto out;
794 	}
795 
796 	goto search_again;
797 
798 out:
799 	spin_unlock(&tree->lock);
800 	if (prealloc)
801 		free_extent_state(prealloc);
802 
803 	return err;
804 
805 search_again:
806 	if (start > end)
807 		goto out;
808 	spin_unlock(&tree->lock);
809 	if (mask & __GFP_WAIT)
810 		cond_resched();
811 	goto again;
812 }
813 
814 /* wrappers around set/clear extent bit */
815 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
816 		     gfp_t mask)
817 {
818 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
819 			      mask);
820 }
821 
822 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
823 		       gfp_t mask)
824 {
825 	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
826 }
827 
828 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
829 		    int bits, gfp_t mask)
830 {
831 	return set_extent_bit(tree, start, end, bits, 0, NULL,
832 			      mask);
833 }
834 
835 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
836 		      int bits, gfp_t mask)
837 {
838 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
839 }
840 
841 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
842 		     gfp_t mask)
843 {
844 	return set_extent_bit(tree, start, end,
845 			      EXTENT_DELALLOC | EXTENT_DIRTY,
846 			      0, NULL, mask);
847 }
848 
849 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
850 		       gfp_t mask)
851 {
852 	return clear_extent_bit(tree, start, end,
853 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
854 }
855 
856 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
857 			 gfp_t mask)
858 {
859 	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
860 }
861 
862 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
863 		     gfp_t mask)
864 {
865 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
866 			      mask);
867 }
868 
869 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
870 		       gfp_t mask)
871 {
872 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
873 }
874 
875 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
876 			gfp_t mask)
877 {
878 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
879 			      mask);
880 }
881 
882 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
883 				 u64 end, gfp_t mask)
884 {
885 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
886 }
887 
888 static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
889 			 gfp_t mask)
890 {
891 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
892 			      0, NULL, mask);
893 }
894 
895 static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
896 				  u64 end, gfp_t mask)
897 {
898 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
899 }
900 
901 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
902 {
903 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
904 }
905 
906 /*
907  * either insert or lock state struct between start and end.  Use mask to
908  * tell us if waiting is desired.
909  */
910 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
911 {
912 	int err;
913 	u64 failed_start;
914 	while (1) {
915 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
916 				     &failed_start, mask);
917 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
918 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
919 			start = failed_start;
920 		} else {
921 			break;
922 		}
923 		WARN_ON(start > end);
924 	}
925 	return err;
926 }
927 
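/*
 * non-blocking version of lock_extent: returns 1 if the whole range was
 * locked.  If part of the range was already locked, whatever we managed
 * to lock is cleared again and 0 is returned instead of waiting.
 */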
928 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
929 		    gfp_t mask)
930 {
931 	int err;
932 	u64 failed_start;
933 
934 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
935 			     &failed_start, mask);
936 	if (err == -EEXIST) {
937 		if (failed_start > start)
938 			clear_extent_bit(tree, start, failed_start - 1,
939 					 EXTENT_LOCKED, 1, 0, mask);
940 		return 0;
941 	}
942 	return 1;
943 }
944 
945 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
946 		  gfp_t mask)
947 {
948 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
949 }
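
/*
 * illustrative call pattern (a sketch of how the readpage and delalloc
 * code later in this file uses the helpers above, not a required API):
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... do IO or flip bits on [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */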
950 
951 /*
952  * helper function to set pages and extents in the tree dirty
953  */
954 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
955 {
956 	unsigned long index = start >> PAGE_CACHE_SHIFT;
957 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
958 	struct page *page;
959 
960 	while (index <= end_index) {
961 		page = find_get_page(tree->mapping, index);
962 		BUG_ON(!page);
963 		__set_page_dirty_nobuffers(page);
964 		page_cache_release(page);
965 		index++;
966 	}
967 	set_extent_dirty(tree, start, end, GFP_NOFS);
968 	return 0;
969 }
970 
971 /*
972  * helper function to set both pages and extents in the tree writeback
973  */
974 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
975 {
976 	unsigned long index = start >> PAGE_CACHE_SHIFT;
977 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
978 	struct page *page;
979 
980 	while (index <= end_index) {
981 		page = find_get_page(tree->mapping, index);
982 		BUG_ON(!page);
983 		set_page_writeback(page);
984 		page_cache_release(page);
985 		index++;
986 	}
987 	set_extent_writeback(tree, start, end, GFP_NOFS);
988 	return 0;
989 }
990 
991 /*
992  * find the first offset in the io tree with 'bits' set. zero is
993  * returned if we find something, and *start_ret and *end_ret are
994  * set to reflect the state struct that was found.
995  *
996  * If nothing was found, 1 is returned, < 0 on error
997  */
998 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
999 			  u64 *start_ret, u64 *end_ret, int bits)
1000 {
1001 	struct rb_node *node;
1002 	struct extent_state *state;
1003 	int ret = 1;
1004 
1005 	spin_lock(&tree->lock);
1006 	/*
1007 	 * this search will find all the extents that end after
1008 	 * our range starts.
1009 	 */
1010 	node = tree_search(tree, start);
1011 	if (!node)
1012 		goto out;
1013 
1014 	while (1) {
1015 		state = rb_entry(node, struct extent_state, rb_node);
1016 		if (state->end >= start && (state->state & bits)) {
1017 			*start_ret = state->start;
1018 			*end_ret = state->end;
1019 			ret = 0;
1020 			break;
1021 		}
1022 		node = rb_next(node);
1023 		if (!node)
1024 			break;
1025 	}
1026 out:
1027 	spin_unlock(&tree->lock);
1028 	return ret;
1029 }
1030 
1031 /* find the first state struct with 'bits' set after 'start', and
1032  * return it.  tree->lock must be held.  NULL will be returned if
1033  * nothing was found after 'start'
1034  */
1035 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1036 						 u64 start, int bits)
1037 {
1038 	struct rb_node *node;
1039 	struct extent_state *state;
1040 
1041 	/*
1042 	 * this search will find all the extents that end after
1043 	 * our range starts.
1044 	 */
1045 	node = tree_search(tree, start);
1046 	if (!node)
1047 		goto out;
1048 
1049 	while (1) {
1050 		state = rb_entry(node, struct extent_state, rb_node);
1051 		if (state->end >= start && (state->state & bits))
1052 			return state;
1053 
1054 		node = rb_next(node);
1055 		if (!node)
1056 			break;
1057 	}
1058 out:
1059 	return NULL;
1060 }
1061 
1062 /*
1063  * find a contiguous range of bytes in the file marked as delalloc, not
1064  * more than 'max_bytes'.  start and end are used to return the range.
1065  *
1066  * 1 is returned if we find something, 0 if nothing was in the tree
1067  */
1068 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1069 					u64 *start, u64 *end, u64 max_bytes)
1070 {
1071 	struct rb_node *node;
1072 	struct extent_state *state;
1073 	u64 cur_start = *start;
1074 	u64 found = 0;
1075 	u64 total_bytes = 0;
1076 
1077 	spin_lock(&tree->lock);
1078 
1079 	/*
1080 	 * this search will find all the extents that end after
1081 	 * our range starts.
1082 	 */
1083 	node = tree_search(tree, cur_start);
1084 	if (!node) {
1085 		if (!found)
1086 			*end = (u64)-1;
1087 		goto out;
1088 	}
1089 
1090 	while (1) {
1091 		state = rb_entry(node, struct extent_state, rb_node);
1092 		if (found && (state->start != cur_start ||
1093 			      (state->state & EXTENT_BOUNDARY))) {
1094 			goto out;
1095 		}
1096 		if (!(state->state & EXTENT_DELALLOC)) {
1097 			if (!found)
1098 				*end = state->end;
1099 			goto out;
1100 		}
1101 		if (!found)
1102 			*start = state->start;
1103 		found++;
1104 		*end = state->end;
1105 		cur_start = state->end + 1;
1106 		node = rb_next(node);
1107 		if (!node)
1108 			break;
1109 		total_bytes += state->end - state->start + 1;
1110 		if (total_bytes >= max_bytes)
1111 			break;
1112 	}
1113 out:
1114 	spin_unlock(&tree->lock);
1115 	return found;
1116 }
1117 
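/*
 * unlock every page in [start, end] except 'locked_page', which the
 * caller still needs.  Pages are looked up and released in batches via
 * find_get_pages_contig.
 */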
1118 static noinline int __unlock_for_delalloc(struct inode *inode,
1119 					  struct page *locked_page,
1120 					  u64 start, u64 end)
1121 {
1122 	int ret;
1123 	struct page *pages[16];
1124 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1125 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1126 	unsigned long nr_pages = end_index - index + 1;
1127 	int i;
1128 
1129 	if (index == locked_page->index && end_index == index)
1130 		return 0;
1131 
1132 	while (nr_pages > 0) {
1133 		ret = find_get_pages_contig(inode->i_mapping, index,
1134 				     min_t(unsigned long, nr_pages,
1135 				     ARRAY_SIZE(pages)), pages);
1136 		for (i = 0; i < ret; i++) {
1137 			if (pages[i] != locked_page)
1138 				unlock_page(pages[i]);
1139 			page_cache_release(pages[i]);
1140 		}
1141 		nr_pages -= ret;
1142 		index += ret;
1143 		cond_resched();
1144 	}
1145 	return 0;
1146 }
1147 
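/*
 * lock all the pages covering [delalloc_start, delalloc_end], skipping
 * 'locked_page' which the caller already holds.  Returns -EAGAIN (after
 * unlocking anything we grabbed) if a page has gone away or is no longer
 * dirty, which tells the caller to retry with a smaller range.
 */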
1148 static noinline int lock_delalloc_pages(struct inode *inode,
1149 					struct page *locked_page,
1150 					u64 delalloc_start,
1151 					u64 delalloc_end)
1152 {
1153 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1154 	unsigned long start_index = index;
1155 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1156 	unsigned long pages_locked = 0;
1157 	struct page *pages[16];
1158 	unsigned long nrpages;
1159 	int ret;
1160 	int i;
1161 
1162 	/* the caller is responsible for locking the start index */
1163 	if (index == locked_page->index && index == end_index)
1164 		return 0;
1165 
1166 	/* skip the page at the start index */
1167 	nrpages = end_index - index + 1;
1168 	while (nrpages > 0) {
1169 		ret = find_get_pages_contig(inode->i_mapping, index,
1170 				     min_t(unsigned long,
1171 				     nrpages, ARRAY_SIZE(pages)), pages);
1172 		if (ret == 0) {
1173 			ret = -EAGAIN;
1174 			goto done;
1175 		}
1176 		/* now we have an array of pages, lock them all */
1177 		for (i = 0; i < ret; i++) {
1178 			/*
1179 			 * the caller is taking responsibility for
1180 			 * locked_page
1181 			 */
1182 			if (pages[i] != locked_page) {
1183 				lock_page(pages[i]);
1184 				if (!PageDirty(pages[i]) ||
1185 				    pages[i]->mapping != inode->i_mapping) {
1186 					ret = -EAGAIN;
1187 					unlock_page(pages[i]);
1188 					page_cache_release(pages[i]);
1189 					goto done;
1190 				}
1191 			}
1192 			page_cache_release(pages[i]);
1193 			pages_locked++;
1194 		}
1195 		nrpages -= ret;
1196 		index += ret;
1197 		cond_resched();
1198 	}
1199 	ret = 0;
1200 done:
1201 	if (ret && pages_locked) {
1202 		__unlock_for_delalloc(inode, locked_page,
1203 			      delalloc_start,
1204 			      ((u64)(start_index + pages_locked - 1)) <<
1205 			      PAGE_CACHE_SHIFT);
1206 	}
1207 	return ret;
1208 }
1209 
1210 /*
1211  * find a contiguous range of bytes in the file marked as delalloc, not
1212  * more than 'max_bytes'.  start and end are used to return the range.
1213  *
1214  * 1 is returned if we find something, 0 if nothing was in the tree
1215  */
1216 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1217 					     struct extent_io_tree *tree,
1218 					     struct page *locked_page,
1219 					     u64 *start, u64 *end,
1220 					     u64 max_bytes)
1221 {
1222 	u64 delalloc_start;
1223 	u64 delalloc_end;
1224 	u64 found;
1225 	int ret;
1226 	int loops = 0;
1227 
1228 again:
1229 	/* step one, find a bunch of delalloc bytes starting at start */
1230 	delalloc_start = *start;
1231 	delalloc_end = 0;
1232 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1233 				    max_bytes);
1234 	if (!found || delalloc_end <= *start) {
1235 		*start = delalloc_start;
1236 		*end = delalloc_end;
1237 		return found;
1238 	}
1239 
1240 	/*
1241 	 * start comes from the offset of locked_page.  We have to lock
1242 	 * pages in order, so we can't process delalloc bytes before
1243 	 * locked_page
1244 	 */
1245 	if (delalloc_start < *start)
1246 		delalloc_start = *start;
1247 
1248 	/*
1249 	 * make sure to limit the number of pages we try to lock down
1250 	 * if we're looping.
1251 	 */
1252 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1253 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1254 
1255 	/* step two, lock all the pages after the page that has start */
1256 	ret = lock_delalloc_pages(inode, locked_page,
1257 				  delalloc_start, delalloc_end);
1258 	if (ret == -EAGAIN) {
1259 		/* some of the pages are gone, let's avoid looping by
1260 		 * shortening the size of the delalloc range we're searching
1261 		 */
1262 		if (!loops) {
1263 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1264 			max_bytes = PAGE_CACHE_SIZE - offset;
1265 			loops = 1;
1266 			goto again;
1267 		} else {
1268 			found = 0;
1269 			goto out_failed;
1270 		}
1271 	}
1272 	BUG_ON(ret);
1273 
1274 	/* step three, lock the state bits for the whole range */
1275 	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1276 
1277 	/* then test to make sure it is all still delalloc */
1278 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1279 			     EXTENT_DELALLOC, 1);
1280 	if (!ret) {
1281 		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282 		__unlock_for_delalloc(inode, locked_page,
1283 			      delalloc_start, delalloc_end);
1284 		cond_resched();
1285 		goto again;
1286 	}
1287 	*start = delalloc_start;
1288 	*end = delalloc_end;
1289 out_failed:
1290 	return found;
1291 }
1292 
1293 int extent_clear_unlock_delalloc(struct inode *inode,
1294 				struct extent_io_tree *tree,
1295 				u64 start, u64 end, struct page *locked_page,
1296 				int unlock_pages,
1297 				int clear_unlock,
1298 				int clear_delalloc, int clear_dirty,
1299 				int set_writeback,
1300 				int end_writeback)
1301 {
1302 	int ret;
1303 	struct page *pages[16];
1304 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1305 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1306 	unsigned long nr_pages = end_index - index + 1;
1307 	int i;
1308 	int clear_bits = 0;
1309 
1310 	if (clear_unlock)
1311 		clear_bits |= EXTENT_LOCKED;
1312 	if (clear_dirty)
1313 		clear_bits |= EXTENT_DIRTY;
1314 
1315 	if (clear_delalloc)
1316 		clear_bits |= EXTENT_DELALLOC;
1317 
1318 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1319 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1320 		return 0;
1321 
1322 	while (nr_pages > 0) {
1323 		ret = find_get_pages_contig(inode->i_mapping, index,
1324 				     min_t(unsigned long,
1325 				     nr_pages, ARRAY_SIZE(pages)), pages);
1326 		for (i = 0; i < ret; i++) {
1327 			if (pages[i] == locked_page) {
1328 				page_cache_release(pages[i]);
1329 				continue;
1330 			}
1331 			if (clear_dirty)
1332 				clear_page_dirty_for_io(pages[i]);
1333 			if (set_writeback)
1334 				set_page_writeback(pages[i]);
1335 			if (end_writeback)
1336 				end_page_writeback(pages[i]);
1337 			if (unlock_pages)
1338 				unlock_page(pages[i]);
1339 			page_cache_release(pages[i]);
1340 		}
1341 		nr_pages -= ret;
1342 		index += ret;
1343 		cond_resched();
1344 	}
1345 	return 0;
1346 }
1347 
1348 /*
1349  * count the number of bytes in the tree that have a given bit(s)
1350  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1351  * cached.  The total number found is returned.
1352  */
1353 u64 count_range_bits(struct extent_io_tree *tree,
1354 		     u64 *start, u64 search_end, u64 max_bytes,
1355 		     unsigned long bits)
1356 {
1357 	struct rb_node *node;
1358 	struct extent_state *state;
1359 	u64 cur_start = *start;
1360 	u64 total_bytes = 0;
1361 	int found = 0;
1362 
1363 	if (search_end <= cur_start) {
1364 		WARN_ON(1);
1365 		return 0;
1366 	}
1367 
1368 	spin_lock(&tree->lock);
1369 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1370 		total_bytes = tree->dirty_bytes;
1371 		goto out;
1372 	}
1373 	/*
1374 	 * this search will find all the extents that end after
1375 	 * our range starts.
1376 	 */
1377 	node = tree_search(tree, cur_start);
1378 	if (!node)
1379 		goto out;
1380 
1381 	while (1) {
1382 		state = rb_entry(node, struct extent_state, rb_node);
1383 		if (state->start > search_end)
1384 			break;
1385 		if (state->end >= cur_start && (state->state & bits)) {
1386 			total_bytes += min(search_end, state->end) + 1 -
1387 				       max(cur_start, state->start);
1388 			if (total_bytes >= max_bytes)
1389 				break;
1390 			if (!found) {
1391 				*start = state->start;
1392 				found = 1;
1393 			}
1394 		}
1395 		node = rb_next(node);
1396 		if (!node)
1397 			break;
1398 	}
1399 out:
1400 	spin_unlock(&tree->lock);
1401 	return total_bytes;
1402 }
1403 
1404 /*
1405  * set the private field for a given byte offset in the tree.  If there isn't
1406  * an extent_state there already, this does nothing.
1407  */
1408 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1409 {
1410 	struct rb_node *node;
1411 	struct extent_state *state;
1412 	int ret = 0;
1413 
1414 	spin_lock(&tree->lock);
1415 	/*
1416 	 * this search will find all the extents that end after
1417 	 * our range starts.
1418 	 */
1419 	node = tree_search(tree, start);
1420 	if (!node) {
1421 		ret = -ENOENT;
1422 		goto out;
1423 	}
1424 	state = rb_entry(node, struct extent_state, rb_node);
1425 	if (state->start != start) {
1426 		ret = -ENOENT;
1427 		goto out;
1428 	}
1429 	state->private = private;
1430 out:
1431 	spin_unlock(&tree->lock);
1432 	return ret;
1433 }
1434 
1435 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1436 {
1437 	struct rb_node *node;
1438 	struct extent_state *state;
1439 	int ret = 0;
1440 
1441 	spin_lock(&tree->lock);
1442 	/*
1443 	 * this search will find all the extents that end after
1444 	 * our range starts.
1445 	 */
1446 	node = tree_search(tree, start);
1447 	if (!node) {
1448 		ret = -ENOENT;
1449 		goto out;
1450 	}
1451 	state = rb_entry(node, struct extent_state, rb_node);
1452 	if (state->start != start) {
1453 		ret = -ENOENT;
1454 		goto out;
1455 	}
1456 	*private = state->private;
1457 out:
1458 	spin_unlock(&tree->lock);
1459 	return ret;
1460 }
1461 
1462 /*
1463  * searches a range in the state tree for a given mask.
1464  * If 'filled' == 1, this returns 1 only if every extent in the range
1465  * has the bits set.  Otherwise, 1 is returned if any bit in the
1466  * range is found set.
1467  */
1468 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1469 		   int bits, int filled)
1470 {
1471 	struct extent_state *state = NULL;
1472 	struct rb_node *node;
1473 	int bitset = 0;
1474 
1475 	spin_lock(&tree->lock);
1476 	node = tree_search(tree, start);
1477 	while (node && start <= end) {
1478 		state = rb_entry(node, struct extent_state, rb_node);
1479 
1480 		if (filled && state->start > start) {
1481 			bitset = 0;
1482 			break;
1483 		}
1484 
1485 		if (state->start > end)
1486 			break;
1487 
1488 		if (state->state & bits) {
1489 			bitset = 1;
1490 			if (!filled)
1491 				break;
1492 		} else if (filled) {
1493 			bitset = 0;
1494 			break;
1495 		}
1496 		start = state->end + 1;
1497 		if (start > end)
1498 			break;
1499 		node = rb_next(node);
1500 		if (!node) {
1501 			if (filled)
1502 				bitset = 0;
1503 			break;
1504 		}
1505 	}
1506 	spin_unlock(&tree->lock);
1507 	return bitset;
1508 }
1509 
1510 /*
1511  * helper function to set a given page up to date if all the
1512  * extents in the tree for that page are up to date
1513  */
1514 static int check_page_uptodate(struct extent_io_tree *tree,
1515 			       struct page *page)
1516 {
1517 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1518 	u64 end = start + PAGE_CACHE_SIZE - 1;
1519 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1520 		SetPageUptodate(page);
1521 	return 0;
1522 }
1523 
1524 /*
1525  * helper function to unlock a page if all the extents in the tree
1526  * for that page are unlocked
1527  */
1528 static int check_page_locked(struct extent_io_tree *tree,
1529 			     struct page *page)
1530 {
1531 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1532 	u64 end = start + PAGE_CACHE_SIZE - 1;
1533 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1534 		unlock_page(page);
1535 	return 0;
1536 }
1537 
1538 /*
1539  * helper function to end page writeback if all the extents
1540  * in the tree for that page are done with writeback
1541  */
1542 static int check_page_writeback(struct extent_io_tree *tree,
1543 			     struct page *page)
1544 {
1545 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1546 	u64 end = start + PAGE_CACHE_SIZE - 1;
1547 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1548 		end_page_writeback(page);
1549 	return 0;
1550 }
1551 
1552 /* lots and lots of room for performance fixes in the end_bio funcs */
1553 
1554 /*
1555  * after a writepage IO is done, we need to:
1556  * clear the uptodate bits on error
1557  * clear the writeback bits in the extent tree for this IO
1558  * end_page_writeback if the page has no more pending IO
1559  *
1560  * Scheduling is not allowed, so the extent state tree is expected
1561  * to have one and only one object corresponding to this IO.
1562  */
1563 static void end_bio_extent_writepage(struct bio *bio, int err)
1564 {
1565 	int uptodate = err == 0;
1566 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1567 	struct extent_io_tree *tree;
1568 	u64 start;
1569 	u64 end;
1570 	int whole_page;
1571 	int ret;
1572 
1573 	do {
1574 		struct page *page = bvec->bv_page;
1575 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1576 
1577 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1578 			 bvec->bv_offset;
1579 		end = start + bvec->bv_len - 1;
1580 
1581 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1582 			whole_page = 1;
1583 		else
1584 			whole_page = 0;
1585 
1586 		if (--bvec >= bio->bi_io_vec)
1587 			prefetchw(&bvec->bv_page->flags);
1588 		if (tree->ops && tree->ops->writepage_end_io_hook) {
1589 			ret = tree->ops->writepage_end_io_hook(page, start,
1590 						       end, NULL, uptodate);
1591 			if (ret)
1592 				uptodate = 0;
1593 		}
1594 
1595 		if (!uptodate && tree->ops &&
1596 		    tree->ops->writepage_io_failed_hook) {
1597 			ret = tree->ops->writepage_io_failed_hook(bio, page,
1598 							 start, end, NULL);
1599 			if (ret == 0) {
1600 				uptodate = (err == 0);
1601 				continue;
1602 			}
1603 		}
1604 
1605 		if (!uptodate) {
1606 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1607 			ClearPageUptodate(page);
1608 			SetPageError(page);
1609 		}
1610 
1611 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1612 
1613 		if (whole_page)
1614 			end_page_writeback(page);
1615 		else
1616 			check_page_writeback(tree, page);
1617 	} while (bvec >= bio->bi_io_vec);
1618 
1619 	bio_put(bio);
1620 }
1621 
1622 /*
1623  * after a readpage IO is done, we need to:
1624  * clear the uptodate bits on error
1625  * set the uptodate bits if things worked
1626  * set the page up to date if all extents in the tree are uptodate
1627  * clear the lock bit in the extent tree
1628  * unlock the page if there are no other extents locked for it
1629  *
1630  * Scheduling is not allowed, so the extent state tree is expected
1631  * to have one and only one object corresponding to this IO.
1632  */
1633 static void end_bio_extent_readpage(struct bio *bio, int err)
1634 {
1635 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1636 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1637 	struct extent_io_tree *tree;
1638 	u64 start;
1639 	u64 end;
1640 	int whole_page;
1641 	int ret;
1642 
1643 	if (err)
1644 		uptodate = 0;
1645 
1646 	do {
1647 		struct page *page = bvec->bv_page;
1648 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1649 
1650 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1651 			bvec->bv_offset;
1652 		end = start + bvec->bv_len - 1;
1653 
1654 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1655 			whole_page = 1;
1656 		else
1657 			whole_page = 0;
1658 
1659 		if (--bvec >= bio->bi_io_vec)
1660 			prefetchw(&bvec->bv_page->flags);
1661 
1662 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1663 			ret = tree->ops->readpage_end_io_hook(page, start, end,
1664 							      NULL);
1665 			if (ret)
1666 				uptodate = 0;
1667 		}
1668 		if (!uptodate && tree->ops &&
1669 		    tree->ops->readpage_io_failed_hook) {
1670 			ret = tree->ops->readpage_io_failed_hook(bio, page,
1671 							 start, end, NULL);
1672 			if (ret == 0) {
1673 				uptodate =
1674 					test_bit(BIO_UPTODATE, &bio->bi_flags);
1675 				if (err)
1676 					uptodate = 0;
1677 				continue;
1678 			}
1679 		}
1680 
1681 		if (uptodate) {
1682 			set_extent_uptodate(tree, start, end,
1683 					    GFP_ATOMIC);
1684 		}
1685 		unlock_extent(tree, start, end, GFP_ATOMIC);
1686 
1687 		if (whole_page) {
1688 			if (uptodate) {
1689 				SetPageUptodate(page);
1690 			} else {
1691 				ClearPageUptodate(page);
1692 				SetPageError(page);
1693 			}
1694 			unlock_page(page);
1695 		} else {
1696 			if (uptodate) {
1697 				check_page_uptodate(tree, page);
1698 			} else {
1699 				ClearPageUptodate(page);
1700 				SetPageError(page);
1701 			}
1702 			check_page_locked(tree, page);
1703 		}
1704 	} while (bvec >= bio->bi_io_vec);
1705 
1706 	bio_put(bio);
1707 }
1708 
1709 /*
1710  * IO done from prepare_write is pretty simple, we just unlock
1711  * the structs in the extent tree when done, and set the uptodate bits
1712  * as appropriate.
1713  */
1714 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1715 {
1716 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1717 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1718 	struct extent_io_tree *tree;
1719 	u64 start;
1720 	u64 end;
1721 
1722 	do {
1723 		struct page *page = bvec->bv_page;
1724 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1725 
1726 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1727 			bvec->bv_offset;
1728 		end = start + bvec->bv_len - 1;
1729 
1730 		if (--bvec >= bio->bi_io_vec)
1731 			prefetchw(&bvec->bv_page->flags);
1732 
1733 		if (uptodate) {
1734 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1735 		} else {
1736 			ClearPageUptodate(page);
1737 			SetPageError(page);
1738 		}
1739 
1740 		unlock_extent(tree, start, end, GFP_ATOMIC);
1741 
1742 	} while (bvec >= bio->bi_io_vec);
1743 
1744 	bio_put(bio);
1745 }
1746 
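/*
 * allocate a bio with room for 'nr_vecs' pages.  If the allocation fails
 * and we are running with PF_MEMALLOC set (memory reclaim context), keep
 * halving the vector count until an allocation succeeds.
 */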
1747 static struct bio *
1748 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1749 		 gfp_t gfp_flags)
1750 {
1751 	struct bio *bio;
1752 
1753 	bio = bio_alloc(gfp_flags, nr_vecs);
1754 
1755 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1756 		while (!bio && (nr_vecs /= 2))
1757 			bio = bio_alloc(gfp_flags, nr_vecs);
1758 	}
1759 
1760 	if (bio) {
1761 		bio->bi_size = 0;
1762 		bio->bi_bdev = bdev;
1763 		bio->bi_sector = first_sector;
1764 	}
1765 	return bio;
1766 }
1767 
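/*
 * send a filled bio down to the block layer, either through the tree's
 * submit_bio_hook or plain submit_bio.  The extra bio_get/bio_put pair
 * keeps the bio alive long enough to check BIO_EOPNOTSUPP afterwards.
 */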
1768 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1769 			  unsigned long bio_flags)
1770 {
1771 	int ret = 0;
1772 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1773 	struct page *page = bvec->bv_page;
1774 	struct extent_io_tree *tree = bio->bi_private;
1775 	u64 start;
1776 	u64 end;
1777 
1778 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1779 	end = start + bvec->bv_len - 1;
1780 
1781 	bio->bi_private = NULL;
1782 
1783 	bio_get(bio);
1784 
1785 	if (tree->ops && tree->ops->submit_bio_hook)
1786 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1787 					   mirror_num, bio_flags);
1788 	else
1789 		submit_bio(rw, bio);
1790 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1791 		ret = -EOPNOTSUPP;
1792 	bio_put(bio);
1793 	return ret;
1794 }
1795 
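/*
 * add a page to the bio being built up in *bio_ret.  If the new page is
 * not contiguous with the pending bio, the compression flags differ, the
 * merge_bio_hook rejects it, or bio_add_page can't take the full page,
 * the pending bio is submitted first and a fresh one is started.
 */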
1796 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1797 			      struct page *page, sector_t sector,
1798 			      size_t size, unsigned long offset,
1799 			      struct block_device *bdev,
1800 			      struct bio **bio_ret,
1801 			      unsigned long max_pages,
1802 			      bio_end_io_t end_io_func,
1803 			      int mirror_num,
1804 			      unsigned long prev_bio_flags,
1805 			      unsigned long bio_flags)
1806 {
1807 	int ret = 0;
1808 	struct bio *bio;
1809 	int nr;
1810 	int contig = 0;
1811 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1812 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1813 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1814 
1815 	if (bio_ret && *bio_ret) {
1816 		bio = *bio_ret;
1817 		if (old_compressed)
1818 			contig = bio->bi_sector == sector;
1819 		else
1820 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1821 				sector;
1822 
1823 		if (prev_bio_flags != bio_flags || !contig ||
1824 		    (tree->ops && tree->ops->merge_bio_hook &&
1825 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1826 					       bio_flags)) ||
1827 		    bio_add_page(bio, page, page_size, offset) < page_size) {
1828 			ret = submit_one_bio(rw, bio, mirror_num,
1829 					     prev_bio_flags);
1830 			bio = NULL;
1831 		} else {
1832 			return 0;
1833 		}
1834 	}
1835 	if (this_compressed)
1836 		nr = BIO_MAX_PAGES;
1837 	else
1838 		nr = bio_get_nr_vecs(bdev);
1839 
1840 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1841 
1842 	bio_add_page(bio, page, page_size, offset);
1843 	bio->bi_end_io = end_io_func;
1844 	bio->bi_private = tree;
1845 
1846 	if (bio_ret)
1847 		*bio_ret = bio;
1848 	else
1849 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1850 
1851 	return ret;
1852 }
1853 
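/*
 * tag a page as belonging to the extent io code by stashing
 * EXTENT_PAGE_PRIVATE in page->private (taking a page reference).
 * set_page_extent_head() marks the first page of an extent buffer and
 * encodes the buffer length shifted past the flag bits in the same field.
 */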
1854 void set_page_extent_mapped(struct page *page)
1855 {
1856 	if (!PagePrivate(page)) {
1857 		SetPagePrivate(page);
1858 		page_cache_get(page);
1859 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1860 	}
1861 }
1862 
1863 static void set_page_extent_head(struct page *page, unsigned long len)
1864 {
1865 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1866 }
1867 
1868 /*
1869  * basic readpage implementation.  Locked extent state structs are inserted
1870  * into the tree; they are removed when the IO is done (by the end_io
1871  * handlers)
1872  */
1873 static int __extent_read_full_page(struct extent_io_tree *tree,
1874 				   struct page *page,
1875 				   get_extent_t *get_extent,
1876 				   struct bio **bio, int mirror_num,
1877 				   unsigned long *bio_flags)
1878 {
1879 	struct inode *inode = page->mapping->host;
1880 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1881 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1882 	u64 end;
1883 	u64 cur = start;
1884 	u64 extent_offset;
1885 	u64 last_byte = i_size_read(inode);
1886 	u64 block_start;
1887 	u64 cur_end;
1888 	sector_t sector;
1889 	struct extent_map *em;
1890 	struct block_device *bdev;
1891 	int ret;
1892 	int nr = 0;
1893 	size_t page_offset = 0;
1894 	size_t iosize;
1895 	size_t disk_io_size;
1896 	size_t blocksize = inode->i_sb->s_blocksize;
1897 	unsigned long this_bio_flag = 0;
1898 
1899 	set_page_extent_mapped(page);
1900 
1901 	end = page_end;
1902 	lock_extent(tree, start, end, GFP_NOFS);
1903 
1904 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1905 		char *userpage;
1906 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1907 
1908 		if (zero_offset) {
1909 			iosize = PAGE_CACHE_SIZE - zero_offset;
1910 			userpage = kmap_atomic(page, KM_USER0);
1911 			memset(userpage + zero_offset, 0, iosize);
1912 			flush_dcache_page(page);
1913 			kunmap_atomic(userpage, KM_USER0);
1914 		}
1915 	}
1916 	while (cur <= end) {
1917 		if (cur >= last_byte) {
1918 			char *userpage;
1919 			iosize = PAGE_CACHE_SIZE - page_offset;
1920 			userpage = kmap_atomic(page, KM_USER0);
1921 			memset(userpage + page_offset, 0, iosize);
1922 			flush_dcache_page(page);
1923 			kunmap_atomic(userpage, KM_USER0);
1924 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1925 					    GFP_NOFS);
1926 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1927 			break;
1928 		}
1929 		em = get_extent(inode, page, page_offset, cur,
1930 				end - cur + 1, 0);
1931 		if (IS_ERR(em) || !em) {
1932 			SetPageError(page);
1933 			unlock_extent(tree, cur, end, GFP_NOFS);
1934 			break;
1935 		}
1936 		extent_offset = cur - em->start;
1937 		BUG_ON(extent_map_end(em) <= cur);
1938 		BUG_ON(end < cur);
1939 
1940 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1941 			this_bio_flag = EXTENT_BIO_COMPRESSED;
1942 
1943 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
1944 		cur_end = min(extent_map_end(em) - 1, end);
1945 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1946 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
1947 			disk_io_size = em->block_len;
1948 			sector = em->block_start >> 9;
1949 		} else {
1950 			sector = (em->block_start + extent_offset) >> 9;
1951 			disk_io_size = iosize;
1952 		}
1953 		bdev = em->bdev;
1954 		block_start = em->block_start;
1955 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
1956 			block_start = EXTENT_MAP_HOLE;
1957 		free_extent_map(em);
1958 		em = NULL;
1959 
1960 		/* we've found a hole, just zero and go on */
1961 		if (block_start == EXTENT_MAP_HOLE) {
1962 			char *userpage;
1963 			userpage = kmap_atomic(page, KM_USER0);
1964 			memset(userpage + page_offset, 0, iosize);
1965 			flush_dcache_page(page);
1966 			kunmap_atomic(userpage, KM_USER0);
1967 
1968 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1969 					    GFP_NOFS);
1970 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1971 			cur = cur + iosize;
1972 			page_offset += iosize;
1973 			continue;
1974 		}
1975 		/* the get_extent function already copied into the page */
1976 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1977 			check_page_uptodate(tree, page);
1978 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1979 			cur = cur + iosize;
1980 			page_offset += iosize;
1981 			continue;
1982 		}
1983 		/* we have an inline extent but it didn't get marked up
1984 		 * to date.  Error out
1985 		 */
1986 		if (block_start == EXTENT_MAP_INLINE) {
1987 			SetPageError(page);
1988 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1989 			cur = cur + iosize;
1990 			page_offset += iosize;
1991 			continue;
1992 		}
1993 
1994 		ret = 0;
1995 		if (tree->ops && tree->ops->readpage_io_hook) {
1996 			ret = tree->ops->readpage_io_hook(page, cur,
1997 							  cur + iosize - 1);
1998 		}
1999 		if (!ret) {
2000 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2001 			pnr -= page->index;
2002 			ret = submit_extent_page(READ, tree, page,
2003 					 sector, disk_io_size, page_offset,
2004 					 bdev, bio, pnr,
2005 					 end_bio_extent_readpage, mirror_num,
2006 					 *bio_flags,
2007 					 this_bio_flag);
2008 			nr++;
2009 			*bio_flags = this_bio_flag;
2010 		}
2011 		if (ret)
2012 			SetPageError(page);
2013 		cur = cur + iosize;
2014 		page_offset += iosize;
2015 	}
2016 	if (!nr) {
2017 		if (!PageError(page))
2018 			SetPageUptodate(page);
2019 		unlock_page(page);
2020 	}
2021 	return 0;
2022 }
2023 
2024 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2025 			    get_extent_t *get_extent)
2026 {
2027 	struct bio *bio = NULL;
2028 	unsigned long bio_flags = 0;
2029 	int ret;
2030 
2031 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2032 				      &bio_flags);
2033 	if (bio)
2034 		submit_one_bio(READ, bio, 0, bio_flags);
2035 	return ret;
2036 }
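
/*
 * Illustrative sketch only (not part of the original file): a minimal
 * ->readpage style caller for the helper above.  It assumes the
 * btrfs_get_extent() callback declared in the btrfs headers and the
 * io_tree embedded in the btrfs inode; the real callers live elsewhere.
 */
static int __maybe_unused example_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btrfs_get_extent);
}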
2037 
2038 static noinline void update_nr_written(struct page *page,
2039 				      struct writeback_control *wbc,
2040 				      unsigned long nr_written)
2041 {
2042 	wbc->nr_to_write -= nr_written;
2043 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2044 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2045 		page->mapping->writeback_index = page->index + nr_written;
2046 }
2047 
2048 /*
2049  * the writepage semantics are similar to regular writepage.  extent
2050  * records are inserted to lock ranges in the tree, and as dirty areas
2051  * are found, they are marked writeback.  Then the lock bits are removed
2052  * and the end_io handler clears the writeback ranges
2053  */
2054 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2055 			      void *data)
2056 {
2057 	struct inode *inode = page->mapping->host;
2058 	struct extent_page_data *epd = data;
2059 	struct extent_io_tree *tree = epd->tree;
2060 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2061 	u64 delalloc_start;
2062 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2063 	u64 end;
2064 	u64 cur = start;
2065 	u64 extent_offset;
2066 	u64 last_byte = i_size_read(inode);
2067 	u64 block_start;
2068 	u64 iosize;
2069 	u64 unlock_start;
2070 	sector_t sector;
2071 	struct extent_map *em;
2072 	struct block_device *bdev;
2073 	int ret;
2074 	int nr = 0;
2075 	size_t pg_offset = 0;
2076 	size_t blocksize;
2077 	loff_t i_size = i_size_read(inode);
2078 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2079 	u64 nr_delalloc;
2080 	u64 delalloc_end;
2081 	int page_started;
2082 	int compressed;
2083 	int write_flags;
2084 	unsigned long nr_written = 0;
2085 
2086 	if (wbc->sync_mode == WB_SYNC_ALL)
2087 		write_flags = WRITE_SYNC_PLUG;
2088 	else
2089 		write_flags = WRITE;
2090 
2091 	WARN_ON(!PageLocked(page));
2092 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2093 	if (page->index > end_index ||
2094 	   (page->index == end_index && !pg_offset)) {
2095 		page->mapping->a_ops->invalidatepage(page, 0);
2096 		unlock_page(page);
2097 		return 0;
2098 	}
2099 
2100 	if (page->index == end_index) {
2101 		char *userpage;
2102 
2103 		userpage = kmap_atomic(page, KM_USER0);
2104 		memset(userpage + pg_offset, 0,
2105 		       PAGE_CACHE_SIZE - pg_offset);
2106 		kunmap_atomic(userpage, KM_USER0);
2107 		flush_dcache_page(page);
2108 	}
2109 	pg_offset = 0;
2110 
2111 	set_page_extent_mapped(page);
2112 
2113 	delalloc_start = start;
2114 	delalloc_end = 0;
2115 	page_started = 0;
2116 	if (!epd->extent_locked) {
2117 		/*
2118 		 * make sure the wbc mapping index is at least updated
2119 		 * to this page.
2120 		 */
2121 		update_nr_written(page, wbc, 0);
2122 
2123 		while (delalloc_end < page_end) {
2124 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2125 						       page,
2126 						       &delalloc_start,
2127 						       &delalloc_end,
2128 						       128 * 1024 * 1024);
2129 			if (nr_delalloc == 0) {
2130 				delalloc_start = delalloc_end + 1;
2131 				continue;
2132 			}
2133 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2134 						 delalloc_end, &page_started,
2135 						 &nr_written);
2136 			delalloc_start = delalloc_end + 1;
2137 		}
2138 
2139 		/* did the fill delalloc function already unlock and start
2140 		 * the IO?
2141 		 */
2142 		if (page_started) {
2143 			ret = 0;
2144 			/*
2145 			 * we've unlocked the page, so we can't update
2146 			 * the mapping's writeback index, just update
2147 			 * nr_to_write.
2148 			 */
2149 			wbc->nr_to_write -= nr_written;
2150 			goto done_unlocked;
2151 		}
2152 	}
2153 	lock_extent(tree, start, page_end, GFP_NOFS);
2154 
2155 	unlock_start = start;
2156 
2157 	if (tree->ops && tree->ops->writepage_start_hook) {
2158 		ret = tree->ops->writepage_start_hook(page, start,
2159 						      page_end);
2160 		if (ret == -EAGAIN) {
2161 			unlock_extent(tree, start, page_end, GFP_NOFS);
2162 			redirty_page_for_writepage(wbc, page);
2163 			update_nr_written(page, wbc, nr_written);
2164 			unlock_page(page);
2165 			ret = 0;
2166 			goto done_unlocked;
2167 		}
2168 	}
2169 
2170 	/*
2171 	 * we don't want to touch the inode after unlocking the page,
2172 	 * so we update the mapping writeback index now
2173 	 */
2174 	update_nr_written(page, wbc, nr_written + 1);
2175 
2176 	end = page_end;
2177 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2178 		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2179 
2180 	if (last_byte <= start) {
2181 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2182 		unlock_extent(tree, start, page_end, GFP_NOFS);
2183 		if (tree->ops && tree->ops->writepage_end_io_hook)
2184 			tree->ops->writepage_end_io_hook(page, start,
2185 							 page_end, NULL, 1);
2186 		unlock_start = page_end + 1;
2187 		goto done;
2188 	}
2189 
2190 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2191 	blocksize = inode->i_sb->s_blocksize;
2192 
2193 	while (cur <= end) {
2194 		if (cur >= last_byte) {
2195 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2196 			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2197 			if (tree->ops && tree->ops->writepage_end_io_hook)
2198 				tree->ops->writepage_end_io_hook(page, cur,
2199 							 page_end, NULL, 1);
2200 			unlock_start = page_end + 1;
2201 			break;
2202 		}
2203 		em = epd->get_extent(inode, page, pg_offset, cur,
2204 				     end - cur + 1, 1);
2205 		if (IS_ERR(em) || !em) {
2206 			SetPageError(page);
2207 			break;
2208 		}
2209 
2210 		extent_offset = cur - em->start;
2211 		BUG_ON(extent_map_end(em) <= cur);
2212 		BUG_ON(end < cur);
2213 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2214 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2215 		sector = (em->block_start + extent_offset) >> 9;
2216 		bdev = em->bdev;
2217 		block_start = em->block_start;
2218 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2219 		free_extent_map(em);
2220 		em = NULL;
2221 
2222 		/*
2223 		 * compressed and inline extents are written through other
2224 		 * paths in the FS
2225 		 */
2226 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2227 		    block_start == EXTENT_MAP_INLINE) {
2228 			clear_extent_dirty(tree, cur,
2229 					   cur + iosize - 1, GFP_NOFS);
2230 
2231 			unlock_extent(tree, unlock_start, cur + iosize - 1,
2232 				      GFP_NOFS);
2233 
2234 			/*
2235 			 * end_io notification does not happen here for
2236 			 * compressed extents
2237 			 */
2238 			if (!compressed && tree->ops &&
2239 			    tree->ops->writepage_end_io_hook)
2240 				tree->ops->writepage_end_io_hook(page, cur,
2241 							 cur + iosize - 1,
2242 							 NULL, 1);
2243 			else if (compressed) {
2244 				/* we don't want to end_page_writeback on
2245 				 * a compressed extent.  this happens
2246 				 * elsewhere
2247 				 */
2248 				nr++;
2249 			}
2250 
2251 			cur += iosize;
2252 			pg_offset += iosize;
2253 			unlock_start = cur;
2254 			continue;
2255 		}
2256 		/* leave this out until we have a page_mkwrite call */
2257 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2258 				   EXTENT_DIRTY, 0)) {
2259 			cur = cur + iosize;
2260 			pg_offset += iosize;
2261 			continue;
2262 		}
2263 
2264 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2265 		if (tree->ops && tree->ops->writepage_io_hook) {
2266 			ret = tree->ops->writepage_io_hook(page, cur,
2267 						cur + iosize - 1);
2268 		} else {
2269 			ret = 0;
2270 		}
2271 		if (ret) {
2272 			SetPageError(page);
2273 		} else {
2274 			unsigned long max_nr = end_index + 1;
2275 
2276 			set_range_writeback(tree, cur, cur + iosize - 1);
2277 			if (!PageWriteback(page)) {
2278 				printk(KERN_ERR "btrfs warning page %lu not "
2279 				       "writeback, cur %llu end %llu\n",
2280 				       page->index, (unsigned long long)cur,
2281 				       (unsigned long long)end);
2282 			}
2283 
2284 			ret = submit_extent_page(write_flags, tree, page,
2285 						 sector, iosize, pg_offset,
2286 						 bdev, &epd->bio, max_nr,
2287 						 end_bio_extent_writepage,
2288 						 0, 0, 0);
2289 			if (ret)
2290 				SetPageError(page);
2291 		}
2292 		cur = cur + iosize;
2293 		pg_offset += iosize;
2294 		nr++;
2295 	}
2296 done:
2297 	if (nr == 0) {
2298 		/* make sure the mapping tag for page dirty gets cleared */
2299 		set_page_writeback(page);
2300 		end_page_writeback(page);
2301 	}
2302 	if (unlock_start <= page_end)
2303 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2304 	unlock_page(page);
2305 
2306 done_unlocked:
2307 
2308 	return 0;
2309 }
2310 
2311 /**
2312  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @tree: extent_io_tree that covers this mapping
2313  * @mapping: address space structure to write
2314  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2315  * @writepage: function called for each page
2316  * @data: data passed to writepage function
 * @flush_fn: called to submit any pending bio before waiting on a page under writeback
2317  *
2318  * If a page is already under I/O, write_cache_pages() skips it, even
2319  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2320  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2321  * and msync() need to guarantee that all the data which was dirty at the time
2322  * the call was made get new I/O started against them.  If wbc->sync_mode is
2323  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2324  * existing IO to complete.
2325  */
2326 static int extent_write_cache_pages(struct extent_io_tree *tree,
2327 			     struct address_space *mapping,
2328 			     struct writeback_control *wbc,
2329 			     writepage_t writepage, void *data,
2330 			     void (*flush_fn)(void *))
2331 {
2332 	struct backing_dev_info *bdi = mapping->backing_dev_info;
2333 	int ret = 0;
2334 	int done = 0;
2335 	struct pagevec pvec;
2336 	int nr_pages;
2337 	pgoff_t index;
2338 	pgoff_t end;		/* Inclusive */
2339 	int scanned = 0;
2340 	int range_whole = 0;
2341 
2342 	pagevec_init(&pvec, 0);
2343 	if (wbc->range_cyclic) {
2344 		index = mapping->writeback_index; /* Start from prev offset */
2345 		end = -1;
2346 	} else {
2347 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2348 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2349 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2350 			range_whole = 1;
2351 		scanned = 1;
2352 	}
2353 retry:
2354 	while (!done && (index <= end) &&
2355 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2356 			      PAGECACHE_TAG_DIRTY, min(end - index,
2357 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2358 		unsigned i;
2359 
2360 		scanned = 1;
2361 		for (i = 0; i < nr_pages; i++) {
2362 			struct page *page = pvec.pages[i];
2363 
2364 			/*
2365 			 * At this point we hold neither mapping->tree_lock nor
2366 			 * lock on the page itself: the page may be truncated or
2367 			 * invalidated (changing page->mapping to NULL), or even
2368 			 * swizzled back from swapper_space to tmpfs file
2369 			 * mapping
2370 			 */
2371 			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2372 				tree->ops->write_cache_pages_lock_hook(page);
2373 			else
2374 				lock_page(page);
2375 
2376 			if (unlikely(page->mapping != mapping)) {
2377 				unlock_page(page);
2378 				continue;
2379 			}
2380 
2381 			if (!wbc->range_cyclic && page->index > end) {
2382 				done = 1;
2383 				unlock_page(page);
2384 				continue;
2385 			}
2386 
2387 			if (wbc->sync_mode != WB_SYNC_NONE) {
2388 				if (PageWriteback(page))
2389 					flush_fn(data);
2390 				wait_on_page_writeback(page);
2391 			}
2392 
2393 			if (PageWriteback(page) ||
2394 			    !clear_page_dirty_for_io(page)) {
2395 				unlock_page(page);
2396 				continue;
2397 			}
2398 
2399 			ret = (*writepage)(page, wbc, data);
2400 
2401 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2402 				unlock_page(page);
2403 				ret = 0;
2404 			}
2405 			if (ret || wbc->nr_to_write <= 0)
2406 				done = 1;
2407 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
2408 				wbc->encountered_congestion = 1;
2409 				done = 1;
2410 			}
2411 		}
2412 		pagevec_release(&pvec);
2413 		cond_resched();
2414 	}
2415 	if (!scanned && !done) {
2416 		/*
2417 		 * We hit the last page and there is more work to be done: wrap
2418 		 * back to the start of the file
2419 		 */
2420 		scanned = 1;
2421 		index = 0;
2422 		goto retry;
2423 	}
2424 	return ret;
2425 }
2426 
2427 static void flush_epd_write_bio(struct extent_page_data *epd)
2428 {
2429 	if (epd->bio) {
2430 		if (epd->sync_io)
2431 			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2432 		else
2433 			submit_one_bio(WRITE, epd->bio, 0, 0);
2434 		epd->bio = NULL;
2435 	}
2436 }
2437 
2438 static noinline void flush_write_bio(void *data)
2439 {
2440 	struct extent_page_data *epd = data;
2441 	flush_epd_write_bio(epd);
2442 }
2443 
2444 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2445 			  get_extent_t *get_extent,
2446 			  struct writeback_control *wbc)
2447 {
2448 	int ret;
2449 	struct address_space *mapping = page->mapping;
2450 	struct extent_page_data epd = {
2451 		.bio = NULL,
2452 		.tree = tree,
2453 		.get_extent = get_extent,
2454 		.extent_locked = 0,
2455 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2456 	};
2457 	struct writeback_control wbc_writepages = {
2458 		.bdi		= wbc->bdi,
2459 		.sync_mode	= wbc->sync_mode,
2460 		.older_than_this = NULL,
2461 		.nr_to_write	= 64,
2462 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2463 		.range_end	= (loff_t)-1,
2464 	};
2465 
2466 	ret = __extent_writepage(page, wbc, &epd);
2467 
2468 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2469 				 __extent_writepage, &epd, flush_write_bio);
2470 	flush_epd_write_bio(&epd);
2471 	return ret;
2472 }
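
/*
 * Illustrative sketch only (not part of the original file): a minimal
 * ->writepage style caller for extent_write_full_page(), with the same
 * btrfs_get_extent()/io_tree assumptions as the readpage sketch above.
 */
static int __maybe_unused example_writepage(struct page *page,
					    struct writeback_control *wbc)
{
	struct extent_io_tree *tree;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
}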
2473 
2474 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2475 			      u64 start, u64 end, get_extent_t *get_extent,
2476 			      int mode)
2477 {
2478 	int ret = 0;
2479 	struct address_space *mapping = inode->i_mapping;
2480 	struct page *page;
2481 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2482 		PAGE_CACHE_SHIFT;
2483 
2484 	struct extent_page_data epd = {
2485 		.bio = NULL,
2486 		.tree = tree,
2487 		.get_extent = get_extent,
2488 		.extent_locked = 1,
2489 		.sync_io = mode == WB_SYNC_ALL,
2490 	};
2491 	struct writeback_control wbc_writepages = {
2492 		.bdi		= inode->i_mapping->backing_dev_info,
2493 		.sync_mode	= mode,
2494 		.older_than_this = NULL,
2495 		.nr_to_write	= nr_pages * 2,
2496 		.range_start	= start,
2497 		.range_end	= end + 1,
2498 	};
2499 
2500 	while (start <= end) {
2501 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2502 		if (clear_page_dirty_for_io(page))
2503 			ret = __extent_writepage(page, &wbc_writepages, &epd);
2504 		else {
2505 			if (tree->ops && tree->ops->writepage_end_io_hook)
2506 				tree->ops->writepage_end_io_hook(page, start,
2507 						 start + PAGE_CACHE_SIZE - 1,
2508 						 NULL, 1);
2509 			unlock_page(page);
2510 		}
2511 		page_cache_release(page);
2512 		start += PAGE_CACHE_SIZE;
2513 	}
2514 
2515 	flush_epd_write_bio(&epd);
2516 	return ret;
2517 }
2518 
2519 int extent_writepages(struct extent_io_tree *tree,
2520 		      struct address_space *mapping,
2521 		      get_extent_t *get_extent,
2522 		      struct writeback_control *wbc)
2523 {
2524 	int ret = 0;
2525 	struct extent_page_data epd = {
2526 		.bio = NULL,
2527 		.tree = tree,
2528 		.get_extent = get_extent,
2529 		.extent_locked = 0,
2530 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2531 	};
2532 
2533 	ret = extent_write_cache_pages(tree, mapping, wbc,
2534 				       __extent_writepage, &epd,
2535 				       flush_write_bio);
2536 	flush_epd_write_bio(&epd);
2537 	return ret;
2538 }
2539 
2540 int extent_readpages(struct extent_io_tree *tree,
2541 		     struct address_space *mapping,
2542 		     struct list_head *pages, unsigned nr_pages,
2543 		     get_extent_t get_extent)
2544 {
2545 	struct bio *bio = NULL;
2546 	unsigned page_idx;
2547 	struct pagevec pvec;
2548 	unsigned long bio_flags = 0;
2549 
2550 	pagevec_init(&pvec, 0);
2551 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2552 		struct page *page = list_entry(pages->prev, struct page, lru);
2553 
2554 		prefetchw(&page->flags);
2555 		list_del(&page->lru);
2556 		/*
2557 		 * what we want to do here is call add_to_page_cache_lru,
2558 		 * but that isn't exported, so we reproduce it here
2559 		 */
2560 		if (!add_to_page_cache(page, mapping,
2561 					page->index, GFP_KERNEL)) {
2562 
2563 			/* open coding of lru_cache_add, also not exported */
2564 			page_cache_get(page);
2565 			if (!pagevec_add(&pvec, page))
2566 				__pagevec_lru_add_file(&pvec);
2567 			__extent_read_full_page(tree, page, get_extent,
2568 						&bio, 0, &bio_flags);
2569 		}
2570 		page_cache_release(page);
2571 	}
2572 	if (pagevec_count(&pvec))
2573 		__pagevec_lru_add_file(&pvec);
2574 	BUG_ON(!list_empty(pages));
2575 	if (bio)
2576 		submit_one_bio(READ, bio, 0, bio_flags);
2577 	return 0;
2578 }
2579 
2580 /*
2581  * basic invalidatepage code, this waits on any locked or writeback
2582  * ranges corresponding to the page, and then deletes any extent state
2583  * records from the tree
2584  */
2585 int extent_invalidatepage(struct extent_io_tree *tree,
2586 			  struct page *page, unsigned long offset)
2587 {
2588 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2589 	u64 end = start + PAGE_CACHE_SIZE - 1;
2590 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2591 
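	/*
	 * Added note: round the starting offset up to the next block
	 * boundary so a partially invalidated block keeps its state.
	 * For example, with a 4K block size an offset of 1 rounds up to
	 * 4096, so only state from start + 4096 onward is cleared, while
	 * an offset of 0 clears the whole page range.
	 */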
2592 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2593 	if (start > end)
2594 		return 0;
2595 
2596 	lock_extent(tree, start, end, GFP_NOFS);
2597 	wait_on_extent_writeback(tree, start, end);
2598 	clear_extent_bit(tree, start, end,
2599 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2600 			 1, 1, GFP_NOFS);
2601 	return 0;
2602 }
2603 
2604 /*
2605  * simple commit_write call; the page is marked dirty and the inode
2606  * size is updated if the write extended past the old i_size
2607  */
2608 int extent_commit_write(struct extent_io_tree *tree,
2609 			struct inode *inode, struct page *page,
2610 			unsigned from, unsigned to)
2611 {
2612 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2613 
2614 	set_page_extent_mapped(page);
2615 	set_page_dirty(page);
2616 
2617 	if (pos > inode->i_size) {
2618 		i_size_write(inode, pos);
2619 		mark_inode_dirty(inode);
2620 	}
2621 	return 0;
2622 }
2623 
2624 int extent_prepare_write(struct extent_io_tree *tree,
2625 			 struct inode *inode, struct page *page,
2626 			 unsigned from, unsigned to, get_extent_t *get_extent)
2627 {
2628 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2629 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2630 	u64 block_start;
2631 	u64 orig_block_start;
2632 	u64 block_end;
2633 	u64 cur_end;
2634 	struct extent_map *em;
2635 	unsigned blocksize = 1 << inode->i_blkbits;
2636 	size_t page_offset = 0;
2637 	size_t block_off_start;
2638 	size_t block_off_end;
2639 	int err = 0;
2640 	int iocount = 0;
2641 	int ret = 0;
2642 	int isnew;
2643 
2644 	set_page_extent_mapped(page);
2645 
2646 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2647 	block_end = (page_start + to - 1) | (blocksize - 1);
2648 	orig_block_start = block_start;
2649 
2650 	lock_extent(tree, page_start, page_end, GFP_NOFS);
2651 	while (block_start <= block_end) {
2652 		em = get_extent(inode, page, page_offset, block_start,
2653 				block_end - block_start + 1, 1);
2654 		if (IS_ERR(em) || !em)
2655 			goto err;
2656 
2657 		cur_end = min(block_end, extent_map_end(em) - 1);
2658 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2659 		block_off_end = block_off_start + blocksize;
2660 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2661 
2662 		if (!PageUptodate(page) && isnew &&
2663 		    (block_off_end > to || block_off_start < from)) {
2664 			void *kaddr;
2665 
2666 			kaddr = kmap_atomic(page, KM_USER0);
2667 			if (block_off_end > to)
2668 				memset(kaddr + to, 0, block_off_end - to);
2669 			if (block_off_start < from)
2670 				memset(kaddr + block_off_start, 0,
2671 				       from - block_off_start);
2672 			flush_dcache_page(page);
2673 			kunmap_atomic(kaddr, KM_USER0);
2674 		}
2675 		if ((em->block_start != EXTENT_MAP_HOLE &&
2676 		     em->block_start != EXTENT_MAP_INLINE) &&
2677 		    !isnew && !PageUptodate(page) &&
2678 		    (block_off_end > to || block_off_start < from) &&
2679 		    !test_range_bit(tree, block_start, cur_end,
2680 				    EXTENT_UPTODATE, 1)) {
2681 			u64 sector;
2682 			u64 extent_offset = block_start - em->start;
2683 			size_t iosize;
2684 			sector = (em->block_start + extent_offset) >> 9;
2685 			iosize = (cur_end - block_start + blocksize) &
2686 				~((u64)blocksize - 1);
2687 			/*
2688 			 * we've already got the extent locked, but we
2689 			 * need to split the state such that our end_bio
2690 			 * handler can clear the lock.
2691 			 */
2692 			set_extent_bit(tree, block_start,
2693 				       block_start + iosize - 1,
2694 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2695 			ret = submit_extent_page(READ, tree, page,
2696 					 sector, iosize, page_offset, em->bdev,
2697 					 NULL, 1,
2698 					 end_bio_extent_preparewrite, 0,
2699 					 0, 0);
2700 			iocount++;
2701 			block_start = block_start + iosize;
2702 		} else {
2703 			set_extent_uptodate(tree, block_start, cur_end,
2704 					    GFP_NOFS);
2705 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2706 			block_start = cur_end + 1;
2707 		}
2708 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2709 		free_extent_map(em);
2710 	}
2711 	if (iocount) {
2712 		wait_extent_bit(tree, orig_block_start,
2713 				block_end, EXTENT_LOCKED);
2714 	}
2715 	check_page_uptodate(tree, page);
2716 err:
2717 	/* FIXME, zero out newly allocated blocks on error */
2718 	return err;
2719 }
2720 
2721 /*
2722  * a helper for releasepage, this tests for areas of the page that
2723  * are locked or under IO and drops the related state bits if it is safe
2724  * to drop the page.
2725  */
2726 int try_release_extent_state(struct extent_map_tree *map,
2727 			     struct extent_io_tree *tree, struct page *page,
2728 			     gfp_t mask)
2729 {
2730 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2731 	u64 end = start + PAGE_CACHE_SIZE - 1;
2732 	int ret = 1;
2733 
2734 	if (test_range_bit(tree, start, end,
2735 			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
2736 		ret = 0;
2737 	else {
2738 		if ((mask & GFP_NOFS) == GFP_NOFS)
2739 			mask = GFP_NOFS;
2740 		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2741 				 1, 1, mask);
2742 	}
2743 	return ret;
2744 }
2745 
2746 /*
2747  * a helper for releasepage.  As long as there are no locked extents
2748  * in the range corresponding to the page, both state records and extent
2749  * map records are removed
2750  */
2751 int try_release_extent_mapping(struct extent_map_tree *map,
2752 			       struct extent_io_tree *tree, struct page *page,
2753 			       gfp_t mask)
2754 {
2755 	struct extent_map *em;
2756 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2757 	u64 end = start + PAGE_CACHE_SIZE - 1;
2758 
2759 	if ((mask & __GFP_WAIT) &&
2760 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2761 		u64 len;
2762 		while (start <= end) {
2763 			len = end - start + 1;
2764 			spin_lock(&map->lock);
2765 			em = lookup_extent_mapping(map, start, len);
2766 			if (!em || IS_ERR(em)) {
2767 				spin_unlock(&map->lock);
2768 				break;
2769 			}
2770 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2771 			    em->start != start) {
2772 				spin_unlock(&map->lock);
2773 				free_extent_map(em);
2774 				break;
2775 			}
2776 			if (!test_range_bit(tree, em->start,
2777 					    extent_map_end(em) - 1,
2778 					    EXTENT_LOCKED | EXTENT_WRITEBACK |
2779 					    EXTENT_ORDERED,
2780 					    0)) {
2781 				remove_extent_mapping(map, em);
2782 				/* once for the rb tree */
2783 				free_extent_map(em);
2784 			}
2785 			start = extent_map_end(em);
2786 			spin_unlock(&map->lock);
2787 
2788 			/* once for us */
2789 			free_extent_map(em);
2790 		}
2791 	}
2792 	return try_release_extent_state(map, tree, page, mask);
2793 }
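
/*
 * Illustrative sketch only (not part of the original file): how a
 * ->releasepage style caller might use the helper above, assuming the
 * extent_tree and io_tree members of the btrfs inode.
 */
static int __maybe_unused example_releasepage(struct page *page, gfp_t gfp)
{
	struct inode *inode = page->mapping->host;

	return try_release_extent_mapping(&BTRFS_I(inode)->extent_tree,
					  &BTRFS_I(inode)->io_tree,
					  page, gfp);
}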
2794 
2795 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2796 		get_extent_t *get_extent)
2797 {
2798 	struct inode *inode = mapping->host;
	/* widen before shifting so a 32-bit sector_t cannot overflow */
2799 	u64 start = (u64)iblock << inode->i_blkbits;
2800 	sector_t sector = 0;
2801 	size_t blksize = (1 << inode->i_blkbits);
2802 	struct extent_map *em;
2803 
2804 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2805 		    GFP_NOFS);
2806 	em = get_extent(inode, NULL, 0, start, blksize, 0);
2807 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2808 		      GFP_NOFS);
2809 	if (!em || IS_ERR(em))
2810 		return 0;
2811 
2812 	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2813 		goto out;
2814 
2815 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2816 out:
2817 	free_extent_map(em);
2818 	return sector;
2819 }
2820 
2821 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2822 		__u64 start, __u64 len, get_extent_t *get_extent)
2823 {
2824 	int ret;
2825 	u64 off = start;
2826 	u64 max = start + len;
2827 	u32 flags = 0;
2828 	u64 disko = 0;
2829 	struct extent_map *em = NULL;
2830 	int end = 0;
2831 	u64 em_start = 0, em_len = 0;
2832 	unsigned long emflags;
2833 	ret = 0;
2834 
2835 	if (len == 0)
2836 		return -EINVAL;
2837 
2838 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2839 		GFP_NOFS);
2840 	em = get_extent(inode, NULL, 0, off, max - off, 0);
2841 	if (!em)
2842 		goto out;
2843 	if (IS_ERR(em)) {
2844 		ret = PTR_ERR(em);
2845 		goto out;
2846 	}
2847 	while (!end) {
2848 		off = em->start + em->len;
2849 		if (off >= max)
2850 			end = 1;
2851 
2852 		em_start = em->start;
2853 		em_len = em->len;
2854 
2855 		disko = 0;
2856 		flags = 0;
2857 
2858 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2859 			end = 1;
2860 			flags |= FIEMAP_EXTENT_LAST;
2861 		} else if (em->block_start == EXTENT_MAP_HOLE) {
2862 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2863 		} else if (em->block_start == EXTENT_MAP_INLINE) {
2864 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2865 				  FIEMAP_EXTENT_NOT_ALIGNED);
2866 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2867 			flags |= (FIEMAP_EXTENT_DELALLOC |
2868 				  FIEMAP_EXTENT_UNKNOWN);
2869 		} else {
2870 			disko = em->block_start;
2871 		}
2872 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2873 			flags |= FIEMAP_EXTENT_ENCODED;
2874 
2875 		emflags = em->flags;
2876 		free_extent_map(em);
2877 		em = NULL;
2878 
2879 		if (!end) {
2880 			em = get_extent(inode, NULL, 0, off, max - off, 0);
2881 			if (!em)
2882 				goto out;
2883 			if (IS_ERR(em)) {
2884 				ret = PTR_ERR(em);
2885 				goto out;
2886 			}
2887 			emflags = em->flags;
2888 		}
2889 		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2890 			flags |= FIEMAP_EXTENT_LAST;
2891 			end = 1;
2892 		}
2893 
2894 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2895 					em_len, flags);
2896 		if (ret)
2897 			goto out_free;
2898 	}
2899 out_free:
2900 	free_extent_map(em);
2901 out:
2902 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2903 			GFP_NOFS);
2904 	return ret;
2905 }
2906 
2907 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2908 					      unsigned long i)
2909 {
2910 	struct page *p;
2911 	struct address_space *mapping;
2912 
2913 	if (i == 0)
2914 		return eb->first_page;
2915 	i += eb->start >> PAGE_CACHE_SHIFT;
2916 	mapping = eb->first_page->mapping;
2917 	if (!mapping)
2918 		return NULL;
2919 
2920 	/*
2921 	 * extent_buffer_page is only called after pinning the page
2922 	 * by increasing the reference count.  So we know the page must
2923 	 * be in the radix tree.
2924 	 */
2925 	rcu_read_lock();
2926 	p = radix_tree_lookup(&mapping->page_tree, i);
2927 	rcu_read_unlock();
2928 
2929 	return p;
2930 }
2931 
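/*
 * Added note: num_extent_pages() counts every page the range
 * [start, start + len) touches.  For example, with 4K pages a buffer at
 * start 0x1f800 with len 0x1000 covers bytes 0x1f800 through 0x207ff,
 * touching pages 0x1f and 0x20, so the helper returns
 * ((0x1f800 + 0x1000 + 0xfff) >> 12) - (0x1f800 >> 12) = 0x21 - 0x1f = 2.
 */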
2932 static inline unsigned long num_extent_pages(u64 start, u64 len)
2933 {
2934 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2935 		(start >> PAGE_CACHE_SHIFT);
2936 }
2937 
2938 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2939 						   u64 start,
2940 						   unsigned long len,
2941 						   gfp_t mask)
2942 {
2943 	struct extent_buffer *eb = NULL;
2944 #if LEAK_DEBUG
2945 	unsigned long flags;
2946 #endif
2947 
2948 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	/* the allocation can fail; the caller already handles a NULL return */
	if (!eb)
		return NULL;
2949 	eb->start = start;
2950 	eb->len = len;
2951 	spin_lock_init(&eb->lock);
2952 	init_waitqueue_head(&eb->lock_wq);
2953 
2954 #if LEAK_DEBUG
2955 	spin_lock_irqsave(&leak_lock, flags);
2956 	list_add(&eb->leak_list, &buffers);
2957 	spin_unlock_irqrestore(&leak_lock, flags);
2958 #endif
2959 	atomic_set(&eb->refs, 1);
2960 
2961 	return eb;
2962 }
2963 
2964 static void __free_extent_buffer(struct extent_buffer *eb)
2965 {
2966 #if LEAK_DEBUG
2967 	unsigned long flags;
2968 	spin_lock_irqsave(&leak_lock, flags);
2969 	list_del(&eb->leak_list);
2970 	spin_unlock_irqrestore(&leak_lock, flags);
2971 #endif
2972 	kmem_cache_free(extent_buffer_cache, eb);
2973 }
2974 
2975 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2976 					  u64 start, unsigned long len,
2977 					  struct page *page0,
2978 					  gfp_t mask)
2979 {
2980 	unsigned long num_pages = num_extent_pages(start, len);
2981 	unsigned long i;
2982 	unsigned long index = start >> PAGE_CACHE_SHIFT;
2983 	struct extent_buffer *eb;
2984 	struct extent_buffer *exists = NULL;
2985 	struct page *p;
2986 	struct address_space *mapping = tree->mapping;
2987 	int uptodate = 1;
2988 
2989 	spin_lock(&tree->buffer_lock);
2990 	eb = buffer_search(tree, start);
2991 	if (eb) {
2992 		atomic_inc(&eb->refs);
2993 		spin_unlock(&tree->buffer_lock);
2994 		mark_page_accessed(eb->first_page);
2995 		return eb;
2996 	}
2997 	spin_unlock(&tree->buffer_lock);
2998 
2999 	eb = __alloc_extent_buffer(tree, start, len, mask);
3000 	if (!eb)
3001 		return NULL;
3002 
3003 	if (page0) {
3004 		eb->first_page = page0;
3005 		i = 1;
3006 		index++;
3007 		page_cache_get(page0);
3008 		mark_page_accessed(page0);
3009 		set_page_extent_mapped(page0);
3010 		set_page_extent_head(page0, len);
3011 		uptodate = PageUptodate(page0);
3012 	} else {
3013 		i = 0;
3014 	}
3015 	for (; i < num_pages; i++, index++) {
3016 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3017 		if (!p) {
3018 			WARN_ON(1);
3019 			goto free_eb;
3020 		}
3021 		set_page_extent_mapped(p);
3022 		mark_page_accessed(p);
3023 		if (i == 0) {
3024 			eb->first_page = p;
3025 			set_page_extent_head(p, len);
3026 		} else {
3027 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3028 		}
3029 		if (!PageUptodate(p))
3030 			uptodate = 0;
3031 		unlock_page(p);
3032 	}
3033 	if (uptodate)
3034 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3035 
3036 	spin_lock(&tree->buffer_lock);
3037 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3038 	if (exists) {
3039 		/* add one reference for the caller */
3040 		atomic_inc(&exists->refs);
3041 		spin_unlock(&tree->buffer_lock);
3042 		goto free_eb;
3043 	}
3044 	spin_unlock(&tree->buffer_lock);
3045 
3046 	/* add one reference for the tree */
3047 	atomic_inc(&eb->refs);
3048 	return eb;
3049 
3050 free_eb:
3051 	if (!atomic_dec_and_test(&eb->refs))
3052 		return exists;
3053 	for (index = 1; index < i; index++)
3054 		page_cache_release(extent_buffer_page(eb, index));
3055 	page_cache_release(extent_buffer_page(eb, 0));
3056 	__free_extent_buffer(eb);
3057 	return exists;
3058 }
3059 
3060 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3061 					 u64 start, unsigned long len,
3062 					  gfp_t mask)
3063 {
3064 	struct extent_buffer *eb;
3065 
3066 	spin_lock(&tree->buffer_lock);
3067 	eb = buffer_search(tree, start);
3068 	if (eb)
3069 		atomic_inc(&eb->refs);
3070 	spin_unlock(&tree->buffer_lock);
3071 
3072 	if (eb)
3073 		mark_page_accessed(eb->first_page);
3074 
3075 	return eb;
3076 }
3077 
3078 void free_extent_buffer(struct extent_buffer *eb)
3079 {
3080 	if (!eb)
3081 		return;
3082 
3083 	if (!atomic_dec_and_test(&eb->refs))
3084 		return;
3085 
3086 	WARN_ON(1);
3087 }
3088 
3089 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3090 			      struct extent_buffer *eb)
3091 {
3092 	unsigned long i;
3093 	unsigned long num_pages;
3094 	struct page *page;
3095 
3096 	num_pages = num_extent_pages(eb->start, eb->len);
3097 
3098 	for (i = 0; i < num_pages; i++) {
3099 		page = extent_buffer_page(eb, i);
3100 		if (!PageDirty(page))
3101 			continue;
3102 
3103 		lock_page(page);
3104 		if (i == 0)
3105 			set_page_extent_head(page, eb->len);
3106 		else
3107 			set_page_private(page, EXTENT_PAGE_PRIVATE);
3108 
3109 		clear_page_dirty_for_io(page);
3110 		spin_lock_irq(&page->mapping->tree_lock);
3111 		if (!PageDirty(page)) {
3112 			radix_tree_tag_clear(&page->mapping->page_tree,
3113 						page_index(page),
3114 						PAGECACHE_TAG_DIRTY);
3115 		}
3116 		spin_unlock_irq(&page->mapping->tree_lock);
3117 		unlock_page(page);
3118 	}
3119 	return 0;
3120 }
3121 
3122 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3123 				    struct extent_buffer *eb)
3124 {
3125 	return wait_on_extent_writeback(tree, eb->start,
3126 					eb->start + eb->len - 1);
3127 }
3128 
3129 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3130 			     struct extent_buffer *eb)
3131 {
3132 	unsigned long i;
3133 	unsigned long num_pages;
3134 	int was_dirty = 0;
3135 
3136 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3137 	num_pages = num_extent_pages(eb->start, eb->len);
3138 	for (i = 0; i < num_pages; i++)
3139 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3140 	return was_dirty;
3141 }
3142 
3143 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3144 				struct extent_buffer *eb)
3145 {
3146 	unsigned long i;
3147 	struct page *page;
3148 	unsigned long num_pages;
3149 
3150 	num_pages = num_extent_pages(eb->start, eb->len);
3151 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3152 
3153 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3154 			      GFP_NOFS);
3155 	for (i = 0; i < num_pages; i++) {
3156 		page = extent_buffer_page(eb, i);
3157 		if (page)
3158 			ClearPageUptodate(page);
3159 	}
3160 	return 0;
3161 }
3162 
3163 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3164 				struct extent_buffer *eb)
3165 {
3166 	unsigned long i;
3167 	struct page *page;
3168 	unsigned long num_pages;
3169 
3170 	num_pages = num_extent_pages(eb->start, eb->len);
3171 
3172 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3173 			    GFP_NOFS);
3174 	for (i = 0; i < num_pages; i++) {
3175 		page = extent_buffer_page(eb, i);
3176 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3177 		    ((i == num_pages - 1) &&
3178 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3179 			check_page_uptodate(tree, page);
3180 			continue;
3181 		}
3182 		SetPageUptodate(page);
3183 	}
3184 	return 0;
3185 }
3186 
3187 int extent_range_uptodate(struct extent_io_tree *tree,
3188 			  u64 start, u64 end)
3189 {
3190 	struct page *page;
3191 	int ret;
3192 	int pg_uptodate = 1;
3193 	int uptodate;
3194 	unsigned long index;
3195 
3196 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3197 	if (ret)
3198 		return 1;
3199 	while (start <= end) {
3200 		index = start >> PAGE_CACHE_SHIFT;
3201 		page = find_get_page(tree->mapping, index);
3202 		uptodate = PageUptodate(page);
3203 		page_cache_release(page);
3204 		if (!uptodate) {
3205 			pg_uptodate = 0;
3206 			break;
3207 		}
3208 		start += PAGE_CACHE_SIZE;
3209 	}
3210 	return pg_uptodate;
3211 }
3212 
3213 int extent_buffer_uptodate(struct extent_io_tree *tree,
3214 			   struct extent_buffer *eb)
3215 {
3216 	int ret = 0;
3217 	unsigned long num_pages;
3218 	unsigned long i;
3219 	struct page *page;
3220 	int pg_uptodate = 1;
3221 
3222 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3223 		return 1;
3224 
3225 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3226 			   EXTENT_UPTODATE, 1);
3227 	if (ret)
3228 		return ret;
3229 
3230 	num_pages = num_extent_pages(eb->start, eb->len);
3231 	for (i = 0; i < num_pages; i++) {
3232 		page = extent_buffer_page(eb, i);
3233 		if (!PageUptodate(page)) {
3234 			pg_uptodate = 0;
3235 			break;
3236 		}
3237 	}
3238 	return pg_uptodate;
3239 }
3240 
3241 int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 			     struct extent_buffer *eb,
3243 			     u64 start, int wait,
3244 			     get_extent_t *get_extent, int mirror_num)
3245 {
3246 	unsigned long i;
3247 	unsigned long start_i;
3248 	struct page *page;
3249 	int err;
3250 	int ret = 0;
3251 	int locked_pages = 0;
3252 	int all_uptodate = 1;
3253 	int inc_all_pages = 0;
3254 	unsigned long num_pages;
3255 	struct bio *bio = NULL;
3256 	unsigned long bio_flags = 0;
3257 
3258 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3259 		return 0;
3260 
3261 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3262 			   EXTENT_UPTODATE, 1)) {
3263 		return 0;
3264 	}
3265 
3266 	if (start) {
3267 		WARN_ON(start < eb->start);
3268 		start_i = (start >> PAGE_CACHE_SHIFT) -
3269 			(eb->start >> PAGE_CACHE_SHIFT);
3270 	} else {
3271 		start_i = 0;
3272 	}
3273 
3274 	num_pages = num_extent_pages(eb->start, eb->len);
3275 	for (i = start_i; i < num_pages; i++) {
3276 		page = extent_buffer_page(eb, i);
3277 		if (!wait) {
3278 			if (!trylock_page(page))
3279 				goto unlock_exit;
3280 		} else {
3281 			lock_page(page);
3282 		}
3283 		locked_pages++;
3284 		if (!PageUptodate(page))
3285 			all_uptodate = 0;
3286 	}
3287 	if (all_uptodate) {
3288 		if (start_i == 0)
3289 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3290 		goto unlock_exit;
3291 	}
3292 
3293 	for (i = start_i; i < num_pages; i++) {
3294 		page = extent_buffer_page(eb, i);
3295 		if (inc_all_pages)
3296 			page_cache_get(page);
3297 		if (!PageUptodate(page)) {
3298 			if (start_i == 0)
3299 				inc_all_pages = 1;
3300 			ClearPageError(page);
3301 			err = __extent_read_full_page(tree, page,
3302 						      get_extent, &bio,
3303 						      mirror_num, &bio_flags);
3304 			if (err)
3305 				ret = err;
3306 		} else {
3307 			unlock_page(page);
3308 		}
3309 	}
3310 
3311 	if (bio)
3312 		submit_one_bio(READ, bio, mirror_num, bio_flags);
3313 
3314 	if (ret || !wait)
3315 		return ret;
3316 
3317 	for (i = start_i; i < num_pages; i++) {
3318 		page = extent_buffer_page(eb, i);
3319 		wait_on_page_locked(page);
3320 		if (!PageUptodate(page))
3321 			ret = -EIO;
3322 	}
3323 
3324 	if (!ret)
3325 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3326 	return ret;
3327 
3328 unlock_exit:
3329 	i = start_i;
3330 	while (locked_pages > 0) {
3331 		page = extent_buffer_page(eb, i);
3332 		i++;
3333 		unlock_page(page);
3334 		locked_pages--;
3335 	}
3336 	return ret;
3337 }
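
/*
 * Illustrative sketch only (not part of the original file): the usual
 * allocate-then-read pattern for a metadata block.  The get_extent
 * callback and block size are left to the caller, and error handling is
 * reduced to the bare minimum.
 */
static struct extent_buffer *__maybe_unused
example_read_eb(struct extent_io_tree *tree, u64 start, u32 blocksize,
		get_extent_t *get_extent)
{
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(tree, start, blocksize, NULL, GFP_NOFS);
	if (!eb)
		return NULL;
	/* wait == 1 so the read completes before the data is used */
	if (read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0)) {
		free_extent_buffer(eb);
		return NULL;
	}
	return eb;
}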
3338 
3339 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3340 			unsigned long start,
3341 			unsigned long len)
3342 {
3343 	size_t cur;
3344 	size_t offset;
3345 	struct page *page;
3346 	char *kaddr;
3347 	char *dst = (char *)dstv;
3348 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3349 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3350 
3351 	WARN_ON(start > eb->len);
3352 	WARN_ON(start + len > eb->start + eb->len);
3353 
3354 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3355 
3356 	while (len > 0) {
3357 		page = extent_buffer_page(eb, i);
3358 
3359 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3360 		kaddr = kmap_atomic(page, KM_USER1);
3361 		memcpy(dst, kaddr + offset, cur);
3362 		kunmap_atomic(kaddr, KM_USER1);
3363 
3364 		dst += cur;
3365 		len -= cur;
3366 		offset = 0;
3367 		i++;
3368 	}
3369 }
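
/*
 * Illustrative sketch only (not part of the original file): pulling a
 * fixed-size little-endian field out of an extent buffer, roughly the
 * pattern the on-disk accessors are built around.  The offset is assumed
 * to be a valid byte offset inside the buffer.
 */
static u64 __maybe_unused example_read_u64(struct extent_buffer *eb,
					   unsigned long offset)
{
	__le64 raw;

	read_extent_buffer(eb, &raw, offset, sizeof(raw));
	return le64_to_cpu(raw);
}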
3370 
3371 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3372 			       unsigned long min_len, char **token, char **map,
3373 			       unsigned long *map_start,
3374 			       unsigned long *map_len, int km)
3375 {
3376 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3377 	char *kaddr;
3378 	struct page *p;
3379 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3380 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3381 	unsigned long end_i = (start_offset + start + min_len - 1) >>
3382 		PAGE_CACHE_SHIFT;
3383 
3384 	if (i != end_i)
3385 		return -EINVAL;
3386 
3387 	if (i == 0) {
3388 		offset = start_offset;
3389 		*map_start = 0;
3390 	} else {
3391 		offset = 0;
3392 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3393 	}
3394 
3395 	if (start + min_len > eb->len) {
3396 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3397 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3398 		       eb->len, start, min_len);
3399 		WARN_ON(1);
3400 	}
3401 
3402 	p = extent_buffer_page(eb, i);
3403 	kaddr = kmap_atomic(p, km);
3404 	*token = kaddr;
3405 	*map = kaddr + offset;
3406 	*map_len = PAGE_CACHE_SIZE - offset;
3407 	return 0;
3408 }
3409 
3410 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3411 		      unsigned long min_len,
3412 		      char **token, char **map,
3413 		      unsigned long *map_start,
3414 		      unsigned long *map_len, int km)
3415 {
3416 	int err;
3417 	int save = 0;
3418 	if (eb->map_token) {
3419 		unmap_extent_buffer(eb, eb->map_token, km);
3420 		eb->map_token = NULL;
3421 		save = 1;
3422 	}
3423 	err = map_private_extent_buffer(eb, start, min_len, token, map,
3424 				       map_start, map_len, km);
3425 	if (!err && save) {
3426 		eb->map_token = *token;
3427 		eb->kaddr = *map;
3428 		eb->map_start = *map_start;
3429 		eb->map_len = *map_len;
3430 	}
3431 	return err;
3432 }
3433 
3434 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3435 {
3436 	kunmap_atomic(token, km);
3437 }
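
/*
 * Illustrative sketch only (not part of the original file): peeking at a
 * single byte through map_private_extent_buffer().  The requested range
 * must fit inside one page and nothing may sleep while the atomic
 * mapping is held.
 */
static int __maybe_unused example_peek_byte(struct extent_buffer *eb,
					    unsigned long start, u8 *out)
{
	char *token;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;

	err = map_private_extent_buffer(eb, start, 1, &token, &kaddr,
					&map_start, &map_len, KM_USER0);
	if (err)
		return err;
	/* kaddr points at eb offset map_start, not at 'start' itself */
	*out = *(u8 *)(kaddr + (start - map_start));
	unmap_extent_buffer(eb, token, KM_USER0);
	return 0;
}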
3438 
3439 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3440 			  unsigned long start,
3441 			  unsigned long len)
3442 {
3443 	size_t cur;
3444 	size_t offset;
3445 	struct page *page;
3446 	char *kaddr;
3447 	char *ptr = (char *)ptrv;
3448 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3449 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3450 	int ret = 0;
3451 
3452 	WARN_ON(start > eb->len);
3453 	WARN_ON(start + len > eb->start + eb->len);
3454 
3455 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3456 
3457 	while (len > 0) {
3458 		page = extent_buffer_page(eb, i);
3459 
3460 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3461 
3462 		kaddr = kmap_atomic(page, KM_USER0);
3463 		ret = memcmp(ptr, kaddr + offset, cur);
3464 		kunmap_atomic(kaddr, KM_USER0);
3465 		if (ret)
3466 			break;
3467 
3468 		ptr += cur;
3469 		len -= cur;
3470 		offset = 0;
3471 		i++;
3472 	}
3473 	return ret;
3474 }
3475 
3476 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3477 			 unsigned long start, unsigned long len)
3478 {
3479 	size_t cur;
3480 	size_t offset;
3481 	struct page *page;
3482 	char *kaddr;
3483 	char *src = (char *)srcv;
3484 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3485 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3486 
3487 	WARN_ON(start > eb->len);
3488 	WARN_ON(start + len > eb->start + eb->len);
3489 
3490 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3491 
3492 	while (len > 0) {
3493 		page = extent_buffer_page(eb, i);
3494 		WARN_ON(!PageUptodate(page));
3495 
3496 		cur = min(len, PAGE_CACHE_SIZE - offset);
3497 		kaddr = kmap_atomic(page, KM_USER1);
3498 		memcpy(kaddr + offset, src, cur);
3499 		kunmap_atomic(kaddr, KM_USER1);
3500 
3501 		src += cur;
3502 		len -= cur;
3503 		offset = 0;
3504 		i++;
3505 	}
3506 }
3507 
3508 void memset_extent_buffer(struct extent_buffer *eb, char c,
3509 			  unsigned long start, unsigned long len)
3510 {
3511 	size_t cur;
3512 	size_t offset;
3513 	struct page *page;
3514 	char *kaddr;
3515 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3516 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3517 
3518 	WARN_ON(start > eb->len);
3519 	WARN_ON(start + len > eb->start + eb->len);
3520 
3521 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3522 
3523 	while (len > 0) {
3524 		page = extent_buffer_page(eb, i);
3525 		WARN_ON(!PageUptodate(page));
3526 
3527 		cur = min(len, PAGE_CACHE_SIZE - offset);
3528 		kaddr = kmap_atomic(page, KM_USER0);
3529 		memset(kaddr + offset, c, cur);
3530 		kunmap_atomic(kaddr, KM_USER0);
3531 
3532 		len -= cur;
3533 		offset = 0;
3534 		i++;
3535 	}
3536 }
3537 
3538 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3539 			unsigned long dst_offset, unsigned long src_offset,
3540 			unsigned long len)
3541 {
3542 	u64 dst_len = dst->len;
3543 	size_t cur;
3544 	size_t offset;
3545 	struct page *page;
3546 	char *kaddr;
3547 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3548 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3549 
3550 	WARN_ON(src->len != dst_len);
3551 
3552 	offset = (start_offset + dst_offset) &
3553 		((unsigned long)PAGE_CACHE_SIZE - 1);
3554 
3555 	while (len > 0) {
3556 		page = extent_buffer_page(dst, i);
3557 		WARN_ON(!PageUptodate(page));
3558 
3559 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3560 
3561 		kaddr = kmap_atomic(page, KM_USER0);
3562 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3563 		kunmap_atomic(kaddr, KM_USER0);
3564 
3565 		src_offset += cur;
3566 		len -= cur;
3567 		offset = 0;
3568 		i++;
3569 	}
3570 }
3571 
3572 static void move_pages(struct page *dst_page, struct page *src_page,
3573 		       unsigned long dst_off, unsigned long src_off,
3574 		       unsigned long len)
3575 {
3576 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3577 	if (dst_page == src_page) {
3578 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3579 	} else {
3580 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3581 		char *p = dst_kaddr + dst_off + len;
3582 		char *s = src_kaddr + src_off + len;
3583 
3584 		while (len--)
3585 			*--p = *--s;
3586 
3587 		kunmap_atomic(src_kaddr, KM_USER1);
3588 	}
3589 	kunmap_atomic(dst_kaddr, KM_USER0);
3590 }
3591 
3592 static void copy_pages(struct page *dst_page, struct page *src_page,
3593 		       unsigned long dst_off, unsigned long src_off,
3594 		       unsigned long len)
3595 {
3596 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3597 	char *src_kaddr;
3598 
3599 	if (dst_page != src_page)
3600 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3601 	else
3602 		src_kaddr = dst_kaddr;
3603 
3604 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3605 	kunmap_atomic(dst_kaddr, KM_USER0);
3606 	if (dst_page != src_page)
3607 		kunmap_atomic(src_kaddr, KM_USER1);
3608 }
3609 
3610 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3611 			   unsigned long src_offset, unsigned long len)
3612 {
3613 	size_t cur;
3614 	size_t dst_off_in_page;
3615 	size_t src_off_in_page;
3616 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3617 	unsigned long dst_i;
3618 	unsigned long src_i;
3619 
3620 	if (src_offset + len > dst->len) {
3621 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3622 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3623 		BUG_ON(1);
3624 	}
3625 	if (dst_offset + len > dst->len) {
3626 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3627 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3628 		BUG_ON(1);
3629 	}
3630 
3631 	while (len > 0) {
3632 		dst_off_in_page = (start_offset + dst_offset) &
3633 			((unsigned long)PAGE_CACHE_SIZE - 1);
3634 		src_off_in_page = (start_offset + src_offset) &
3635 			((unsigned long)PAGE_CACHE_SIZE - 1);
3636 
3637 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3638 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3639 
3640 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3641 					       src_off_in_page));
3642 		cur = min_t(unsigned long, cur,
3643 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3644 
3645 		copy_pages(extent_buffer_page(dst, dst_i),
3646 			   extent_buffer_page(dst, src_i),
3647 			   dst_off_in_page, src_off_in_page, cur);
3648 
3649 		src_offset += cur;
3650 		dst_offset += cur;
3651 		len -= cur;
3652 	}
3653 }
3654 
3655 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3656 			   unsigned long src_offset, unsigned long len)
3657 {
3658 	size_t cur;
3659 	size_t dst_off_in_page;
3660 	size_t src_off_in_page;
3661 	unsigned long dst_end = dst_offset + len - 1;
3662 	unsigned long src_end = src_offset + len - 1;
3663 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3664 	unsigned long dst_i;
3665 	unsigned long src_i;
3666 
3667 	if (src_offset + len > dst->len) {
3668 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3669 		       "len %lu len %lu\n", src_offset, len, dst->len);
3670 		BUG_ON(1);
3671 	}
3672 	if (dst_offset + len > dst->len) {
3673 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3674 		       "len %lu len %lu\n", dst_offset, len, dst->len);
3675 		BUG_ON(1);
3676 	}
3677 	if (dst_offset < src_offset) {
3678 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3679 		return;
3680 	}
3681 	while (len > 0) {
3682 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3683 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3684 
3685 		dst_off_in_page = (start_offset + dst_end) &
3686 			((unsigned long)PAGE_CACHE_SIZE - 1);
3687 		src_off_in_page = (start_offset + src_end) &
3688 			((unsigned long)PAGE_CACHE_SIZE - 1);
3689 
3690 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3691 		cur = min(cur, dst_off_in_page + 1);
3692 		move_pages(extent_buffer_page(dst, dst_i),
3693 			   extent_buffer_page(dst, src_i),
3694 			   dst_off_in_page - cur + 1,
3695 			   src_off_in_page - cur + 1, cur);
3696 
3697 		dst_end -= cur;
3698 		src_end -= cur;
3699 		len -= cur;
3700 	}
3701 }
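
/*
 * Added note: memmove_extent_buffer() handles overlapping ranges; when the
 * destination is above the source it copies backwards via move_pages().
 * A typical (hypothetical) use is opening a gap of 'size' bytes at offset
 * 'off' in a buffer that currently holds 'used' bytes:
 *
 *	memmove_extent_buffer(eb, off + size, off, used - off);
 */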
3702 
3703 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3704 {
3705 	u64 start = page_offset(page);
3706 	struct extent_buffer *eb;
3707 	int ret = 1;
3708 	unsigned long i;
3709 	unsigned long num_pages;
3710 
3711 	spin_lock(&tree->buffer_lock);
3712 	eb = buffer_search(tree, start);
3713 	if (!eb)
3714 		goto out;
3715 
3716 	if (atomic_read(&eb->refs) > 1) {
3717 		ret = 0;
3718 		goto out;
3719 	}
3720 	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3721 		ret = 0;
3722 		goto out;
3723 	}
3724 	/* at this point we can safely release the extent buffer */
3725 	num_pages = num_extent_pages(eb->start, eb->len);
3726 	for (i = 0; i < num_pages; i++)
3727 		page_cache_release(extent_buffer_page(eb, i));
3728 	rb_erase(&eb->rb_node, &tree->buffer);
3729 	__free_extent_buffer(eb);
3730 out:
3731 	spin_unlock(&tree->buffer_lock);
3732 	return ret;
3733 }
3734