xref: /linux/fs/btrfs/extent_io.c (revision b233b28eac0cc37d07c2d007ea08c86c778c5af4)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 
20 /* temporary define until extent_map moves out of btrfs */
21 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
22 				       unsigned long extra_flags,
23 				       void (*ctor)(void *, struct kmem_cache *,
24 						    unsigned long));
25 
26 static struct kmem_cache *extent_state_cache;
27 static struct kmem_cache *extent_buffer_cache;
28 
29 static LIST_HEAD(buffers);
30 static LIST_HEAD(states);
31 
32 #define LEAK_DEBUG 0
33 #if LEAK_DEBUG
34 static DEFINE_SPINLOCK(leak_lock);
35 #endif
36 
37 #define BUFFER_LRU_MAX 64
38 
39 struct tree_entry {
40 	u64 start;
41 	u64 end;
42 	struct rb_node rb_node;
43 };
44 
45 struct extent_page_data {
46 	struct bio *bio;
47 	struct extent_io_tree *tree;
48 	get_extent_t *get_extent;
49 
50 	/* tells writepage not to lock the state bits for this range;
51 	 * it still does the unlocking
52 	 */
53 	int extent_locked;
54 };
55 
56 int __init extent_io_init(void)
57 {
58 	extent_state_cache = btrfs_cache_create("extent_state",
59 					    sizeof(struct extent_state), 0,
60 					    NULL);
61 	if (!extent_state_cache)
62 		return -ENOMEM;
63 
64 	extent_buffer_cache = btrfs_cache_create("extent_buffers",
65 					    sizeof(struct extent_buffer), 0,
66 					    NULL);
67 	if (!extent_buffer_cache)
68 		goto free_state_cache;
69 	return 0;
70 
71 free_state_cache:
72 	kmem_cache_destroy(extent_state_cache);
73 	return -ENOMEM;
74 }
75 
76 void extent_io_exit(void)
77 {
78 	struct extent_state *state;
79 	struct extent_buffer *eb;
80 
81 	while (!list_empty(&states)) {
82 		state = list_entry(states.next, struct extent_state, leak_list);
83 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
84 		       "state %lu in tree %p refs %d\n",
85 		       (unsigned long long)state->start,
86 		       (unsigned long long)state->end,
87 		       state->state, state->tree, atomic_read(&state->refs));
88 		list_del(&state->leak_list);
89 		kmem_cache_free(extent_state_cache, state);
90 
91 	}
92 
93 	while (!list_empty(&buffers)) {
94 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
95 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
96 		       "refs %d\n", (unsigned long long)eb->start,
97 		       eb->len, atomic_read(&eb->refs));
98 		list_del(&eb->leak_list);
99 		kmem_cache_free(extent_buffer_cache, eb);
100 	}
101 	if (extent_state_cache)
102 		kmem_cache_destroy(extent_state_cache);
103 	if (extent_buffer_cache)
104 		kmem_cache_destroy(extent_buffer_cache);
105 }
106 
107 void extent_io_tree_init(struct extent_io_tree *tree,
108 			  struct address_space *mapping, gfp_t mask)
109 {
110 	tree->state.rb_node = NULL;
111 	tree->buffer.rb_node = NULL;
112 	tree->ops = NULL;
113 	tree->dirty_bytes = 0;
114 	spin_lock_init(&tree->lock);
115 	spin_lock_init(&tree->buffer_lock);
116 	tree->mapping = mapping;
117 }
118 
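/*
 * Minimal usage sketch (hypothetical helper, purely illustrative): an io
 * tree is normally embedded in a larger structure and initialized once
 * against the owning address_space, as btrfs does with the per-inode
 * io_tree.
 */
#if 0
static void example_init_io_tree(struct inode *inode)
{
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	/* tie the tree to the inode's page cache mapping */
	extent_io_tree_init(tree, inode->i_mapping, GFP_NOFS);
}
#endif
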
119 static struct extent_state *alloc_extent_state(gfp_t mask)
120 {
121 	struct extent_state *state;
122 #if LEAK_DEBUG
123 	unsigned long flags;
124 #endif
125 
126 	state = kmem_cache_alloc(extent_state_cache, mask);
127 	if (!state)
128 		return state;
129 	state->state = 0;
130 	state->private = 0;
131 	state->tree = NULL;
132 #if LEAK_DEBUG
133 	spin_lock_irqsave(&leak_lock, flags);
134 	list_add(&state->leak_list, &states);
135 	spin_unlock_irqrestore(&leak_lock, flags);
136 #endif
137 	atomic_set(&state->refs, 1);
138 	init_waitqueue_head(&state->wq);
139 	return state;
140 }
141 
142 static void free_extent_state(struct extent_state *state)
143 {
144 	if (!state)
145 		return;
146 	if (atomic_dec_and_test(&state->refs)) {
147 #if LEAK_DEBUG
148 		unsigned long flags;
149 #endif
150 		WARN_ON(state->tree);
151 #if LEAK_DEBUG
152 		spin_lock_irqsave(&leak_lock, flags);
153 		list_del(&state->leak_list);
154 		spin_unlock_irqrestore(&leak_lock, flags);
155 #endif
156 		kmem_cache_free(extent_state_cache, state);
157 	}
158 }
159 
160 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
161 				   struct rb_node *node)
162 {
163 	struct rb_node **p = &root->rb_node;
164 	struct rb_node *parent = NULL;
165 	struct tree_entry *entry;
166 
167 	while (*p) {
168 		parent = *p;
169 		entry = rb_entry(parent, struct tree_entry, rb_node);
170 
171 		if (offset < entry->start)
172 			p = &(*p)->rb_left;
173 		else if (offset > entry->end)
174 			p = &(*p)->rb_right;
175 		else
176 			return parent;
177 	}
178 
179 	entry = rb_entry(node, struct tree_entry, rb_node);
180 	rb_link_node(node, parent, p);
181 	rb_insert_color(node, root);
182 	return NULL;
183 }
184 
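/*
 * search the state tree for the entry containing 'offset'.  If one is
 * found its rb_node is returned.  Otherwise NULL is returned and, when
 * the pointers are supplied, *prev_ret is set to the nearest entry that
 * ends at or after 'offset' and *next_ret to the nearest entry that ends
 * before it (either may be NULL if no such entry exists).
 */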
185 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
186 				     struct rb_node **prev_ret,
187 				     struct rb_node **next_ret)
188 {
189 	struct rb_root *root = &tree->state;
190 	struct rb_node *n = root->rb_node;
191 	struct rb_node *prev = NULL;
192 	struct rb_node *orig_prev = NULL;
193 	struct tree_entry *entry;
194 	struct tree_entry *prev_entry = NULL;
195 
196 	while (n) {
197 		entry = rb_entry(n, struct tree_entry, rb_node);
198 		prev = n;
199 		prev_entry = entry;
200 
201 		if (offset < entry->start)
202 			n = n->rb_left;
203 		else if (offset > entry->end)
204 			n = n->rb_right;
205 		else
206 			return n;
207 	}
208 
209 	if (prev_ret) {
210 		orig_prev = prev;
211 		while (prev && offset > prev_entry->end) {
212 			prev = rb_next(prev);
213 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214 		}
215 		*prev_ret = prev;
216 		prev = orig_prev;
217 	}
218 
219 	if (next_ret) {
220 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		while (prev && offset < prev_entry->start) {
222 			prev = rb_prev(prev);
223 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
224 		}
225 		*next_ret = prev;
226 	}
227 	return NULL;
228 }
229 
230 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
231 					  u64 offset)
232 {
233 	struct rb_node *prev = NULL;
234 	struct rb_node *ret;
235 
236 	ret = __etree_search(tree, offset, &prev, NULL);
237 	if (!ret)
238 		return prev;
239 	return ret;
240 }
241 
242 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
243 					  u64 offset, struct rb_node *node)
244 {
245 	struct rb_root *root = &tree->buffer;
246 	struct rb_node **p = &root->rb_node;
247 	struct rb_node *parent = NULL;
248 	struct extent_buffer *eb;
249 
250 	while (*p) {
251 		parent = *p;
252 		eb = rb_entry(parent, struct extent_buffer, rb_node);
253 
254 		if (offset < eb->start)
255 			p = &(*p)->rb_left;
256 		else if (offset > eb->start)
257 			p = &(*p)->rb_right;
258 		else
259 			return eb;
260 	}
261 
262 	rb_link_node(node, parent, p);
263 	rb_insert_color(node, root);
264 	return NULL;
265 }
266 
267 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
268 					   u64 offset)
269 {
270 	struct rb_root *root = &tree->buffer;
271 	struct rb_node *n = root->rb_node;
272 	struct extent_buffer *eb;
273 
274 	while (n) {
275 		eb = rb_entry(n, struct extent_buffer, rb_node);
276 		if (offset < eb->start)
277 			n = n->rb_left;
278 		else if (offset > eb->start)
279 			n = n->rb_right;
280 		else
281 			return eb;
282 	}
283 	return NULL;
284 }
285 
286 /*
287  * utility function to look for merge candidates inside a given range.
288  * Any extents with matching state are merged together into a single
289  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY in
290  * their state field are not merged, because the end_io handlers need to
291  * be able to do operations on them without sleeping (or allocations/splits).
292  *
293  * This should be called with the tree lock held.
294  */
295 static int merge_state(struct extent_io_tree *tree,
296 		       struct extent_state *state)
297 {
298 	struct extent_state *other;
299 	struct rb_node *other_node;
300 
301 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
302 		return 0;
303 
304 	other_node = rb_prev(&state->rb_node);
305 	if (other_node) {
306 		other = rb_entry(other_node, struct extent_state, rb_node);
307 		if (other->end == state->start - 1 &&
308 		    other->state == state->state) {
309 			state->start = other->start;
310 			other->tree = NULL;
311 			rb_erase(&other->rb_node, &tree->state);
312 			free_extent_state(other);
313 		}
314 	}
315 	other_node = rb_next(&state->rb_node);
316 	if (other_node) {
317 		other = rb_entry(other_node, struct extent_state, rb_node);
318 		if (other->start == state->end + 1 &&
319 		    other->state == state->state) {
320 			other->start = state->start;
321 			state->tree = NULL;
322 			rb_erase(&state->rb_node, &tree->state);
323 			free_extent_state(state);
324 		}
325 	}
326 	return 0;
327 }
328 
329 static void set_state_cb(struct extent_io_tree *tree,
330 			 struct extent_state *state,
331 			 unsigned long bits)
332 {
333 	if (tree->ops && tree->ops->set_bit_hook) {
334 		tree->ops->set_bit_hook(tree->mapping->host, state->start,
335 					state->end, state->state, bits);
336 	}
337 }
338 
339 static void clear_state_cb(struct extent_io_tree *tree,
340 			   struct extent_state *state,
341 			   unsigned long bits)
342 {
343 	if (tree->ops && tree->ops->clear_bit_hook) {
344 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
345 					  state->end, state->state, bits);
346 	}
347 }
348 
349 /*
350  * insert an extent_state struct into the tree.  'bits' are set on the
351  * struct before it is inserted.
352  *
353  * This may return -EEXIST if the extent is already there, in which case the
354  * state struct is freed.
355  *
356  * The tree lock is not taken internally.  This is a utility function and
357  * probably isn't what you want to call (see set/clear_extent_bit).
358  */
359 static int insert_state(struct extent_io_tree *tree,
360 			struct extent_state *state, u64 start, u64 end,
361 			int bits)
362 {
363 	struct rb_node *node;
364 
365 	if (end < start) {
366 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
367 		       (unsigned long long)end,
368 		       (unsigned long long)start);
369 		WARN_ON(1);
370 	}
371 	if (bits & EXTENT_DIRTY)
372 		tree->dirty_bytes += end - start + 1;
373 	set_state_cb(tree, state, bits);
374 	state->state |= bits;
375 	state->start = start;
376 	state->end = end;
377 	node = tree_insert(&tree->state, end, &state->rb_node);
378 	if (node) {
379 		struct extent_state *found;
380 		found = rb_entry(node, struct extent_state, rb_node);
381 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
382 		       "%llu %llu\n", (unsigned long long)found->start,
383 		       (unsigned long long)found->end,
384 		       (unsigned long long)start, (unsigned long long)end);
385 		free_extent_state(state);
386 		return -EEXIST;
387 	}
388 	state->tree = tree;
389 	merge_state(tree, state);
390 	return 0;
391 }
392 
393 /*
394  * split a given extent state struct in two, inserting the preallocated
395  * struct 'prealloc' as the newly created second half.  'split' indicates an
396  * offset inside 'orig' where it should be split.
397  *
398  * Before calling,
399  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
400  * are two extent state structs in the tree:
401  * prealloc: [orig->start, split - 1]
402  * orig: [ split, orig->end ]
403  *
404  * The tree locks are not taken by this function. They need to be held
405  * by the caller.
406  */
407 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
408 		       struct extent_state *prealloc, u64 split)
409 {
410 	struct rb_node *node;
411 	prealloc->start = orig->start;
412 	prealloc->end = split - 1;
413 	prealloc->state = orig->state;
414 	orig->start = split;
415 
416 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
417 	if (node) {
418 		struct extent_state *found;
419 		found = rb_entry(node, struct extent_state, rb_node);
420 		free_extent_state(prealloc);
421 		return -EEXIST;
422 	}
423 	prealloc->tree = tree;
424 	return 0;
425 }
426 
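/*
 * Worked example: if 'orig' covers [0, 8191] and 'split' is 4096,
 * split_state() leaves 'prealloc' holding [0, 4095] and 'orig' holding
 * [4096, 8191], with both halves carrying the bits of the original state.
 */
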
427 /*
428  * utility function to clear some bits in an extent state struct.
429  * It will optionally wake up anyone waiting on this state (wake == 1), or
430  * forcibly remove the state from the tree (delete == 1).
431  *
432  * If no bits are set on the state struct after clearing things, the
433  * struct is freed and removed from the tree.
434  */
435 static int clear_state_bit(struct extent_io_tree *tree,
436 			    struct extent_state *state, int bits, int wake,
437 			    int delete)
438 {
439 	int ret = state->state & bits;
440 
441 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
442 		u64 range = state->end - state->start + 1;
443 		WARN_ON(range > tree->dirty_bytes);
444 		tree->dirty_bytes -= range;
445 	}
446 	clear_state_cb(tree, state, bits);
447 	state->state &= ~bits;
448 	if (wake)
449 		wake_up(&state->wq);
450 	if (delete || state->state == 0) {
451 		if (state->tree) {
452 			clear_state_cb(tree, state, state->state);
453 			rb_erase(&state->rb_node, &tree->state);
454 			state->tree = NULL;
455 			free_extent_state(state);
456 		} else {
457 			WARN_ON(1);
458 		}
459 	} else {
460 		merge_state(tree, state);
461 	}
462 	return ret;
463 }
464 
465 /*
466  * clear some bits on a range in the tree.  This may require splitting
467  * or inserting elements in the tree, so the gfp mask is used to
468  * indicate which allocations or sleeping are allowed.
469  *
470  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
471  * the given range from the tree regardless of state (ie for truncate).
472  *
473  * the range [start, end] is inclusive.
474  *
475  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
476  * bits were already set, or zero if none of the bits were already set.
477  */
478 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
479 		     int bits, int wake, int delete, gfp_t mask)
480 {
481 	struct extent_state *state;
482 	struct extent_state *prealloc = NULL;
483 	struct rb_node *node;
484 	int err;
485 	int set = 0;
486 
487 again:
488 	if (!prealloc && (mask & __GFP_WAIT)) {
489 		prealloc = alloc_extent_state(mask);
490 		if (!prealloc)
491 			return -ENOMEM;
492 	}
493 
494 	spin_lock(&tree->lock);
495 	/*
496 	 * this search will find the extents that end after
497 	 * our range starts
498 	 */
499 	node = tree_search(tree, start);
500 	if (!node)
501 		goto out;
502 	state = rb_entry(node, struct extent_state, rb_node);
503 	if (state->start > end)
504 		goto out;
505 	WARN_ON(state->end < start);
506 
507 	/*
508 	 *     | ---- desired range ---- |
509 	 *  | state | or
510 	 *  | ------------- state -------------- |
511 	 *
512 	 * We need to split the extent we found, and may flip
513 	 * bits on second half.
514 	 *
515 	 * If the extent we found extends past our range, we
516 	 * just split and search again.  It'll get split again
517 	 * the next time though.
518 	 *
519 	 * If the extent we found is inside our range, we clear
520 	 * the desired bit on it.
521 	 */
522 
523 	if (state->start < start) {
524 		if (!prealloc)
525 			prealloc = alloc_extent_state(GFP_ATOMIC);
526 		err = split_state(tree, state, prealloc, start);
527 		BUG_ON(err == -EEXIST);
528 		prealloc = NULL;
529 		if (err)
530 			goto out;
531 		if (state->end <= end) {
532 			start = state->end + 1;
533 			set |= clear_state_bit(tree, state, bits,
534 					wake, delete);
535 		} else {
536 			start = state->start;
537 		}
538 		goto search_again;
539 	}
540 	/*
541 	 * | ---- desired range ---- |
542 	 *                        | state |
543 	 * We need to split the extent, and clear the bit
544 	 * on the first half
545 	 */
546 	if (state->start <= end && state->end > end) {
547 		if (!prealloc)
548 			prealloc = alloc_extent_state(GFP_ATOMIC);
549 		err = split_state(tree, state, prealloc, end + 1);
550 		BUG_ON(err == -EEXIST);
551 
552 		if (wake)
553 			wake_up(&state->wq);
554 		set |= clear_state_bit(tree, prealloc, bits,
555 				       wake, delete);
556 		prealloc = NULL;
557 		goto out;
558 	}
559 
560 	start = state->end + 1;
561 	set |= clear_state_bit(tree, state, bits, wake, delete);
562 	goto search_again;
563 
564 out:
565 	spin_unlock(&tree->lock);
566 	if (prealloc)
567 		free_extent_state(prealloc);
568 
569 	return set;
570 
571 search_again:
572 	if (start > end)
573 		goto out;
574 	spin_unlock(&tree->lock);
575 	if (mask & __GFP_WAIT)
576 		cond_resched();
577 	goto again;
578 }
579 
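/*
 * Usage sketch (hypothetical helper, illustrative only): dropping all
 * dirty and delalloc state for a range the way a truncate-style caller
 * might, waking any waiters and deleting the state structs outright.
 */
#if 0
static void example_drop_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	/* wake == 1, delete == 1: remove the states regardless of what is set */
	clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC,
			 1, 1, GFP_NOFS);
}
#endif
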
580 static int wait_on_state(struct extent_io_tree *tree,
581 			 struct extent_state *state)
582 		__releases(tree->lock)
583 		__acquires(tree->lock)
584 {
585 	DEFINE_WAIT(wait);
586 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
587 	spin_unlock(&tree->lock);
588 	schedule();
589 	spin_lock(&tree->lock);
590 	finish_wait(&state->wq, &wait);
591 	return 0;
592 }
593 
594 /*
595  * waits for one or more bits to clear on a range in the state tree.
596  * The range [start, end] is inclusive.
597  * The tree lock is taken by this function
598  */
599 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
600 {
601 	struct extent_state *state;
602 	struct rb_node *node;
603 
604 	spin_lock(&tree->lock);
605 again:
606 	while (1) {
607 		/*
608 		 * this search will find all the extents that end after
609 		 * our range starts
610 		 */
611 		node = tree_search(tree, start);
612 		if (!node)
613 			break;
614 
615 		state = rb_entry(node, struct extent_state, rb_node);
616 
617 		if (state->start > end)
618 			goto out;
619 
620 		if (state->state & bits) {
621 			start = state->start;
622 			atomic_inc(&state->refs);
623 			wait_on_state(tree, state);
624 			free_extent_state(state);
625 			goto again;
626 		}
627 		start = state->end + 1;
628 
629 		if (start > end)
630 			break;
631 
632 		if (need_resched()) {
633 			spin_unlock(&tree->lock);
634 			cond_resched();
635 			spin_lock(&tree->lock);
636 		}
637 	}
638 out:
639 	spin_unlock(&tree->lock);
640 	return 0;
641 }
642 
643 static void set_state_bits(struct extent_io_tree *tree,
644 			   struct extent_state *state,
645 			   int bits)
646 {
647 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
648 		u64 range = state->end - state->start + 1;
649 		tree->dirty_bytes += range;
650 	}
651 	set_state_cb(tree, state, bits);
652 	state->state |= bits;
653 }
654 
655 /*
656  * set some bits on a range in the tree.  This may require allocations
657  * or sleeping, so the gfp mask is used to indicate what is allowed.
658  *
659  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
660  * range already has the desired bits set.  The start of the existing
661  * range is returned in failed_start in this case.
662  *
663  * [start, end] is inclusive
664  * This takes the tree lock.
665  */
666 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
667 			  int bits, int exclusive, u64 *failed_start,
668 			  gfp_t mask)
669 {
670 	struct extent_state *state;
671 	struct extent_state *prealloc = NULL;
672 	struct rb_node *node;
673 	int err = 0;
674 	int set;
675 	u64 last_start;
676 	u64 last_end;
677 again:
678 	if (!prealloc && (mask & __GFP_WAIT)) {
679 		prealloc = alloc_extent_state(mask);
680 		if (!prealloc)
681 			return -ENOMEM;
682 	}
683 
684 	spin_lock(&tree->lock);
685 	/*
686 	 * this search will find all the extents that end after
687 	 * our range starts.
688 	 */
689 	node = tree_search(tree, start);
690 	if (!node) {
691 		err = insert_state(tree, prealloc, start, end, bits);
692 		prealloc = NULL;
693 		BUG_ON(err == -EEXIST);
694 		goto out;
695 	}
696 
697 	state = rb_entry(node, struct extent_state, rb_node);
698 	last_start = state->start;
699 	last_end = state->end;
700 
701 	/*
702 	 * | ---- desired range ---- |
703 	 * | state |
704 	 *
705 	 * Just lock what we found and keep going
706 	 */
707 	if (state->start == start && state->end <= end) {
708 		set = state->state & bits;
709 		if (set && exclusive) {
710 			*failed_start = state->start;
711 			err = -EEXIST;
712 			goto out;
713 		}
714 		set_state_bits(tree, state, bits);
715 		start = state->end + 1;
716 		merge_state(tree, state);
717 		goto search_again;
718 	}
719 
720 	/*
721 	 *     | ---- desired range ---- |
722 	 * | state |
723 	 *   or
724 	 * | ------------- state -------------- |
725 	 *
726 	 * We need to split the extent we found, and may flip bits on
727 	 * second half.
728 	 *
729 	 * If the extent we found extends past our
730 	 * range, we just split and search again.  It'll get split
731 	 * again the next time though.
732 	 *
733 	 * If the extent we found is inside our range, we set the
734 	 * desired bit on it.
735 	 */
736 	if (state->start < start) {
737 		set = state->state & bits;
738 		if (exclusive && set) {
739 			*failed_start = start;
740 			err = -EEXIST;
741 			goto out;
742 		}
743 		err = split_state(tree, state, prealloc, start);
744 		BUG_ON(err == -EEXIST);
745 		prealloc = NULL;
746 		if (err)
747 			goto out;
748 		if (state->end <= end) {
749 			set_state_bits(tree, state, bits);
750 			start = state->end + 1;
751 			merge_state(tree, state);
752 		} else {
753 			start = state->start;
754 		}
755 		goto search_again;
756 	}
757 	/*
758 	 * | ---- desired range ---- |
759 	 *     | state | or               | state |
760 	 *
761 	 * There's a hole, we need to insert something in it and
762 	 * ignore the extent we found.
763 	 */
764 	if (state->start > start) {
765 		u64 this_end;
766 		if (end < last_start)
767 			this_end = end;
768 		else
769 			this_end = last_start - 1;
770 		err = insert_state(tree, prealloc, start, this_end,
771 				   bits);
772 		prealloc = NULL;
773 		BUG_ON(err == -EEXIST);
774 		if (err)
775 			goto out;
776 		start = this_end + 1;
777 		goto search_again;
778 	}
779 	/*
780 	 * | ---- desired range ---- |
781 	 *                        | state |
782 	 * We need to split the extent, and set the bit
783 	 * on the first half
784 	 */
785 	if (state->start <= end && state->end > end) {
786 		set = state->state & bits;
787 		if (exclusive && set) {
788 			*failed_start = start;
789 			err = -EEXIST;
790 			goto out;
791 		}
792 		err = split_state(tree, state, prealloc, end + 1);
793 		BUG_ON(err == -EEXIST);
794 
795 		set_state_bits(tree, prealloc, bits);
796 		merge_state(tree, prealloc);
797 		prealloc = NULL;
798 		goto out;
799 	}
800 
801 	goto search_again;
802 
803 out:
804 	spin_unlock(&tree->lock);
805 	if (prealloc)
806 		free_extent_state(prealloc);
807 
808 	return err;
809 
810 search_again:
811 	if (start > end)
812 		goto out;
813 	spin_unlock(&tree->lock);
814 	if (mask & __GFP_WAIT)
815 		cond_resched();
816 	goto again;
817 }
818 
819 /* wrappers around set/clear extent bit */
820 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
821 		     gfp_t mask)
822 {
823 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
824 			      mask);
825 }
826 
827 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
828 		       gfp_t mask)
829 {
830 	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
831 }
832 
833 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
834 		    int bits, gfp_t mask)
835 {
836 	return set_extent_bit(tree, start, end, bits, 0, NULL,
837 			      mask);
838 }
839 
840 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
841 		      int bits, gfp_t mask)
842 {
843 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
844 }
845 
846 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
847 		     gfp_t mask)
848 {
849 	return set_extent_bit(tree, start, end,
850 			      EXTENT_DELALLOC | EXTENT_DIRTY,
851 			      0, NULL, mask);
852 }
853 
854 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
855 		       gfp_t mask)
856 {
857 	return clear_extent_bit(tree, start, end,
858 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
859 }
860 
861 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
862 			 gfp_t mask)
863 {
864 	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
865 }
866 
867 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
868 		     gfp_t mask)
869 {
870 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
871 			      mask);
872 }
873 
874 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
875 		       gfp_t mask)
876 {
877 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
878 }
879 
880 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
881 			gfp_t mask)
882 {
883 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
884 			      mask);
885 }
886 
887 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
888 				 u64 end, gfp_t mask)
889 {
890 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
891 }
892 
893 static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
894 			 gfp_t mask)
895 {
896 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
897 			      0, NULL, mask);
898 }
899 
900 static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
901 				  u64 end, gfp_t mask)
902 {
903 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
904 }
905 
906 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
907 {
908 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
909 }
910 
911 /*
912  * either insert or lock the state struct between start and end.  Use
913  * 'mask' to tell us if waiting is desired.
914  */
915 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
916 {
917 	int err;
918 	u64 failed_start;
919 	while (1) {
920 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
921 				     &failed_start, mask);
922 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
923 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
924 			start = failed_start;
925 		} else {
926 			break;
927 		}
928 		WARN_ON(start > end);
929 	}
930 	return err;
931 }
932 
933 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
934 		    gfp_t mask)
935 {
936 	int err;
937 	u64 failed_start;
938 
939 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
940 			     &failed_start, mask);
941 	if (err == -EEXIST) {
942 		if (failed_start > start)
943 			clear_extent_bit(tree, start, failed_start - 1,
944 					 EXTENT_LOCKED, 1, 0, mask);
945 		return 0;
946 	}
947 	return 1;
948 }
949 
950 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
951 		  gfp_t mask)
952 {
953 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
954 }
955 
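/*
 * Usage sketch (hypothetical helper): the common pattern in this file is
 * to lock the byte range backing a page, do the work, then unlock it.
 * GFP_NOFS lets lock_extent() sleep and allocate while waiting.
 */
#if 0
static void example_lock_page_range(struct extent_io_tree *tree,
				    struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;

	lock_extent(tree, start, end, GFP_NOFS);
	/* ... read or modify the range while it is locked ... */
	unlock_extent(tree, start, end, GFP_NOFS);
}
#endif
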
956 /*
957  * helper function to set pages and extents in the tree dirty
958  */
959 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
960 {
961 	unsigned long index = start >> PAGE_CACHE_SHIFT;
962 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
963 	struct page *page;
964 
965 	while (index <= end_index) {
966 		page = find_get_page(tree->mapping, index);
967 		BUG_ON(!page);
968 		__set_page_dirty_nobuffers(page);
969 		page_cache_release(page);
970 		index++;
971 	}
972 	set_extent_dirty(tree, start, end, GFP_NOFS);
973 	return 0;
974 }
975 
976 /*
977  * helper function to set both pages and extents in the tree writeback
978  */
979 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
980 {
981 	unsigned long index = start >> PAGE_CACHE_SHIFT;
982 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
983 	struct page *page;
984 
985 	while (index <= end_index) {
986 		page = find_get_page(tree->mapping, index);
987 		BUG_ON(!page);
988 		set_page_writeback(page);
989 		page_cache_release(page);
990 		index++;
991 	}
992 	set_extent_writeback(tree, start, end, GFP_NOFS);
993 	return 0;
994 }
995 
996 /*
997  * find the first offset in the io tree with 'bits' set.  Zero is
998  * returned if we find something, and *start_ret and *end_ret are
999  * set to reflect the state struct that was found.
1000  *
1001  * If nothing was found, 1 is returned; < 0 on error.
1002  */
1003 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1004 			  u64 *start_ret, u64 *end_ret, int bits)
1005 {
1006 	struct rb_node *node;
1007 	struct extent_state *state;
1008 	int ret = 1;
1009 
1010 	spin_lock(&tree->lock);
1011 	/*
1012 	 * this search will find all the extents that end after
1013 	 * our range starts.
1014 	 */
1015 	node = tree_search(tree, start);
1016 	if (!node)
1017 		goto out;
1018 
1019 	while (1) {
1020 		state = rb_entry(node, struct extent_state, rb_node);
1021 		if (state->end >= start && (state->state & bits)) {
1022 			*start_ret = state->start;
1023 			*end_ret = state->end;
1024 			ret = 0;
1025 			break;
1026 		}
1027 		node = rb_next(node);
1028 		if (!node)
1029 			break;
1030 	}
1031 out:
1032 	spin_unlock(&tree->lock);
1033 	return ret;
1034 }
1035 
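/*
 * Usage sketch (hypothetical helper): walking every delalloc range in a
 * tree by repeatedly asking for the first extent with the bit set at or
 * after 'cur'.  A non-zero return means nothing further was found.
 */
#if 0
static void example_walk_delalloc(struct extent_io_tree *tree)
{
	u64 cur = 0;
	u64 found_start;
	u64 found_end;

	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
				      EXTENT_DELALLOC)) {
		printk(KERN_INFO "delalloc range %llu-%llu\n",
		       (unsigned long long)found_start,
		       (unsigned long long)found_end);
		cur = found_end + 1;
	}
}
#endif
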
1036 /* find the first state struct with 'bits' set after 'start', and
1037  * return it.  tree->lock must be held.  NULL will be returned if
1038  * nothing was found after 'start'
1039  */
1040 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1041 						 u64 start, int bits)
1042 {
1043 	struct rb_node *node;
1044 	struct extent_state *state;
1045 
1046 	/*
1047 	 * this search will find all the extents that end after
1048 	 * our range starts.
1049 	 */
1050 	node = tree_search(tree, start);
1051 	if (!node)
1052 		goto out;
1053 
1054 	while (1) {
1055 		state = rb_entry(node, struct extent_state, rb_node);
1056 		if (state->end >= start && (state->state & bits))
1057 			return state;
1058 
1059 		node = rb_next(node);
1060 		if (!node)
1061 			break;
1062 	}
1063 out:
1064 	return NULL;
1065 }
1066 
1067 /*
1068  * find a contiguous range of bytes in the file marked as delalloc, not
1069  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1070  *
1071  * 1 is returned if we find something, 0 if nothing was in the tree.
1072  */
1073 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1074 					u64 *start, u64 *end, u64 max_bytes)
1075 {
1076 	struct rb_node *node;
1077 	struct extent_state *state;
1078 	u64 cur_start = *start;
1079 	u64 found = 0;
1080 	u64 total_bytes = 0;
1081 
1082 	spin_lock(&tree->lock);
1083 
1084 	/*
1085 	 * this search will find all the extents that end after
1086 	 * our range starts.
1087 	 */
1088 	node = tree_search(tree, cur_start);
1089 	if (!node) {
1090 		if (!found)
1091 			*end = (u64)-1;
1092 		goto out;
1093 	}
1094 
1095 	while (1) {
1096 		state = rb_entry(node, struct extent_state, rb_node);
1097 		if (found && (state->start != cur_start ||
1098 			      (state->state & EXTENT_BOUNDARY))) {
1099 			goto out;
1100 		}
1101 		if (!(state->state & EXTENT_DELALLOC)) {
1102 			if (!found)
1103 				*end = state->end;
1104 			goto out;
1105 		}
1106 		if (!found)
1107 			*start = state->start;
1108 		found++;
1109 		*end = state->end;
1110 		cur_start = state->end + 1;
1111 		node = rb_next(node);
1112 		if (!node)
1113 			break;
1114 		total_bytes += state->end - state->start + 1;
1115 		if (total_bytes >= max_bytes)
1116 			break;
1117 	}
1118 out:
1119 	spin_unlock(&tree->lock);
1120 	return found;
1121 }
1122 
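/*
 * unlock all the page cache pages covering [start, end] except for
 * locked_page, which the caller is still responsible for, and drop the
 * extra references taken while finding them.  Used to back out the page
 * locks taken while setting up a delalloc range.
 */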
1123 static noinline int __unlock_for_delalloc(struct inode *inode,
1124 					  struct page *locked_page,
1125 					  u64 start, u64 end)
1126 {
1127 	int ret;
1128 	struct page *pages[16];
1129 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1130 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1131 	unsigned long nr_pages = end_index - index + 1;
1132 	int i;
1133 
1134 	if (index == locked_page->index && end_index == index)
1135 		return 0;
1136 
1137 	while (nr_pages > 0) {
1138 		ret = find_get_pages_contig(inode->i_mapping, index,
1139 				     min_t(unsigned long, nr_pages,
1140 				     ARRAY_SIZE(pages)), pages);
1141 		for (i = 0; i < ret; i++) {
1142 			if (pages[i] != locked_page)
1143 				unlock_page(pages[i]);
1144 			page_cache_release(pages[i]);
1145 		}
1146 		nr_pages -= ret;
1147 		index += ret;
1148 		cond_resched();
1149 	}
1150 	return 0;
1151 }
1152 
1153 static noinline int lock_delalloc_pages(struct inode *inode,
1154 					struct page *locked_page,
1155 					u64 delalloc_start,
1156 					u64 delalloc_end)
1157 {
1158 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1159 	unsigned long start_index = index;
1160 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1161 	unsigned long pages_locked = 0;
1162 	struct page *pages[16];
1163 	unsigned long nrpages;
1164 	int ret;
1165 	int i;
1166 
1167 	/* the caller is responsible for locking the start index */
1168 	if (index == locked_page->index && index == end_index)
1169 		return 0;
1170 
1171 	/* skip the page at the start index */
1172 	nrpages = end_index - index + 1;
1173 	while (nrpages > 0) {
1174 		ret = find_get_pages_contig(inode->i_mapping, index,
1175 				     min_t(unsigned long,
1176 				     nrpages, ARRAY_SIZE(pages)), pages);
1177 		if (ret == 0) {
1178 			ret = -EAGAIN;
1179 			goto done;
1180 		}
1181 		/* now we have an array of pages, lock them all */
1182 		for (i = 0; i < ret; i++) {
1183 			/*
1184 			 * the caller is taking responsibility for
1185 			 * locked_page
1186 			 */
1187 			if (pages[i] != locked_page) {
1188 				lock_page(pages[i]);
1189 				if (!PageDirty(pages[i]) ||
1190 				    pages[i]->mapping != inode->i_mapping) {
1191 					ret = -EAGAIN;
1192 					unlock_page(pages[i]);
1193 					page_cache_release(pages[i]);
1194 					goto done;
1195 				}
1196 			}
1197 			page_cache_release(pages[i]);
1198 			pages_locked++;
1199 		}
1200 		nrpages -= ret;
1201 		index += ret;
1202 		cond_resched();
1203 	}
1204 	ret = 0;
1205 done:
1206 	if (ret && pages_locked) {
1207 		__unlock_for_delalloc(inode, locked_page,
1208 			      delalloc_start,
1209 			      ((u64)(start_index + pages_locked - 1)) <<
1210 			      PAGE_CACHE_SHIFT);
1211 	}
1212 	return ret;
1213 }
1214 
1215 /*
1216  * find a contiguous range of bytes in the file marked as delalloc, not
1217  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1218  *
1219  * 1 is returned if we find something, 0 if nothing was in the tree.
1220  */
1221 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1222 					     struct extent_io_tree *tree,
1223 					     struct page *locked_page,
1224 					     u64 *start, u64 *end,
1225 					     u64 max_bytes)
1226 {
1227 	u64 delalloc_start;
1228 	u64 delalloc_end;
1229 	u64 found;
1230 	int ret;
1231 	int loops = 0;
1232 
1233 again:
1234 	/* step one, find a bunch of delalloc bytes starting at start */
1235 	delalloc_start = *start;
1236 	delalloc_end = 0;
1237 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1238 				    max_bytes);
1239 	if (!found || delalloc_end <= *start) {
1240 		*start = delalloc_start;
1241 		*end = delalloc_end;
1242 		return found;
1243 	}
1244 
1245 	/*
1246 	 * start comes from the offset of locked_page.  We have to lock
1247 	 * pages in order, so we can't process delalloc bytes before
1248 	 * locked_page
1249 	 */
1250 	if (delalloc_start < *start)
1251 		delalloc_start = *start;
1252 
1253 	/*
1254 	 * make sure to limit the number of pages we try to lock down
1255 	 * if we're looping.
1256 	 */
1257 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1258 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1259 
1260 	/* step two, lock all the pages after the page that has start */
1261 	ret = lock_delalloc_pages(inode, locked_page,
1262 				  delalloc_start, delalloc_end);
1263 	if (ret == -EAGAIN) {
1264 		/* some of the pages are gone, let's avoid looping by
1265 		 * shortening the size of the delalloc range we're searching
1266 		 */
1267 		if (!loops) {
1268 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1269 			max_bytes = PAGE_CACHE_SIZE - offset;
1270 			loops = 1;
1271 			goto again;
1272 		} else {
1273 			found = 0;
1274 			goto out_failed;
1275 		}
1276 	}
1277 	BUG_ON(ret);
1278 
1279 	/* step three, lock the state bits for the whole range */
1280 	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1281 
1282 	/* then test to make sure it is all still delalloc */
1283 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1284 			     EXTENT_DELALLOC, 1);
1285 	if (!ret) {
1286 		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1287 		__unlock_for_delalloc(inode, locked_page,
1288 			      delalloc_start, delalloc_end);
1289 		cond_resched();
1290 		goto again;
1291 	}
1292 	*start = delalloc_start;
1293 	*end = delalloc_end;
1294 out_failed:
1295 	return found;
1296 }
1297 
1298 int extent_clear_unlock_delalloc(struct inode *inode,
1299 				struct extent_io_tree *tree,
1300 				u64 start, u64 end, struct page *locked_page,
1301 				int unlock_pages,
1302 				int clear_unlock,
1303 				int clear_delalloc, int clear_dirty,
1304 				int set_writeback,
1305 				int end_writeback)
1306 {
1307 	int ret;
1308 	struct page *pages[16];
1309 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1310 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1311 	unsigned long nr_pages = end_index - index + 1;
1312 	int i;
1313 	int clear_bits = 0;
1314 
1315 	if (clear_unlock)
1316 		clear_bits |= EXTENT_LOCKED;
1317 	if (clear_dirty)
1318 		clear_bits |= EXTENT_DIRTY;
1319 
1320 	if (clear_delalloc)
1321 		clear_bits |= EXTENT_DELALLOC;
1322 
1323 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1324 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1325 		return 0;
1326 
1327 	while (nr_pages > 0) {
1328 		ret = find_get_pages_contig(inode->i_mapping, index,
1329 				     min_t(unsigned long,
1330 				     nr_pages, ARRAY_SIZE(pages)), pages);
1331 		for (i = 0; i < ret; i++) {
1332 			if (pages[i] == locked_page) {
1333 				page_cache_release(pages[i]);
1334 				continue;
1335 			}
1336 			if (clear_dirty)
1337 				clear_page_dirty_for_io(pages[i]);
1338 			if (set_writeback)
1339 				set_page_writeback(pages[i]);
1340 			if (end_writeback)
1341 				end_page_writeback(pages[i]);
1342 			if (unlock_pages)
1343 				unlock_page(pages[i]);
1344 			page_cache_release(pages[i]);
1345 		}
1346 		nr_pages -= ret;
1347 		index += ret;
1348 		cond_resched();
1349 	}
1350 	return 0;
1351 }
1352 
1353 /*
1354  * count the number of bytes in the tree that have a given bit(s)
1355  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1356  * cached.  The total number found is returned.
1357  */
1358 u64 count_range_bits(struct extent_io_tree *tree,
1359 		     u64 *start, u64 search_end, u64 max_bytes,
1360 		     unsigned long bits)
1361 {
1362 	struct rb_node *node;
1363 	struct extent_state *state;
1364 	u64 cur_start = *start;
1365 	u64 total_bytes = 0;
1366 	int found = 0;
1367 
1368 	if (search_end <= cur_start) {
1369 		WARN_ON(1);
1370 		return 0;
1371 	}
1372 
1373 	spin_lock(&tree->lock);
1374 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1375 		total_bytes = tree->dirty_bytes;
1376 		goto out;
1377 	}
1378 	/*
1379 	 * this search will find all the extents that end after
1380 	 * our range starts.
1381 	 */
1382 	node = tree_search(tree, cur_start);
1383 	if (!node)
1384 		goto out;
1385 
1386 	while (1) {
1387 		state = rb_entry(node, struct extent_state, rb_node);
1388 		if (state->start > search_end)
1389 			break;
1390 		if (state->end >= cur_start && (state->state & bits)) {
1391 			total_bytes += min(search_end, state->end) + 1 -
1392 				       max(cur_start, state->start);
1393 			if (total_bytes >= max_bytes)
1394 				break;
1395 			if (!found) {
1396 				*start = state->start;
1397 				found = 1;
1398 			}
1399 		}
1400 		node = rb_next(node);
1401 		if (!node)
1402 			break;
1403 	}
1404 out:
1405 	spin_unlock(&tree->lock);
1406 	return total_bytes;
1407 }
1408 
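/*
 * Usage sketch (hypothetical helper): counting how many delalloc bytes
 * live in the first 1MB of a file's io tree.  On success '*start' is
 * moved to the start of the first matching extent that was found.
 */
#if 0
static u64 example_count_delalloc(struct extent_io_tree *tree)
{
	u64 start = 0;

	/* search [0, 1MB - 1] with no cap on the number of bytes counted */
	return count_range_bits(tree, &start, 1024 * 1024 - 1, (u64)-1,
				EXTENT_DELALLOC);
}
#endif
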
1409 #if 0
1410 /*
1411  * helper function to lock both pages and extents in the tree.
1412  * pages must be locked first.
1413  */
1414 static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1415 {
1416 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1417 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1418 	struct page *page;
1419 	int err;
1420 
1421 	while (index <= end_index) {
1422 		page = grab_cache_page(tree->mapping, index);
1423 		if (!page) {
1424 			err = -ENOMEM;
1425 			goto failed;
1426 		}
1427 		if (IS_ERR(page)) {
1428 			err = PTR_ERR(page);
1429 			goto failed;
1430 		}
1431 		index++;
1432 	}
1433 	lock_extent(tree, start, end, GFP_NOFS);
1434 	return 0;
1435 
1436 failed:
1437 	/*
1438 	 * we failed above in getting the page at 'index', so we undo here
1439 	 * up to but not including the page at 'index'
1440 	 */
1441 	end_index = index;
1442 	index = start >> PAGE_CACHE_SHIFT;
1443 	while (index < end_index) {
1444 		page = find_get_page(tree->mapping, index);
1445 		unlock_page(page);
1446 		page_cache_release(page);
1447 		index++;
1448 	}
1449 	return err;
1450 }
1451 
1452 /*
1453  * helper function to unlock both pages and extents in the tree.
1454  */
1455 static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1456 {
1457 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1458 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1459 	struct page *page;
1460 
1461 	while (index <= end_index) {
1462 		page = find_get_page(tree->mapping, index);
1463 		unlock_page(page);
1464 		page_cache_release(page);
1465 		index++;
1466 	}
1467 	unlock_extent(tree, start, end, GFP_NOFS);
1468 	return 0;
1469 }
1470 #endif
1471 
1472 /*
1473  * set the private field for a given byte offset in the tree.  If there isn't
1474  * an extent_state there already, this does nothing.
1475  */
1476 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1477 {
1478 	struct rb_node *node;
1479 	struct extent_state *state;
1480 	int ret = 0;
1481 
1482 	spin_lock(&tree->lock);
1483 	/*
1484 	 * this search will find all the extents that end after
1485 	 * our range starts.
1486 	 */
1487 	node = tree_search(tree, start);
1488 	if (!node) {
1489 		ret = -ENOENT;
1490 		goto out;
1491 	}
1492 	state = rb_entry(node, struct extent_state, rb_node);
1493 	if (state->start != start) {
1494 		ret = -ENOENT;
1495 		goto out;
1496 	}
1497 	state->private = private;
1498 out:
1499 	spin_unlock(&tree->lock);
1500 	return ret;
1501 }
1502 
1503 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1504 {
1505 	struct rb_node *node;
1506 	struct extent_state *state;
1507 	int ret = 0;
1508 
1509 	spin_lock(&tree->lock);
1510 	/*
1511 	 * this search will find all the extents that end after
1512 	 * our range starts.
1513 	 */
1514 	node = tree_search(tree, start);
1515 	if (!node) {
1516 		ret = -ENOENT;
1517 		goto out;
1518 	}
1519 	state = rb_entry(node, struct extent_state, rb_node);
1520 	if (state->start != start) {
1521 		ret = -ENOENT;
1522 		goto out;
1523 	}
1524 	*private = state->private;
1525 out:
1526 	spin_unlock(&tree->lock);
1527 	return ret;
1528 }
1529 
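/*
 * Usage sketch (hypothetical helper): stashing a per-block value, such as
 * an expected checksum, keyed by the start offset of an extent_state, and
 * reading it back later.  Both calls fail with -ENOENT unless a state
 * struct starts exactly at 'start'.
 */
#if 0
static int example_stash_csum(struct extent_io_tree *tree, u64 start, u32 csum)
{
	u64 val;
	int ret;

	ret = set_state_private(tree, start, (u64)csum);
	if (ret)
		return ret;

	ret = get_state_private(tree, start, &val);
	if (ret)
		return ret;

	return val == csum ? 0 : -EIO;
}
#endif
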
1530 /*
1531  * searches a range in the state tree for a given mask.
1532  * If 'filled' == 1, this returns 1 only if the whole range is covered by
1533  * extents that have the bits set.  Otherwise, 1 is returned if any bit in
1534  * the range is found set.
1535  */
1536 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1537 		   int bits, int filled)
1538 {
1539 	struct extent_state *state = NULL;
1540 	struct rb_node *node;
1541 	int bitset = 0;
1542 
1543 	spin_lock(&tree->lock);
1544 	node = tree_search(tree, start);
1545 	while (node && start <= end) {
1546 		state = rb_entry(node, struct extent_state, rb_node);
1547 
1548 		if (filled && state->start > start) {
1549 			bitset = 0;
1550 			break;
1551 		}
1552 
1553 		if (state->start > end)
1554 			break;
1555 
1556 		if (state->state & bits) {
1557 			bitset = 1;
1558 			if (!filled)
1559 				break;
1560 		} else if (filled) {
1561 			bitset = 0;
1562 			break;
1563 		}
1564 		start = state->end + 1;
1565 		if (start > end)
1566 			break;
1567 		node = rb_next(node);
1568 		if (!node) {
1569 			if (filled)
1570 				bitset = 0;
1571 			break;
1572 		}
1573 	}
1574 	spin_unlock(&tree->lock);
1575 	return bitset;
1576 }
1577 
1578 /*
1579  * helper function to set a given page up to date if all the
1580  * extents in the tree for that page are up to date
1581  */
1582 static int check_page_uptodate(struct extent_io_tree *tree,
1583 			       struct page *page)
1584 {
1585 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1586 	u64 end = start + PAGE_CACHE_SIZE - 1;
1587 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1588 		SetPageUptodate(page);
1589 	return 0;
1590 }
1591 
1592 /*
1593  * helper function to unlock a page if all the extents in the tree
1594  * for that page are unlocked
1595  */
1596 static int check_page_locked(struct extent_io_tree *tree,
1597 			     struct page *page)
1598 {
1599 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1600 	u64 end = start + PAGE_CACHE_SIZE - 1;
1601 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1602 		unlock_page(page);
1603 	return 0;
1604 }
1605 
1606 /*
1607  * helper function to end page writeback if all the extents
1608  * in the tree for that page are done with writeback
1609  */
1610 static int check_page_writeback(struct extent_io_tree *tree,
1611 			     struct page *page)
1612 {
1613 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1614 	u64 end = start + PAGE_CACHE_SIZE - 1;
1615 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1616 		end_page_writeback(page);
1617 	return 0;
1618 }
1619 
1620 /* lots and lots of room for performance fixes in the end_bio funcs */
1621 
1622 /*
1623  * after a writepage IO is done, we need to:
1624  * clear the uptodate bits on error
1625  * clear the writeback bits in the extent tree for this IO
1626  * end_page_writeback if the page has no more pending IO
1627  *
1628  * Scheduling is not allowed, so the extent state tree is expected
1629  * to have one and only one object corresponding to this IO.
1630  */
1631 static void end_bio_extent_writepage(struct bio *bio, int err)
1632 {
1633 	int uptodate = err == 0;
1634 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1635 	struct extent_io_tree *tree;
1636 	u64 start;
1637 	u64 end;
1638 	int whole_page;
1639 	int ret;
1640 
1641 	do {
1642 		struct page *page = bvec->bv_page;
1643 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1644 
1645 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1646 			 bvec->bv_offset;
1647 		end = start + bvec->bv_len - 1;
1648 
1649 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1650 			whole_page = 1;
1651 		else
1652 			whole_page = 0;
1653 
1654 		if (--bvec >= bio->bi_io_vec)
1655 			prefetchw(&bvec->bv_page->flags);
1656 		if (tree->ops && tree->ops->writepage_end_io_hook) {
1657 			ret = tree->ops->writepage_end_io_hook(page, start,
1658 						       end, NULL, uptodate);
1659 			if (ret)
1660 				uptodate = 0;
1661 		}
1662 
1663 		if (!uptodate && tree->ops &&
1664 		    tree->ops->writepage_io_failed_hook) {
1665 			ret = tree->ops->writepage_io_failed_hook(bio, page,
1666 							 start, end, NULL);
1667 			if (ret == 0) {
1668 				uptodate = (err == 0);
1669 				continue;
1670 			}
1671 		}
1672 
1673 		if (!uptodate) {
1674 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1675 			ClearPageUptodate(page);
1676 			SetPageError(page);
1677 		}
1678 
1679 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1680 
1681 		if (whole_page)
1682 			end_page_writeback(page);
1683 		else
1684 			check_page_writeback(tree, page);
1685 	} while (bvec >= bio->bi_io_vec);
1686 
1687 	bio_put(bio);
1688 }
1689 
1690 /*
1691  * after a readpage IO is done, we need to:
1692  * clear the uptodate bits on error
1693  * set the uptodate bits if things worked
1694  * set the page up to date if all extents in the tree are uptodate
1695  * clear the lock bit in the extent tree
1696  * unlock the page if there are no other extents locked for it
1697  *
1698  * Scheduling is not allowed, so the extent state tree is expected
1699  * to have one and only one object corresponding to this IO.
1700  */
1701 static void end_bio_extent_readpage(struct bio *bio, int err)
1702 {
1703 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1704 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1705 	struct extent_io_tree *tree;
1706 	u64 start;
1707 	u64 end;
1708 	int whole_page;
1709 	int ret;
1710 
1711 	if (err)
1712 		uptodate = 0;
1713 
1714 	do {
1715 		struct page *page = bvec->bv_page;
1716 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1717 
1718 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1719 			bvec->bv_offset;
1720 		end = start + bvec->bv_len - 1;
1721 
1722 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1723 			whole_page = 1;
1724 		else
1725 			whole_page = 0;
1726 
1727 		if (--bvec >= bio->bi_io_vec)
1728 			prefetchw(&bvec->bv_page->flags);
1729 
1730 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1731 			ret = tree->ops->readpage_end_io_hook(page, start, end,
1732 							      NULL);
1733 			if (ret)
1734 				uptodate = 0;
1735 		}
1736 		if (!uptodate && tree->ops &&
1737 		    tree->ops->readpage_io_failed_hook) {
1738 			ret = tree->ops->readpage_io_failed_hook(bio, page,
1739 							 start, end, NULL);
1740 			if (ret == 0) {
1741 				uptodate =
1742 					test_bit(BIO_UPTODATE, &bio->bi_flags);
1743 				if (err)
1744 					uptodate = 0;
1745 				continue;
1746 			}
1747 		}
1748 
1749 		if (uptodate) {
1750 			set_extent_uptodate(tree, start, end,
1751 					    GFP_ATOMIC);
1752 		}
1753 		unlock_extent(tree, start, end, GFP_ATOMIC);
1754 
1755 		if (whole_page) {
1756 			if (uptodate) {
1757 				SetPageUptodate(page);
1758 			} else {
1759 				ClearPageUptodate(page);
1760 				SetPageError(page);
1761 			}
1762 			unlock_page(page);
1763 		} else {
1764 			if (uptodate) {
1765 				check_page_uptodate(tree, page);
1766 			} else {
1767 				ClearPageUptodate(page);
1768 				SetPageError(page);
1769 			}
1770 			check_page_locked(tree, page);
1771 		}
1772 	} while (bvec >= bio->bi_io_vec);
1773 
1774 	bio_put(bio);
1775 }
1776 
1777 /*
1778  * IO done from prepare_write is pretty simple, we just unlock
1779  * the structs in the extent tree when done, and set the uptodate bits
1780  * as appropriate.
1781  */
1782 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1783 {
1784 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1785 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1786 	struct extent_io_tree *tree;
1787 	u64 start;
1788 	u64 end;
1789 
1790 	do {
1791 		struct page *page = bvec->bv_page;
1792 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1793 
1794 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1795 			bvec->bv_offset;
1796 		end = start + bvec->bv_len - 1;
1797 
1798 		if (--bvec >= bio->bi_io_vec)
1799 			prefetchw(&bvec->bv_page->flags);
1800 
1801 		if (uptodate) {
1802 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1803 		} else {
1804 			ClearPageUptodate(page);
1805 			SetPageError(page);
1806 		}
1807 
1808 		unlock_extent(tree, start, end, GFP_ATOMIC);
1809 
1810 	} while (bvec >= bio->bi_io_vec);
1811 
1812 	bio_put(bio);
1813 }
1814 
1815 static struct bio *
1816 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1817 		 gfp_t gfp_flags)
1818 {
1819 	struct bio *bio;
1820 
1821 	bio = bio_alloc(gfp_flags, nr_vecs);
1822 
1823 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1824 		while (!bio && (nr_vecs /= 2))
1825 			bio = bio_alloc(gfp_flags, nr_vecs);
1826 	}
1827 
1828 	if (bio) {
1829 		bio->bi_size = 0;
1830 		bio->bi_bdev = bdev;
1831 		bio->bi_sector = first_sector;
1832 	}
1833 	return bio;
1834 }
1835 
1836 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1837 			  unsigned long bio_flags)
1838 {
1839 	int ret = 0;
1840 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1841 	struct page *page = bvec->bv_page;
1842 	struct extent_io_tree *tree = bio->bi_private;
1843 	u64 start;
1844 	u64 end;
1845 
1846 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1847 	end = start + bvec->bv_len - 1;
1848 
1849 	bio->bi_private = NULL;
1850 
1851 	bio_get(bio);
1852 
1853 	if (tree->ops && tree->ops->submit_bio_hook)
1854 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1855 					   mirror_num, bio_flags);
1856 	else
1857 		submit_bio(rw, bio);
1858 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1859 		ret = -EOPNOTSUPP;
1860 	bio_put(bio);
1861 	return ret;
1862 }
1863 
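/*
 * try to add 'page' to the bio the caller is building up in *bio_ret.  If
 * the page isn't contiguous with that bio, the bio flags differ, the
 * merge_bio_hook rejects it, or bio_add_page() can't take the full page,
 * the old bio is submitted and a fresh one is started for this page.  The
 * in-progress bio is handed back through *bio_ret when one is supplied;
 * otherwise the new bio is submitted immediately.
 */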
1864 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1865 			      struct page *page, sector_t sector,
1866 			      size_t size, unsigned long offset,
1867 			      struct block_device *bdev,
1868 			      struct bio **bio_ret,
1869 			      unsigned long max_pages,
1870 			      bio_end_io_t end_io_func,
1871 			      int mirror_num,
1872 			      unsigned long prev_bio_flags,
1873 			      unsigned long bio_flags)
1874 {
1875 	int ret = 0;
1876 	struct bio *bio;
1877 	int nr;
1878 	int contig = 0;
1879 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1880 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1881 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1882 
1883 	if (bio_ret && *bio_ret) {
1884 		bio = *bio_ret;
1885 		if (old_compressed)
1886 			contig = bio->bi_sector == sector;
1887 		else
1888 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1889 				sector;
1890 
1891 		if (prev_bio_flags != bio_flags || !contig ||
1892 		    (tree->ops && tree->ops->merge_bio_hook &&
1893 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1894 					       bio_flags)) ||
1895 		    bio_add_page(bio, page, page_size, offset) < page_size) {
1896 			ret = submit_one_bio(rw, bio, mirror_num,
1897 					     prev_bio_flags);
1898 			bio = NULL;
1899 		} else {
1900 			return 0;
1901 		}
1902 	}
1903 	if (this_compressed)
1904 		nr = BIO_MAX_PAGES;
1905 	else
1906 		nr = bio_get_nr_vecs(bdev);
1907 
1908 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1909 
1910 	bio_add_page(bio, page, page_size, offset);
1911 	bio->bi_end_io = end_io_func;
1912 	bio->bi_private = tree;
1913 
1914 	if (bio_ret)
1915 		*bio_ret = bio;
1916 	else
1917 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1918 
1919 	return ret;
1920 }
1921 
1922 void set_page_extent_mapped(struct page *page)
1923 {
1924 	if (!PagePrivate(page)) {
1925 		SetPagePrivate(page);
1926 		page_cache_get(page);
1927 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1928 	}
1929 }
1930 
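/*
 * mark 'page' as the first page of an extent buffer by stuffing the
 * EXTENT_PAGE_PRIVATE_FIRST_PAGE flag and the buffer length (shifted left
 * by two so the low flag bits stay clear) into page->private.
 */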
1931 static void set_page_extent_head(struct page *page, unsigned long len)
1932 {
1933 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1934 }
1935 
1936 /*
1937  * basic readpage implementation.  Locked extent state structs are inserted
1938  * into the tree and are removed when the IO is done (by the end_io
1939  * handlers).
1940  */
1941 static int __extent_read_full_page(struct extent_io_tree *tree,
1942 				   struct page *page,
1943 				   get_extent_t *get_extent,
1944 				   struct bio **bio, int mirror_num,
1945 				   unsigned long *bio_flags)
1946 {
1947 	struct inode *inode = page->mapping->host;
1948 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1949 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1950 	u64 end;
1951 	u64 cur = start;
1952 	u64 extent_offset;
1953 	u64 last_byte = i_size_read(inode);
1954 	u64 block_start;
1955 	u64 cur_end;
1956 	sector_t sector;
1957 	struct extent_map *em;
1958 	struct block_device *bdev;
1959 	int ret;
1960 	int nr = 0;
1961 	size_t page_offset = 0;
1962 	size_t iosize;
1963 	size_t disk_io_size;
1964 	size_t blocksize = inode->i_sb->s_blocksize;
1965 	unsigned long this_bio_flag = 0;
1966 
1967 	set_page_extent_mapped(page);
1968 
1969 	end = page_end;
1970 	lock_extent(tree, start, end, GFP_NOFS);
1971 
1972 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1973 		char *userpage;
1974 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1975 
1976 		if (zero_offset) {
1977 			iosize = PAGE_CACHE_SIZE - zero_offset;
1978 			userpage = kmap_atomic(page, KM_USER0);
1979 			memset(userpage + zero_offset, 0, iosize);
1980 			flush_dcache_page(page);
1981 			kunmap_atomic(userpage, KM_USER0);
1982 		}
1983 	}
1984 	while (cur <= end) {
1985 		if (cur >= last_byte) {
1986 			char *userpage;
1987 			iosize = PAGE_CACHE_SIZE - page_offset;
1988 			userpage = kmap_atomic(page, KM_USER0);
1989 			memset(userpage + page_offset, 0, iosize);
1990 			flush_dcache_page(page);
1991 			kunmap_atomic(userpage, KM_USER0);
1992 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1993 					    GFP_NOFS);
1994 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1995 			break;
1996 		}
1997 		em = get_extent(inode, page, page_offset, cur,
1998 				end - cur + 1, 0);
1999 		if (IS_ERR(em) || !em) {
2000 			SetPageError(page);
2001 			unlock_extent(tree, cur, end, GFP_NOFS);
2002 			break;
2003 		}
2004 		extent_offset = cur - em->start;
2005 		BUG_ON(extent_map_end(em) <= cur);
2006 		BUG_ON(end < cur);
2007 
2008 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2009 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2010 
2011 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2012 		cur_end = min(extent_map_end(em) - 1, end);
2013 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2014 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2015 			disk_io_size = em->block_len;
2016 			sector = em->block_start >> 9;
2017 		} else {
2018 			sector = (em->block_start + extent_offset) >> 9;
2019 			disk_io_size = iosize;
2020 		}
2021 		bdev = em->bdev;
2022 		block_start = em->block_start;
2023 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2024 			block_start = EXTENT_MAP_HOLE;
2025 		free_extent_map(em);
2026 		em = NULL;
2027 
2028 		/* we've found a hole, just zero and go on */
2029 		if (block_start == EXTENT_MAP_HOLE) {
2030 			char *userpage;
2031 			userpage = kmap_atomic(page, KM_USER0);
2032 			memset(userpage + page_offset, 0, iosize);
2033 			flush_dcache_page(page);
2034 			kunmap_atomic(userpage, KM_USER0);
2035 
2036 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2037 					    GFP_NOFS);
2038 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2039 			cur = cur + iosize;
2040 			page_offset += iosize;
2041 			continue;
2042 		}
2043 		/* the get_extent function already copied into the page */
2044 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2045 			check_page_uptodate(tree, page);
2046 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2047 			cur = cur + iosize;
2048 			page_offset += iosize;
2049 			continue;
2050 		}
2051 		/* we have an inline extent but it didn't get marked
2052 		 * uptodate.  Error out
2053 		 */
2054 		if (block_start == EXTENT_MAP_INLINE) {
2055 			SetPageError(page);
2056 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2057 			cur = cur + iosize;
2058 			page_offset += iosize;
2059 			continue;
2060 		}
2061 
2062 		ret = 0;
2063 		if (tree->ops && tree->ops->readpage_io_hook) {
2064 			ret = tree->ops->readpage_io_hook(page, cur,
2065 							  cur + iosize - 1);
2066 		}
2067 		if (!ret) {
2068 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2069 			pnr -= page->index;
2070 			ret = submit_extent_page(READ, tree, page,
2071 					 sector, disk_io_size, page_offset,
2072 					 bdev, bio, pnr,
2073 					 end_bio_extent_readpage, mirror_num,
2074 					 *bio_flags,
2075 					 this_bio_flag);
2076 			nr++;
2077 			*bio_flags = this_bio_flag;
2078 		}
2079 		if (ret)
2080 			SetPageError(page);
2081 		cur = cur + iosize;
2082 		page_offset += iosize;
2083 	}
2084 	if (!nr) {
2085 		if (!PageError(page))
2086 			SetPageUptodate(page);
2087 		unlock_page(page);
2088 	}
2089 	return 0;
2090 }
2091 
2092 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2093 			    get_extent_t *get_extent)
2094 {
2095 	struct bio *bio = NULL;
2096 	unsigned long bio_flags = 0;
2097 	int ret;
2098 
2099 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2100 				      &bio_flags);
2101 	if (bio)
2102 		submit_one_bio(READ, bio, 0, bio_flags);
2103 	return ret;
2104 }
2105 
2106 /*
2107  * the writepage semantics are similar to regular writepage.  extent
2108  * records are inserted to lock ranges in the tree, and as dirty areas
2109  * are found, they are marked writeback.  Then the lock bits are removed
2110  * and the end_io handler clears the writeback ranges
2111  */
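/*
 * __extent_writepage works in three phases: the tail of the page beyond
 * i_size is zeroed, delalloc ranges on the page are handed to
 * fill_delalloc() (unless the caller already locked the range via
 * epd->extent_locked), and the page is then walked a block at a time,
 * clearing the dirty bits, marking each range writeback and submitting it
 * through submit_extent_page().  Compressed and inline extents are
 * skipped here because they are written through other paths in the FS.
 */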
2112 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2113 			      void *data)
2114 {
2115 	struct inode *inode = page->mapping->host;
2116 	struct extent_page_data *epd = data;
2117 	struct extent_io_tree *tree = epd->tree;
2118 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2119 	u64 delalloc_start;
2120 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2121 	u64 end;
2122 	u64 cur = start;
2123 	u64 extent_offset;
2124 	u64 last_byte = i_size_read(inode);
2125 	u64 block_start;
2126 	u64 iosize;
2127 	u64 unlock_start;
2128 	sector_t sector;
2129 	struct extent_map *em;
2130 	struct block_device *bdev;
2131 	int ret;
2132 	int nr = 0;
2133 	size_t pg_offset = 0;
2134 	size_t blocksize;
2135 	loff_t i_size = i_size_read(inode);
2136 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2137 	u64 nr_delalloc;
2138 	u64 delalloc_end;
2139 	int page_started;
2140 	int compressed;
2141 	unsigned long nr_written = 0;
2142 
2143 	WARN_ON(!PageLocked(page));
2144 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2145 	if (page->index > end_index ||
2146 	   (page->index == end_index && !pg_offset)) {
2147 		page->mapping->a_ops->invalidatepage(page, 0);
2148 		unlock_page(page);
2149 		return 0;
2150 	}
2151 
2152 	if (page->index == end_index) {
2153 		char *userpage;
2154 
2155 		userpage = kmap_atomic(page, KM_USER0);
2156 		memset(userpage + pg_offset, 0,
2157 		       PAGE_CACHE_SIZE - pg_offset);
2158 		kunmap_atomic(userpage, KM_USER0);
2159 		flush_dcache_page(page);
2160 	}
2161 	pg_offset = 0;
2162 
2163 	set_page_extent_mapped(page);
2164 
2165 	delalloc_start = start;
2166 	delalloc_end = 0;
2167 	page_started = 0;
2168 	if (!epd->extent_locked) {
2169 		while (delalloc_end < page_end) {
2170 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2171 						       page,
2172 						       &delalloc_start,
2173 						       &delalloc_end,
2174 						       128 * 1024 * 1024);
2175 			if (nr_delalloc == 0) {
2176 				delalloc_start = delalloc_end + 1;
2177 				continue;
2178 			}
2179 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2180 						 delalloc_end, &page_started,
2181 						 &nr_written);
2182 			delalloc_start = delalloc_end + 1;
2183 		}
2184 
2185 		/* did the fill delalloc function already unlock and start
2186 		 * the IO?
2187 		 */
2188 		if (page_started) {
2189 			ret = 0;
2190 			goto update_nr_written;
2191 		}
2192 	}
2193 	lock_extent(tree, start, page_end, GFP_NOFS);
2194 
2195 	unlock_start = start;
2196 
2197 	if (tree->ops && tree->ops->writepage_start_hook) {
2198 		ret = tree->ops->writepage_start_hook(page, start,
2199 						      page_end);
2200 		if (ret == -EAGAIN) {
2201 			unlock_extent(tree, start, page_end, GFP_NOFS);
2202 			redirty_page_for_writepage(wbc, page);
2203 			unlock_page(page);
2204 			ret = 0;
2205 			goto update_nr_written;
2206 		}
2207 	}
2208 
2209 	nr_written++;
2210 
2211 	end = page_end;
2212 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2213 		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2214 
2215 	if (last_byte <= start) {
2216 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2217 		unlock_extent(tree, start, page_end, GFP_NOFS);
2218 		if (tree->ops && tree->ops->writepage_end_io_hook)
2219 			tree->ops->writepage_end_io_hook(page, start,
2220 							 page_end, NULL, 1);
2221 		unlock_start = page_end + 1;
2222 		goto done;
2223 	}
2224 
2225 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2226 	blocksize = inode->i_sb->s_blocksize;
2227 
2228 	while (cur <= end) {
2229 		if (cur >= last_byte) {
2230 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2231 			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2232 			if (tree->ops && tree->ops->writepage_end_io_hook)
2233 				tree->ops->writepage_end_io_hook(page, cur,
2234 							 page_end, NULL, 1);
2235 			unlock_start = page_end + 1;
2236 			break;
2237 		}
2238 		em = epd->get_extent(inode, page, pg_offset, cur,
2239 				     end - cur + 1, 1);
2240 		if (IS_ERR(em) || !em) {
2241 			SetPageError(page);
2242 			break;
2243 		}
2244 
2245 		extent_offset = cur - em->start;
2246 		BUG_ON(extent_map_end(em) <= cur);
2247 		BUG_ON(end < cur);
2248 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2249 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2250 		sector = (em->block_start + extent_offset) >> 9;
2251 		bdev = em->bdev;
2252 		block_start = em->block_start;
2253 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2254 		free_extent_map(em);
2255 		em = NULL;
2256 
2257 		/*
2258 		 * compressed and inline extents are written through other
2259 		 * paths in the FS
2260 		 */
2261 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2262 		    block_start == EXTENT_MAP_INLINE) {
2263 			clear_extent_dirty(tree, cur,
2264 					   cur + iosize - 1, GFP_NOFS);
2265 
2266 			unlock_extent(tree, unlock_start, cur + iosize - 1,
2267 				      GFP_NOFS);
2268 
2269 			/*
2270 			 * end_io notification does not happen here for
2271 			 * compressed extents
2272 			 */
2273 			if (!compressed && tree->ops &&
2274 			    tree->ops->writepage_end_io_hook)
2275 				tree->ops->writepage_end_io_hook(page, cur,
2276 							 cur + iosize - 1,
2277 							 NULL, 1);
2278 			else if (compressed) {
2279 				/* we don't want to end_page_writeback on
2280 				 * a compressed extent.  this happens
2281 				 * elsewhere
2282 				 */
2283 				nr++;
2284 			}
2285 
2286 			cur += iosize;
2287 			pg_offset += iosize;
2288 			unlock_start = cur;
2289 			continue;
2290 		}
2291 		/* leave this out until we have a page_mkwrite call */
2292 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2293 				   EXTENT_DIRTY, 0)) {
2294 			cur = cur + iosize;
2295 			pg_offset += iosize;
2296 			continue;
2297 		}
2298 
2299 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2300 		if (tree->ops && tree->ops->writepage_io_hook) {
2301 			ret = tree->ops->writepage_io_hook(page, cur,
2302 						cur + iosize - 1);
2303 		} else {
2304 			ret = 0;
2305 		}
2306 		if (ret) {
2307 			SetPageError(page);
2308 		} else {
2309 			unsigned long max_nr = end_index + 1;
2310 
2311 			set_range_writeback(tree, cur, cur + iosize - 1);
2312 			if (!PageWriteback(page)) {
2313 				printk(KERN_ERR "btrfs warning page %lu not "
2314 				       "writeback, cur %llu end %llu\n",
2315 				       page->index, (unsigned long long)cur,
2316 				       (unsigned long long)end);
2317 			}
2318 
2319 			ret = submit_extent_page(WRITE, tree, page, sector,
2320 						 iosize, pg_offset, bdev,
2321 						 &epd->bio, max_nr,
2322 						 end_bio_extent_writepage,
2323 						 0, 0, 0);
2324 			if (ret)
2325 				SetPageError(page);
2326 		}
2327 		cur = cur + iosize;
2328 		pg_offset += iosize;
2329 		nr++;
2330 	}
2331 done:
2332 	if (nr == 0) {
2333 		/* make sure the mapping tag for page dirty gets cleared */
2334 		set_page_writeback(page);
2335 		end_page_writeback(page);
2336 	}
2337 	if (unlock_start <= page_end)
2338 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2339 	unlock_page(page);
2340 
2341 update_nr_written:
2342 	wbc->nr_to_write -= nr_written;
2343 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2344 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2345 		page->mapping->writeback_index = page->index + nr_written;
2346 	return 0;
2347 }
2348 
2349 /**
2350  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2351  * @mapping: address space structure to write
2352  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2353  * @writepage: function called for each page
2354  * @data: data passed to writepage function
2355  *
2356  * If a page is already under I/O, write_cache_pages() skips it, even
2357  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2358  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2359  * and msync() need to guarantee that all the data which was dirty at the time
2360  * the call was made get new I/O started against them.  If wbc->sync_mode is
2361  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2362  * existing IO to complete.
2363  */
2364 static int extent_write_cache_pages(struct extent_io_tree *tree,
2365 			     struct address_space *mapping,
2366 			     struct writeback_control *wbc,
2367 			     writepage_t writepage, void *data,
2368 			     void (*flush_fn)(void *))
2369 {
2370 	struct backing_dev_info *bdi = mapping->backing_dev_info;
2371 	int ret = 0;
2372 	int done = 0;
2373 	struct pagevec pvec;
2374 	int nr_pages;
2375 	pgoff_t index;
2376 	pgoff_t end;		/* Inclusive */
2377 	int scanned = 0;
2378 	int range_whole = 0;
2379 
2380 	pagevec_init(&pvec, 0);
2381 	if (wbc->range_cyclic) {
2382 		index = mapping->writeback_index; /* Start from prev offset */
2383 		end = -1;
2384 	} else {
2385 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2386 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2387 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2388 			range_whole = 1;
2389 		scanned = 1;
2390 	}
2391 retry:
2392 	while (!done && (index <= end) &&
2393 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2394 			      PAGECACHE_TAG_DIRTY, min(end - index,
2395 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2396 		unsigned i;
2397 
2398 		scanned = 1;
2399 		for (i = 0; i < nr_pages; i++) {
2400 			struct page *page = pvec.pages[i];
2401 
2402 			/*
2403 			 * At this point we hold neither mapping->tree_lock nor
2404 			 * lock on the page itself: the page may be truncated or
2405 			 * invalidated (changing page->mapping to NULL), or even
2406 			 * swizzled back from swapper_space to tmpfs file
2407 			 * mapping
2408 			 */
2409 			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2410 				tree->ops->write_cache_pages_lock_hook(page);
2411 			else
2412 				lock_page(page);
2413 
2414 			if (unlikely(page->mapping != mapping)) {
2415 				unlock_page(page);
2416 				continue;
2417 			}
2418 
2419 			if (!wbc->range_cyclic && page->index > end) {
2420 				done = 1;
2421 				unlock_page(page);
2422 				continue;
2423 			}
2424 
2425 			if (wbc->sync_mode != WB_SYNC_NONE) {
2426 				if (PageWriteback(page))
2427 					flush_fn(data);
2428 				wait_on_page_writeback(page);
2429 			}
2430 
2431 			if (PageWriteback(page) ||
2432 			    !clear_page_dirty_for_io(page)) {
2433 				unlock_page(page);
2434 				continue;
2435 			}
2436 
2437 			ret = (*writepage)(page, wbc, data);
2438 
2439 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2440 				unlock_page(page);
2441 				ret = 0;
2442 			}
2443 			if (ret || wbc->nr_to_write <= 0)
2444 				done = 1;
2445 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
2446 				wbc->encountered_congestion = 1;
2447 				done = 1;
2448 			}
2449 		}
2450 		pagevec_release(&pvec);
2451 		cond_resched();
2452 	}
2453 	if (!scanned && !done) {
2454 		/*
2455 		 * We hit the last page and there is more work to be done: wrap
2456 		 * back to the start of the file
2457 		 */
2458 		scanned = 1;
2459 		index = 0;
2460 		goto retry;
2461 	}
2462 	return ret;
2463 }
2464 
2465 static noinline void flush_write_bio(void *data)
2466 {
2467 	struct extent_page_data *epd = data;
2468 	if (epd->bio) {
2469 		submit_one_bio(WRITE, epd->bio, 0, 0);
2470 		epd->bio = NULL;
2471 	}
2472 }
2473 
2474 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2475 			  get_extent_t *get_extent,
2476 			  struct writeback_control *wbc)
2477 {
2478 	int ret;
2479 	struct address_space *mapping = page->mapping;
2480 	struct extent_page_data epd = {
2481 		.bio = NULL,
2482 		.tree = tree,
2483 		.get_extent = get_extent,
2484 		.extent_locked = 0,
2485 	};
2486 	struct writeback_control wbc_writepages = {
2487 		.bdi		= wbc->bdi,
2488 		.sync_mode	= WB_SYNC_NONE,
2489 		.older_than_this = NULL,
2490 		.nr_to_write	= 64,
2491 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2492 		.range_end	= (loff_t)-1,
2493 	};
2494 
2495 
2496 	ret = __extent_writepage(page, wbc, &epd);
2497 
2498 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2499 				 __extent_writepage, &epd, flush_write_bio);
2500 	if (epd.bio)
2501 		submit_one_bio(WRITE, epd.bio, 0, 0);
2502 	return ret;
2503 }
2504 
2505 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2506 			      u64 start, u64 end, get_extent_t *get_extent,
2507 			      int mode)
2508 {
2509 	int ret = 0;
2510 	struct address_space *mapping = inode->i_mapping;
2511 	struct page *page;
2512 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2513 		PAGE_CACHE_SHIFT;
2514 
2515 	struct extent_page_data epd = {
2516 		.bio = NULL,
2517 		.tree = tree,
2518 		.get_extent = get_extent,
2519 		.extent_locked = 1,
2520 	};
2521 	struct writeback_control wbc_writepages = {
2522 		.bdi		= inode->i_mapping->backing_dev_info,
2523 		.sync_mode	= mode,
2524 		.older_than_this = NULL,
2525 		.nr_to_write	= nr_pages * 2,
2526 		.range_start	= start,
2527 		.range_end	= end + 1,
2528 	};
2529 
2530 	while (start <= end) {
2531 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2532 		if (clear_page_dirty_for_io(page))
2533 			ret = __extent_writepage(page, &wbc_writepages, &epd);
2534 		else {
2535 			if (tree->ops && tree->ops->writepage_end_io_hook)
2536 				tree->ops->writepage_end_io_hook(page, start,
2537 						 start + PAGE_CACHE_SIZE - 1,
2538 						 NULL, 1);
2539 			unlock_page(page);
2540 		}
2541 		page_cache_release(page);
2542 		start += PAGE_CACHE_SIZE;
2543 	}
2544 
2545 	if (epd.bio)
2546 		submit_one_bio(WRITE, epd.bio, 0, 0);
2547 	return ret;
2548 }
2549 
2550 int extent_writepages(struct extent_io_tree *tree,
2551 		      struct address_space *mapping,
2552 		      get_extent_t *get_extent,
2553 		      struct writeback_control *wbc)
2554 {
2555 	int ret = 0;
2556 	struct extent_page_data epd = {
2557 		.bio = NULL,
2558 		.tree = tree,
2559 		.get_extent = get_extent,
2560 		.extent_locked = 0,
2561 	};
2562 
2563 	ret = extent_write_cache_pages(tree, mapping, wbc,
2564 				       __extent_writepage, &epd,
2565 				       flush_write_bio);
2566 	if (epd.bio)
2567 		submit_one_bio(WRITE, epd.bio, 0, 0);
2568 	return ret;
2569 }
2570 
2571 int extent_readpages(struct extent_io_tree *tree,
2572 		     struct address_space *mapping,
2573 		     struct list_head *pages, unsigned nr_pages,
2574 		     get_extent_t get_extent)
2575 {
2576 	struct bio *bio = NULL;
2577 	unsigned page_idx;
2578 	struct pagevec pvec;
2579 	unsigned long bio_flags = 0;
2580 
2581 	pagevec_init(&pvec, 0);
2582 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2583 		struct page *page = list_entry(pages->prev, struct page, lru);
2584 
2585 		prefetchw(&page->flags);
2586 		list_del(&page->lru);
2587 		/*
2588 		 * what we want to do here is call add_to_page_cache_lru,
2589 		 * but that isn't exported, so we reproduce it here
2590 		 */
2591 		if (!add_to_page_cache(page, mapping,
2592 					page->index, GFP_KERNEL)) {
2593 
2594 			/* open coding of lru_cache_add, also not exported */
2595 			page_cache_get(page);
2596 			if (!pagevec_add(&pvec, page))
2597 				__pagevec_lru_add_file(&pvec);
2598 			__extent_read_full_page(tree, page, get_extent,
2599 						&bio, 0, &bio_flags);
2600 		}
2601 		page_cache_release(page);
2602 	}
2603 	if (pagevec_count(&pvec))
2604 		__pagevec_lru_add_file(&pvec);
2605 	BUG_ON(!list_empty(pages));
2606 	if (bio)
2607 		submit_one_bio(READ, bio, 0, bio_flags);
2608 	return 0;
2609 }
2610 
2611 /*
2612  * basic invalidatepage code, this waits on any locked or writeback
2613  * ranges corresponding to the page, and then deletes any extent state
2614  * records from the tree
2615  */
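/*
 * Illustrative sketch (hypothetical, not code from this file): a
 * filesystem's address_space_operations ->invalidatepage callback would
 * typically just forward to this helper with the inode's io_tree:
 *
 *	static void example_invalidatepage(struct page *page,
 *					   unsigned long offset)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		extent_invalidatepage(tree, page, offset);
 *	}
 */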
2616 int extent_invalidatepage(struct extent_io_tree *tree,
2617 			  struct page *page, unsigned long offset)
2618 {
2619 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2620 	u64 end = start + PAGE_CACHE_SIZE - 1;
2621 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2622 
2623 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2624 	if (start > end)
2625 		return 0;
2626 
2627 	lock_extent(tree, start, end, GFP_NOFS);
2628 	wait_on_extent_writeback(tree, start, end);
2629 	clear_extent_bit(tree, start, end,
2630 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2631 			 1, 1, GFP_NOFS);
2632 	return 0;
2633 }
2634 
2635 /*
2636  * simple commit_write call, the page is marked dirty with
2637  * set_page_dirty() and i_size is updated if the write extended the file
2638  */
2639 int extent_commit_write(struct extent_io_tree *tree,
2640 			struct inode *inode, struct page *page,
2641 			unsigned from, unsigned to)
2642 {
2643 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2644 
2645 	set_page_extent_mapped(page);
2646 	set_page_dirty(page);
2647 
2648 	if (pos > inode->i_size) {
2649 		i_size_write(inode, pos);
2650 		mark_inode_dirty(inode);
2651 	}
2652 	return 0;
2653 }
2654 
2655 int extent_prepare_write(struct extent_io_tree *tree,
2656 			 struct inode *inode, struct page *page,
2657 			 unsigned from, unsigned to, get_extent_t *get_extent)
2658 {
2659 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2660 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2661 	u64 block_start;
2662 	u64 orig_block_start;
2663 	u64 block_end;
2664 	u64 cur_end;
2665 	struct extent_map *em;
2666 	unsigned blocksize = 1 << inode->i_blkbits;
2667 	size_t page_offset = 0;
2668 	size_t block_off_start;
2669 	size_t block_off_end;
2670 	int err = 0;
2671 	int iocount = 0;
2672 	int ret = 0;
2673 	int isnew;
2674 
2675 	set_page_extent_mapped(page);
2676 
2677 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2678 	block_end = (page_start + to - 1) | (blocksize - 1);
2679 	orig_block_start = block_start;
2680 
2681 	lock_extent(tree, page_start, page_end, GFP_NOFS);
2682 	while (block_start <= block_end) {
2683 		em = get_extent(inode, page, page_offset, block_start,
2684 				block_end - block_start + 1, 1);
2685 		if (IS_ERR(em) || !em)
2686 			goto err;
2687 
2688 		cur_end = min(block_end, extent_map_end(em) - 1);
2689 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2690 		block_off_end = block_off_start + blocksize;
2691 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2692 
2693 		if (!PageUptodate(page) && isnew &&
2694 		    (block_off_end > to || block_off_start < from)) {
2695 			void *kaddr;
2696 
2697 			kaddr = kmap_atomic(page, KM_USER0);
2698 			if (block_off_end > to)
2699 				memset(kaddr + to, 0, block_off_end - to);
2700 			if (block_off_start < from)
2701 				memset(kaddr + block_off_start, 0,
2702 				       from - block_off_start);
2703 			flush_dcache_page(page);
2704 			kunmap_atomic(kaddr, KM_USER0);
2705 		}
2706 		if ((em->block_start != EXTENT_MAP_HOLE &&
2707 		     em->block_start != EXTENT_MAP_INLINE) &&
2708 		    !isnew && !PageUptodate(page) &&
2709 		    (block_off_end > to || block_off_start < from) &&
2710 		    !test_range_bit(tree, block_start, cur_end,
2711 				    EXTENT_UPTODATE, 1)) {
2712 			u64 sector;
2713 			u64 extent_offset = block_start - em->start;
2714 			size_t iosize;
2715 			sector = (em->block_start + extent_offset) >> 9;
2716 			iosize = (cur_end - block_start + blocksize) &
2717 				~((u64)blocksize - 1);
2718 			/*
2719 			 * we've already got the extent locked, but we
2720 			 * need to split the state such that our end_bio
2721 			 * handler can clear the lock.
2722 			 */
2723 			set_extent_bit(tree, block_start,
2724 				       block_start + iosize - 1,
2725 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2726 			ret = submit_extent_page(READ, tree, page,
2727 					 sector, iosize, page_offset, em->bdev,
2728 					 NULL, 1,
2729 					 end_bio_extent_preparewrite, 0,
2730 					 0, 0);
2731 			iocount++;
2732 			block_start = block_start + iosize;
2733 		} else {
2734 			set_extent_uptodate(tree, block_start, cur_end,
2735 					    GFP_NOFS);
2736 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2737 			block_start = cur_end + 1;
2738 		}
2739 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2740 		free_extent_map(em);
2741 	}
2742 	if (iocount) {
2743 		wait_extent_bit(tree, orig_block_start,
2744 				block_end, EXTENT_LOCKED);
2745 	}
2746 	check_page_uptodate(tree, page);
2747 err:
2748 	/* FIXME, zero out newly allocated blocks on error */
2749 	return err;
2750 }
2751 
2752 /*
2753  * a helper for releasepage, this tests for areas of the page that
2754  * are locked or under IO and drops the related state bits if it is safe
2755  * to drop the page.
2756  */
2757 int try_release_extent_state(struct extent_map_tree *map,
2758 			     struct extent_io_tree *tree, struct page *page,
2759 			     gfp_t mask)
2760 {
2761 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2762 	u64 end = start + PAGE_CACHE_SIZE - 1;
2763 	int ret = 1;
2764 
2765 	if (test_range_bit(tree, start, end,
2766 			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
2767 		ret = 0;
2768 	else {
2769 		if ((mask & GFP_NOFS) == GFP_NOFS)
2770 			mask = GFP_NOFS;
2771 		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2772 				 1, 1, mask);
2773 	}
2774 	return ret;
2775 }
2776 
2777 /*
2778  * a helper for releasepage.  As long as there are no locked extents
2779  * in the range corresponding to the page, both state records and extent
2780  * map records are removed
2781  */
2782 int try_release_extent_mapping(struct extent_map_tree *map,
2783 			       struct extent_io_tree *tree, struct page *page,
2784 			       gfp_t mask)
2785 {
2786 	struct extent_map *em;
2787 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2788 	u64 end = start + PAGE_CACHE_SIZE - 1;
2789 
2790 	if ((mask & __GFP_WAIT) &&
2791 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2792 		u64 len;
2793 		while (start <= end) {
2794 			len = end - start + 1;
2795 			spin_lock(&map->lock);
2796 			em = lookup_extent_mapping(map, start, len);
2797 			if (!em || IS_ERR(em)) {
2798 				spin_unlock(&map->lock);
2799 				break;
2800 			}
2801 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2802 			    em->start != start) {
2803 				spin_unlock(&map->lock);
2804 				free_extent_map(em);
2805 				break;
2806 			}
2807 			if (!test_range_bit(tree, em->start,
2808 					    extent_map_end(em) - 1,
2809 					    EXTENT_LOCKED | EXTENT_WRITEBACK |
2810 					    EXTENT_ORDERED,
2811 					    0)) {
2812 				remove_extent_mapping(map, em);
2813 				/* once for the rb tree */
2814 				free_extent_map(em);
2815 			}
2816 			start = extent_map_end(em);
2817 			spin_unlock(&map->lock);
2818 
2819 			/* once for us */
2820 			free_extent_map(em);
2821 		}
2822 	}
2823 	return try_release_extent_state(map, tree, page, mask);
2824 }
2825 
2826 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2827 		get_extent_t *get_extent)
2828 {
2829 	struct inode *inode = mapping->host;
2830 	u64 start = iblock << inode->i_blkbits;
2831 	sector_t sector = 0;
2832 	size_t blksize = (1 << inode->i_blkbits);
2833 	struct extent_map *em;
2834 
2835 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2836 		    GFP_NOFS);
2837 	em = get_extent(inode, NULL, 0, start, blksize, 0);
2838 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2839 		      GFP_NOFS);
2840 	if (!em || IS_ERR(em))
2841 		return 0;
2842 
2843 	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2844 		goto out;
2845 
2846 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2847 out:
2848 	free_extent_map(em);
2849 	return sector;
2850 }
2851 
2852 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2853 		__u64 start, __u64 len, get_extent_t *get_extent)
2854 {
2855 	int ret;
2856 	u64 off = start;
2857 	u64 max = start + len;
2858 	u32 flags = 0;
2859 	u64 disko = 0;
2860 	struct extent_map *em = NULL;
2861 	int end = 0;
2862 	u64 em_start = 0, em_len = 0;
2863 	unsigned long emflags;
2864 	ret = 0;
2865 
2866 	if (len == 0)
2867 		return -EINVAL;
2868 
2869 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
2870 		GFP_NOFS);
2871 	em = get_extent(inode, NULL, 0, off, max - off, 0);
2872 	if (!em)
2873 		goto out;
2874 	if (IS_ERR(em)) {
2875 		ret = PTR_ERR(em);
2876 		goto out;
2877 	}
2878 	while (!end) {
2879 		off = em->start + em->len;
2880 		if (off >= max)
2881 			end = 1;
2882 
2883 		em_start = em->start;
2884 		em_len = em->len;
2885 
2886 		disko = 0;
2887 		flags = 0;
2888 
2889 		switch (em->block_start) {
2890 		case EXTENT_MAP_LAST_BYTE:
2891 			end = 1;
2892 			flags |= FIEMAP_EXTENT_LAST;
2893 			break;
2894 		case EXTENT_MAP_HOLE:
2895 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2896 			break;
2897 		case EXTENT_MAP_INLINE:
2898 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2899 				  FIEMAP_EXTENT_NOT_ALIGNED);
2900 			break;
2901 		case EXTENT_MAP_DELALLOC:
2902 			flags |= (FIEMAP_EXTENT_DELALLOC |
2903 				  FIEMAP_EXTENT_UNKNOWN);
2904 			break;
2905 		default:
2906 			disko = em->block_start;
2907 			break;
2908 		}
2909 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2910 			flags |= FIEMAP_EXTENT_ENCODED;
2911 
2912 		emflags = em->flags;
2913 		free_extent_map(em);
2914 		em = NULL;
2915 
2916 		if (!end) {
2917 			em = get_extent(inode, NULL, 0, off, max - off, 0);
2918 			if (!em)
2919 				goto out;
2920 			if (IS_ERR(em)) {
2921 				ret = PTR_ERR(em);
2922 				goto out;
2923 			}
2924 			emflags = em->flags;
2925 		}
2926 		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2927 			flags |= FIEMAP_EXTENT_LAST;
2928 			end = 1;
2929 		}
2930 
2931 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2932 					em_len, flags);
2933 		if (ret)
2934 			goto out_free;
2935 	}
2936 out_free:
2937 	free_extent_map(em);
2938 out:
2939 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
2940 			GFP_NOFS);
2941 	return ret;
2942 }
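
/*
 * Illustrative sketch of a hypothetical userspace caller (not part of
 * this file, assumes <sys/ioctl.h>, <linux/fs.h> and <linux/fiemap.h>):
 * extent_fiemap() is typically reached through the FS_IOC_FIEMAP ioctl,
 * which a program can issue roughly like this:
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = 32;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 */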
2943 
2944 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2945 					      unsigned long i)
2946 {
2947 	struct page *p;
2948 	struct address_space *mapping;
2949 
2950 	if (i == 0)
2951 		return eb->first_page;
2952 	i += eb->start >> PAGE_CACHE_SHIFT;
2953 	mapping = eb->first_page->mapping;
2954 	if (!mapping)
2955 		return NULL;
2956 
2957 	/*
2958 	 * extent_buffer_page is only called after pinning the page
2959 	 * by increasing the reference count.  So we know the page must
2960 	 * be in the radix tree.
2961 	 */
2962 	rcu_read_lock();
2963 	p = radix_tree_lookup(&mapping->page_tree, i);
2964 	rcu_read_unlock();
2965 
2966 	return p;
2967 }
2968 
2969 static inline unsigned long num_extent_pages(u64 start, u64 len)
2970 {
2971 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2972 		(start >> PAGE_CACHE_SHIFT);
2973 }
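
/*
 * Worked example: with 4K pages, an extent buffer of len 16384 starting
 * at byte 12288 spans pages 3 through 6, and num_extent_pages() returns
 * ((12288 + 16384 + 4095) >> 12) - (12288 >> 12) = 7 - 3 = 4.
 */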
2974 
2975 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2976 						   u64 start,
2977 						   unsigned long len,
2978 						   gfp_t mask)
2979 {
2980 	struct extent_buffer *eb = NULL;
2981 #if LEAK_DEBUG
2982 	unsigned long flags;
2983 #endif
2984 
2985 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2986 	eb->start = start;
2987 	eb->len = len;
2988 	spin_lock_init(&eb->lock);
2989 	init_waitqueue_head(&eb->lock_wq);
2990 
2991 #if LEAK_DEBUG
2992 	spin_lock_irqsave(&leak_lock, flags);
2993 	list_add(&eb->leak_list, &buffers);
2994 	spin_unlock_irqrestore(&leak_lock, flags);
2995 #endif
2996 	atomic_set(&eb->refs, 1);
2997 
2998 	return eb;
2999 }
3000 
3001 static void __free_extent_buffer(struct extent_buffer *eb)
3002 {
3003 #if LEAK_DEBUG
3004 	unsigned long flags;
3005 	spin_lock_irqsave(&leak_lock, flags);
3006 	list_del(&eb->leak_list);
3007 	spin_unlock_irqrestore(&leak_lock, flags);
3008 #endif
3009 	kmem_cache_free(extent_buffer_cache, eb);
3010 }
3011 
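/*
 * alloc_extent_buffer returns the cached buffer for [start, start + len)
 * if one exists, otherwise it allocates a new one, pins the backing pages
 * and inserts it into the per-tree rb tree.  Two callers can race in the
 * window between the initial search (the buffer_lock is dropped while the
 * pages are set up) and buffer_tree_insert(); the loser releases its
 * freshly built buffer and returns the existing one.  Either way a
 * reference is taken for the caller.
 */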
3012 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3013 					  u64 start, unsigned long len,
3014 					  struct page *page0,
3015 					  gfp_t mask)
3016 {
3017 	unsigned long num_pages = num_extent_pages(start, len);
3018 	unsigned long i;
3019 	unsigned long index = start >> PAGE_CACHE_SHIFT;
3020 	struct extent_buffer *eb;
3021 	struct extent_buffer *exists = NULL;
3022 	struct page *p;
3023 	struct address_space *mapping = tree->mapping;
3024 	int uptodate = 1;
3025 
3026 	spin_lock(&tree->buffer_lock);
3027 	eb = buffer_search(tree, start);
3028 	if (eb) {
3029 		atomic_inc(&eb->refs);
3030 		spin_unlock(&tree->buffer_lock);
3031 		mark_page_accessed(eb->first_page);
3032 		return eb;
3033 	}
3034 	spin_unlock(&tree->buffer_lock);
3035 
3036 	eb = __alloc_extent_buffer(tree, start, len, mask);
3037 	if (!eb)
3038 		return NULL;
3039 
3040 	if (page0) {
3041 		eb->first_page = page0;
3042 		i = 1;
3043 		index++;
3044 		page_cache_get(page0);
3045 		mark_page_accessed(page0);
3046 		set_page_extent_mapped(page0);
3047 		set_page_extent_head(page0, len);
3048 		uptodate = PageUptodate(page0);
3049 	} else {
3050 		i = 0;
3051 	}
3052 	for (; i < num_pages; i++, index++) {
3053 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3054 		if (!p) {
3055 			WARN_ON(1);
3056 			goto free_eb;
3057 		}
3058 		set_page_extent_mapped(p);
3059 		mark_page_accessed(p);
3060 		if (i == 0) {
3061 			eb->first_page = p;
3062 			set_page_extent_head(p, len);
3063 		} else {
3064 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3065 		}
3066 		if (!PageUptodate(p))
3067 			uptodate = 0;
3068 		unlock_page(p);
3069 	}
3070 	if (uptodate)
3071 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3072 
3073 	spin_lock(&tree->buffer_lock);
3074 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3075 	if (exists) {
3076 		/* add one reference for the caller */
3077 		atomic_inc(&exists->refs);
3078 		spin_unlock(&tree->buffer_lock);
3079 		goto free_eb;
3080 	}
3081 	spin_unlock(&tree->buffer_lock);
3082 
3083 	/* add one reference for the tree */
3084 	atomic_inc(&eb->refs);
3085 	return eb;
3086 
3087 free_eb:
3088 	if (!atomic_dec_and_test(&eb->refs))
3089 		return exists;
3090 	for (index = 1; index < i; index++)
3091 		page_cache_release(extent_buffer_page(eb, index));
3092 	page_cache_release(extent_buffer_page(eb, 0));
3093 	__free_extent_buffer(eb);
3094 	return exists;
3095 }
3096 
3097 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3098 					 u64 start, unsigned long len,
3099 					  gfp_t mask)
3100 {
3101 	struct extent_buffer *eb;
3102 
3103 	spin_lock(&tree->buffer_lock);
3104 	eb = buffer_search(tree, start);
3105 	if (eb)
3106 		atomic_inc(&eb->refs);
3107 	spin_unlock(&tree->buffer_lock);
3108 
3109 	if (eb)
3110 		mark_page_accessed(eb->first_page);
3111 
3112 	return eb;
3113 }
3114 
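/*
 * drop a caller's reference on an extent buffer.  The tree itself holds a
 * reference (taken in alloc_extent_buffer), so the count should never hit
 * zero here; buffers are actually torn down by try_release_extent_buffer()
 * once only the tree's reference remains.
 */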
3115 void free_extent_buffer(struct extent_buffer *eb)
3116 {
3117 	if (!eb)
3118 		return;
3119 
3120 	if (!atomic_dec_and_test(&eb->refs))
3121 		return;
3122 
3123 	WARN_ON(1);
3124 }
3125 
3126 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3127 			      struct extent_buffer *eb)
3128 {
3129 	int set;
3130 	unsigned long i;
3131 	unsigned long num_pages;
3132 	struct page *page;
3133 
3134 	u64 start = eb->start;
3135 	u64 end = start + eb->len - 1;
3136 
3137 	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3138 	num_pages = num_extent_pages(eb->start, eb->len);
3139 
3140 	for (i = 0; i < num_pages; i++) {
3141 		page = extent_buffer_page(eb, i);
3142 		if (!set && !PageDirty(page))
3143 			continue;
3144 
3145 		lock_page(page);
3146 		if (i == 0)
3147 			set_page_extent_head(page, eb->len);
3148 		else
3149 			set_page_private(page, EXTENT_PAGE_PRIVATE);
3150 
3151 		/*
3152 		 * if we're on the last page or the first page and the
3153 		 * block isn't aligned on a page boundary, do extra checks
3154 		 * to make sure we don't clean a page that is partially dirty
3155 		 */
3156 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3157 		    ((i == num_pages - 1) &&
3158 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3159 			start = (u64)page->index << PAGE_CACHE_SHIFT;
3160 			end  = start + PAGE_CACHE_SIZE - 1;
3161 			if (test_range_bit(tree, start, end,
3162 					   EXTENT_DIRTY, 0)) {
3163 				unlock_page(page);
3164 				continue;
3165 			}
3166 		}
3167 		clear_page_dirty_for_io(page);
3168 		spin_lock_irq(&page->mapping->tree_lock);
3169 		if (!PageDirty(page)) {
3170 			radix_tree_tag_clear(&page->mapping->page_tree,
3171 						page_index(page),
3172 						PAGECACHE_TAG_DIRTY);
3173 		}
3174 		spin_unlock_irq(&page->mapping->tree_lock);
3175 		unlock_page(page);
3176 	}
3177 	return 0;
3178 }
3179 
3180 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3181 				    struct extent_buffer *eb)
3182 {
3183 	return wait_on_extent_writeback(tree, eb->start,
3184 					eb->start + eb->len - 1);
3185 }
3186 
3187 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3188 			     struct extent_buffer *eb)
3189 {
3190 	unsigned long i;
3191 	unsigned long num_pages;
3192 
3193 	num_pages = num_extent_pages(eb->start, eb->len);
3194 	for (i = 0; i < num_pages; i++) {
3195 		struct page *page = extent_buffer_page(eb, i);
3196 		/* writepage may need to do something special for the
3197 		 * first page, we have to make sure page->private is
3198 		 * properly set.  releasepage may drop page->private
3199 		 * on us if the page isn't already dirty.
3200 		 */
3201 		lock_page(page);
3202 		if (i == 0) {
3203 			set_page_extent_head(page, eb->len);
3204 		} else if (PagePrivate(page) &&
3205 			   page->private != EXTENT_PAGE_PRIVATE) {
3206 			set_page_extent_mapped(page);
3207 		}
3208 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3209 		set_extent_dirty(tree, page_offset(page),
3210 				 page_offset(page) + PAGE_CACHE_SIZE - 1,
3211 				 GFP_NOFS);
3212 		unlock_page(page);
3213 	}
3214 	return 0;
3215 }
3216 
3217 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3218 				struct extent_buffer *eb)
3219 {
3220 	unsigned long i;
3221 	struct page *page;
3222 	unsigned long num_pages;
3223 
3224 	num_pages = num_extent_pages(eb->start, eb->len);
3225 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3226 
3227 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3228 			      GFP_NOFS);
3229 	for (i = 0; i < num_pages; i++) {
3230 		page = extent_buffer_page(eb, i);
3231 		if (page)
3232 			ClearPageUptodate(page);
3233 	}
3234 	return 0;
3235 }
3236 
3237 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3238 				struct extent_buffer *eb)
3239 {
3240 	unsigned long i;
3241 	struct page *page;
3242 	unsigned long num_pages;
3243 
3244 	num_pages = num_extent_pages(eb->start, eb->len);
3245 
3246 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3247 			    GFP_NOFS);
3248 	for (i = 0; i < num_pages; i++) {
3249 		page = extent_buffer_page(eb, i);
3250 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3251 		    ((i == num_pages - 1) &&
3252 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3253 			check_page_uptodate(tree, page);
3254 			continue;
3255 		}
3256 		SetPageUptodate(page);
3257 	}
3258 	return 0;
3259 }
3260 
3261 int extent_range_uptodate(struct extent_io_tree *tree,
3262 			  u64 start, u64 end)
3263 {
3264 	struct page *page;
3265 	int ret;
3266 	int pg_uptodate = 1;
3267 	int uptodate;
3268 	unsigned long index;
3269 
3270 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3271 	if (ret)
3272 		return 1;
3273 	while (start <= end) {
3274 		index = start >> PAGE_CACHE_SHIFT;
3275 		page = find_get_page(tree->mapping, index);
3276 		uptodate = PageUptodate(page);
3277 		page_cache_release(page);
3278 		if (!uptodate) {
3279 			pg_uptodate = 0;
3280 			break;
3281 		}
3282 		start += PAGE_CACHE_SIZE;
3283 	}
3284 	return pg_uptodate;
3285 }
3286 
3287 int extent_buffer_uptodate(struct extent_io_tree *tree,
3288 			   struct extent_buffer *eb)
3289 {
3290 	int ret = 0;
3291 	unsigned long num_pages;
3292 	unsigned long i;
3293 	struct page *page;
3294 	int pg_uptodate = 1;
3295 
3296 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3297 		return 1;
3298 
3299 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3300 			   EXTENT_UPTODATE, 1);
3301 	if (ret)
3302 		return ret;
3303 
3304 	num_pages = num_extent_pages(eb->start, eb->len);
3305 	for (i = 0; i < num_pages; i++) {
3306 		page = extent_buffer_page(eb, i);
3307 		if (!PageUptodate(page)) {
3308 			pg_uptodate = 0;
3309 			break;
3310 		}
3311 	}
3312 	return pg_uptodate;
3313 }
3314 
3315 int read_extent_buffer_pages(struct extent_io_tree *tree,
3316 			     struct extent_buffer *eb,
3317 			     u64 start, int wait,
3318 			     get_extent_t *get_extent, int mirror_num)
3319 {
3320 	unsigned long i;
3321 	unsigned long start_i;
3322 	struct page *page;
3323 	int err;
3324 	int ret = 0;
3325 	int locked_pages = 0;
3326 	int all_uptodate = 1;
3327 	int inc_all_pages = 0;
3328 	unsigned long num_pages;
3329 	struct bio *bio = NULL;
3330 	unsigned long bio_flags = 0;
3331 
3332 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3333 		return 0;
3334 
3335 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3336 			   EXTENT_UPTODATE, 1)) {
3337 		return 0;
3338 	}
3339 
3340 	if (start) {
3341 		WARN_ON(start < eb->start);
3342 		start_i = (start >> PAGE_CACHE_SHIFT) -
3343 			(eb->start >> PAGE_CACHE_SHIFT);
3344 	} else {
3345 		start_i = 0;
3346 	}
3347 
3348 	num_pages = num_extent_pages(eb->start, eb->len);
3349 	for (i = start_i; i < num_pages; i++) {
3350 		page = extent_buffer_page(eb, i);
3351 		if (!wait) {
3352 			if (!trylock_page(page))
3353 				goto unlock_exit;
3354 		} else {
3355 			lock_page(page);
3356 		}
3357 		locked_pages++;
3358 		if (!PageUptodate(page))
3359 			all_uptodate = 0;
3360 	}
3361 	if (all_uptodate) {
3362 		if (start_i == 0)
3363 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3364 		goto unlock_exit;
3365 	}
3366 
3367 	for (i = start_i; i < num_pages; i++) {
3368 		page = extent_buffer_page(eb, i);
3369 		if (inc_all_pages)
3370 			page_cache_get(page);
3371 		if (!PageUptodate(page)) {
3372 			if (start_i == 0)
3373 				inc_all_pages = 1;
3374 			ClearPageError(page);
3375 			err = __extent_read_full_page(tree, page,
3376 						      get_extent, &bio,
3377 						      mirror_num, &bio_flags);
3378 			if (err)
3379 				ret = err;
3380 		} else {
3381 			unlock_page(page);
3382 		}
3383 	}
3384 
3385 	if (bio)
3386 		submit_one_bio(READ, bio, mirror_num, bio_flags);
3387 
3388 	if (ret || !wait)
3389 		return ret;
3390 
3391 	for (i = start_i; i < num_pages; i++) {
3392 		page = extent_buffer_page(eb, i);
3393 		wait_on_page_locked(page);
3394 		if (!PageUptodate(page))
3395 			ret = -EIO;
3396 	}
3397 
3398 	if (!ret)
3399 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3400 	return ret;
3401 
3402 unlock_exit:
3403 	i = start_i;
3404 	while (locked_pages > 0) {
3405 		page = extent_buffer_page(eb, i);
3406 		i++;
3407 		unlock_page(page);
3408 		locked_pages--;
3409 	}
3410 	return ret;
3411 }
3412 
3413 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3414 			unsigned long start,
3415 			unsigned long len)
3416 {
3417 	size_t cur;
3418 	size_t offset;
3419 	struct page *page;
3420 	char *kaddr;
3421 	char *dst = (char *)dstv;
3422 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3423 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3424 
3425 	WARN_ON(start > eb->len);
3426 	WARN_ON(start + len > eb->start + eb->len);
3427 
3428 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3429 
3430 	while (len > 0) {
3431 		page = extent_buffer_page(eb, i);
3432 
3433 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3434 		kaddr = kmap_atomic(page, KM_USER1);
3435 		memcpy(dst, kaddr + offset, cur);
3436 		kunmap_atomic(kaddr, KM_USER1);
3437 
3438 		dst += cur;
3439 		len -= cur;
3440 		offset = 0;
3441 		i++;
3442 	}
3443 }
3444 
3445 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3446 			       unsigned long min_len, char **token, char **map,
3447 			       unsigned long *map_start,
3448 			       unsigned long *map_len, int km)
3449 {
3450 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3451 	char *kaddr;
3452 	struct page *p;
3453 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3454 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3455 	unsigned long end_i = (start_offset + start + min_len - 1) >>
3456 		PAGE_CACHE_SHIFT;
3457 
3458 	if (i != end_i)
3459 		return -EINVAL;
3460 
3461 	if (i == 0) {
3462 		offset = start_offset;
3463 		*map_start = 0;
3464 	} else {
3465 		offset = 0;
3466 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3467 	}
3468 
3469 	if (start + min_len > eb->len) {
3470 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3471 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3472 		       eb->len, start, min_len);
3473 		WARN_ON(1);
3474 	}
3475 
3476 	p = extent_buffer_page(eb, i);
3477 	kaddr = kmap_atomic(p, km);
3478 	*token = kaddr;
3479 	*map = kaddr + offset;
3480 	*map_len = PAGE_CACHE_SIZE - offset;
3481 	return 0;
3482 }
3483 
3484 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3485 		      unsigned long min_len,
3486 		      char **token, char **map,
3487 		      unsigned long *map_start,
3488 		      unsigned long *map_len, int km)
3489 {
3490 	int err;
3491 	int save = 0;
3492 	if (eb->map_token) {
3493 		unmap_extent_buffer(eb, eb->map_token, km);
3494 		eb->map_token = NULL;
3495 		save = 1;
3496 	}
3497 	err = map_private_extent_buffer(eb, start, min_len, token, map,
3498 				       map_start, map_len, km);
3499 	if (!err && save) {
3500 		eb->map_token = *token;
3501 		eb->kaddr = *map;
3502 		eb->map_start = *map_start;
3503 		eb->map_len = *map_len;
3504 	}
3505 	return err;
3506 }
3507 
3508 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3509 {
3510 	kunmap_atomic(token, km);
3511 }
3512 
3513 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3514 			  unsigned long start,
3515 			  unsigned long len)
3516 {
3517 	size_t cur;
3518 	size_t offset;
3519 	struct page *page;
3520 	char *kaddr;
3521 	char *ptr = (char *)ptrv;
3522 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3523 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3524 	int ret = 0;
3525 
3526 	WARN_ON(start > eb->len);
3527 	WARN_ON(start + len > eb->start + eb->len);
3528 
3529 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3530 
3531 	while (len > 0) {
3532 		page = extent_buffer_page(eb, i);
3533 
3534 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3535 
3536 		kaddr = kmap_atomic(page, KM_USER0);
3537 		ret = memcmp(ptr, kaddr + offset, cur);
3538 		kunmap_atomic(kaddr, KM_USER0);
3539 		if (ret)
3540 			break;
3541 
3542 		ptr += cur;
3543 		len -= cur;
3544 		offset = 0;
3545 		i++;
3546 	}
3547 	return ret;
3548 }
3549 
3550 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3551 			 unsigned long start, unsigned long len)
3552 {
3553 	size_t cur;
3554 	size_t offset;
3555 	struct page *page;
3556 	char *kaddr;
3557 	char *src = (char *)srcv;
3558 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3559 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3560 
3561 	WARN_ON(start > eb->len);
3562 	WARN_ON(start + len > eb->start + eb->len);
3563 
3564 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3565 
3566 	while (len > 0) {
3567 		page = extent_buffer_page(eb, i);
3568 		WARN_ON(!PageUptodate(page));
3569 
3570 		cur = min(len, PAGE_CACHE_SIZE - offset);
3571 		kaddr = kmap_atomic(page, KM_USER1);
3572 		memcpy(kaddr + offset, src, cur);
3573 		kunmap_atomic(kaddr, KM_USER1);
3574 
3575 		src += cur;
3576 		len -= cur;
3577 		offset = 0;
3578 		i++;
3579 	}
3580 }
3581 
3582 void memset_extent_buffer(struct extent_buffer *eb, char c,
3583 			  unsigned long start, unsigned long len)
3584 {
3585 	size_t cur;
3586 	size_t offset;
3587 	struct page *page;
3588 	char *kaddr;
3589 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3590 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3591 
3592 	WARN_ON(start > eb->len);
3593 	WARN_ON(start + len > eb->start + eb->len);
3594 
3595 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3596 
3597 	while (len > 0) {
3598 		page = extent_buffer_page(eb, i);
3599 		WARN_ON(!PageUptodate(page));
3600 
3601 		cur = min(len, PAGE_CACHE_SIZE - offset);
3602 		kaddr = kmap_atomic(page, KM_USER0);
3603 		memset(kaddr + offset, c, cur);
3604 		kunmap_atomic(kaddr, KM_USER0);
3605 
3606 		len -= cur;
3607 		offset = 0;
3608 		i++;
3609 	}
3610 }
3611 
3612 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3613 			unsigned long dst_offset, unsigned long src_offset,
3614 			unsigned long len)
3615 {
3616 	u64 dst_len = dst->len;
3617 	size_t cur;
3618 	size_t offset;
3619 	struct page *page;
3620 	char *kaddr;
3621 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3622 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3623 
3624 	WARN_ON(src->len != dst_len);
3625 
3626 	offset = (start_offset + dst_offset) &
3627 		((unsigned long)PAGE_CACHE_SIZE - 1);
3628 
3629 	while (len > 0) {
3630 		page = extent_buffer_page(dst, i);
3631 		WARN_ON(!PageUptodate(page));
3632 
3633 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3634 
3635 		kaddr = kmap_atomic(page, KM_USER0);
3636 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3637 		kunmap_atomic(kaddr, KM_USER0);
3638 
3639 		src_offset += cur;
3640 		len -= cur;
3641 		offset = 0;
3642 		i++;
3643 	}
3644 }
3645 
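/*
 * move_pages is the per-page helper for memmove_extent_buffer(): when the
 * destination range sits above the source, memmove_extent_buffer() walks
 * the buffer from the end and this helper copies each chunk highest byte
 * first (falling back to memmove() when both offsets land in the same
 * page), so overlapping ranges are handled correctly.
 */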
3646 static void move_pages(struct page *dst_page, struct page *src_page,
3647 		       unsigned long dst_off, unsigned long src_off,
3648 		       unsigned long len)
3649 {
3650 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3651 	if (dst_page == src_page) {
3652 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3653 	} else {
3654 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3655 		char *p = dst_kaddr + dst_off + len;
3656 		char *s = src_kaddr + src_off + len;
3657 
3658 		while (len--)
3659 			*--p = *--s;
3660 
3661 		kunmap_atomic(src_kaddr, KM_USER1);
3662 	}
3663 	kunmap_atomic(dst_kaddr, KM_USER0);
3664 }
3665 
3666 static void copy_pages(struct page *dst_page, struct page *src_page,
3667 		       unsigned long dst_off, unsigned long src_off,
3668 		       unsigned long len)
3669 {
3670 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3671 	char *src_kaddr;
3672 
3673 	if (dst_page != src_page)
3674 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3675 	else
3676 		src_kaddr = dst_kaddr;
3677 
3678 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3679 	kunmap_atomic(dst_kaddr, KM_USER0);
3680 	if (dst_page != src_page)
3681 		kunmap_atomic(src_kaddr, KM_USER1);
3682 }
3683 
3684 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3685 			   unsigned long src_offset, unsigned long len)
3686 {
3687 	size_t cur;
3688 	size_t dst_off_in_page;
3689 	size_t src_off_in_page;
3690 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3691 	unsigned long dst_i;
3692 	unsigned long src_i;
3693 
3694 	if (src_offset + len > dst->len) {
3695 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3696 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3697 		BUG_ON(1);
3698 	}
3699 	if (dst_offset + len > dst->len) {
3700 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3701 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3702 		BUG_ON(1);
3703 	}
3704 
3705 	while (len > 0) {
3706 		dst_off_in_page = (start_offset + dst_offset) &
3707 			((unsigned long)PAGE_CACHE_SIZE - 1);
3708 		src_off_in_page = (start_offset + src_offset) &
3709 			((unsigned long)PAGE_CACHE_SIZE - 1);
3710 
3711 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3712 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3713 
3714 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3715 					       src_off_in_page));
3716 		cur = min_t(unsigned long, cur,
3717 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3718 
3719 		copy_pages(extent_buffer_page(dst, dst_i),
3720 			   extent_buffer_page(dst, src_i),
3721 			   dst_off_in_page, src_off_in_page, cur);
3722 
3723 		src_offset += cur;
3724 		dst_offset += cur;
3725 		len -= cur;
3726 	}
3727 }
3728 
3729 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3730 			   unsigned long src_offset, unsigned long len)
3731 {
3732 	size_t cur;
3733 	size_t dst_off_in_page;
3734 	size_t src_off_in_page;
3735 	unsigned long dst_end = dst_offset + len - 1;
3736 	unsigned long src_end = src_offset + len - 1;
3737 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3738 	unsigned long dst_i;
3739 	unsigned long src_i;
3740 
3741 	if (src_offset + len > dst->len) {
3742 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3743 		       "len %lu len %lu\n", src_offset, len, dst->len);
3744 		BUG_ON(1);
3745 	}
3746 	if (dst_offset + len > dst->len) {
3747 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3748 		       "len %lu len %lu\n", dst_offset, len, dst->len);
3749 		BUG_ON(1);
3750 	}
3751 	if (dst_offset < src_offset) {
3752 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3753 		return;
3754 	}
3755 	while (len > 0) {
3756 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3757 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3758 
3759 		dst_off_in_page = (start_offset + dst_end) &
3760 			((unsigned long)PAGE_CACHE_SIZE - 1);
3761 		src_off_in_page = (start_offset + src_end) &
3762 			((unsigned long)PAGE_CACHE_SIZE - 1);
3763 
3764 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3765 		cur = min(cur, dst_off_in_page + 1);
3766 		move_pages(extent_buffer_page(dst, dst_i),
3767 			   extent_buffer_page(dst, src_i),
3768 			   dst_off_in_page - cur + 1,
3769 			   src_off_in_page - cur + 1, cur);
3770 
3771 		dst_end -= cur;
3772 		src_end -= cur;
3773 		len -= cur;
3774 	}
3775 }
3776 
3777 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3778 {
3779 	u64 start = page_offset(page);
3780 	struct extent_buffer *eb;
3781 	int ret = 1;
3782 	unsigned long i;
3783 	unsigned long num_pages;
3784 
3785 	spin_lock(&tree->buffer_lock);
3786 	eb = buffer_search(tree, start);
3787 	if (!eb)
3788 		goto out;
3789 
3790 	if (atomic_read(&eb->refs) > 1) {
3791 		ret = 0;
3792 		goto out;
3793 	}
3794 	/* at this point we can safely release the extent buffer */
3795 	num_pages = num_extent_pages(eb->start, eb->len);
3796 	for (i = 0; i < num_pages; i++)
3797 		page_cache_release(extent_buffer_page(eb, i));
3798 	rb_erase(&eb->rb_node, &tree->buffer);
3799 	__free_extent_buffer(eb);
3800 out:
3801 	spin_unlock(&tree->buffer_lock);
3802 	return ret;
3803 }
3804