xref: /linux/fs/btrfs/extent_io.c (revision 9601e3f6336f6ca66929f451b1f66085e68e36e3)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 
20 static struct kmem_cache *extent_state_cache;
21 static struct kmem_cache *extent_buffer_cache;
22 
23 static LIST_HEAD(buffers);
24 static LIST_HEAD(states);
25 
26 #define LEAK_DEBUG 0
27 #if LEAK_DEBUG
28 static DEFINE_SPINLOCK(leak_lock);
29 #endif
30 
31 #define BUFFER_LRU_MAX 64
32 
33 struct tree_entry {
34 	u64 start;
35 	u64 end;
36 	struct rb_node rb_node;
37 };
38 
39 struct extent_page_data {
40 	struct bio *bio;
41 	struct extent_io_tree *tree;
42 	get_extent_t *get_extent;
43 
44 	/* tells writepage not to lock the state bits for this range
45 	 * it still does the unlocking
46 	 */
47 	unsigned int extent_locked:1;
48 
49 	/* tells the submit_bio code to use a WRITE_SYNC */
50 	unsigned int sync_io:1;
51 };
52 
53 int __init extent_io_init(void)
54 {
55 	extent_state_cache = kmem_cache_create("extent_state",
56 			sizeof(struct extent_state), 0,
57 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
58 	if (!extent_state_cache)
59 		return -ENOMEM;
60 
61 	extent_buffer_cache = kmem_cache_create("extent_buffers",
62 			sizeof(struct extent_buffer), 0,
63 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
64 	if (!extent_buffer_cache)
65 		goto free_state_cache;
66 	return 0;
67 
68 free_state_cache:
69 	kmem_cache_destroy(extent_state_cache);
70 	return -ENOMEM;
71 }
72 
73 void extent_io_exit(void)
74 {
75 	struct extent_state *state;
76 	struct extent_buffer *eb;
77 
78 	while (!list_empty(&states)) {
79 		state = list_entry(states.next, struct extent_state, leak_list);
80 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
81 		       "state %lu in tree %p refs %d\n",
82 		       (unsigned long long)state->start,
83 		       (unsigned long long)state->end,
84 		       state->state, state->tree, atomic_read(&state->refs));
85 		list_del(&state->leak_list);
86 		kmem_cache_free(extent_state_cache, state);
87 
88 	}
89 
90 	while (!list_empty(&buffers)) {
91 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
92 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
93 		       "refs %d\n", (unsigned long long)eb->start,
94 		       eb->len, atomic_read(&eb->refs));
95 		list_del(&eb->leak_list);
96 		kmem_cache_free(extent_buffer_cache, eb);
97 	}
98 	if (extent_state_cache)
99 		kmem_cache_destroy(extent_state_cache);
100 	if (extent_buffer_cache)
101 		kmem_cache_destroy(extent_buffer_cache);
102 }
103 
104 void extent_io_tree_init(struct extent_io_tree *tree,
105 			  struct address_space *mapping, gfp_t mask)
106 {
107 	tree->state.rb_node = NULL;
108 	tree->buffer.rb_node = NULL;
109 	tree->ops = NULL;
110 	tree->dirty_bytes = 0;
111 	spin_lock_init(&tree->lock);
112 	spin_lock_init(&tree->buffer_lock);
113 	tree->mapping = mapping;
114 }
115 
116 static struct extent_state *alloc_extent_state(gfp_t mask)
117 {
118 	struct extent_state *state;
119 #if LEAK_DEBUG
120 	unsigned long flags;
121 #endif
122 
123 	state = kmem_cache_alloc(extent_state_cache, mask);
124 	if (!state)
125 		return state;
126 	state->state = 0;
127 	state->private = 0;
128 	state->tree = NULL;
129 #if LEAK_DEBUG
130 	spin_lock_irqsave(&leak_lock, flags);
131 	list_add(&state->leak_list, &states);
132 	spin_unlock_irqrestore(&leak_lock, flags);
133 #endif
134 	atomic_set(&state->refs, 1);
135 	init_waitqueue_head(&state->wq);
136 	return state;
137 }
138 
139 static void free_extent_state(struct extent_state *state)
140 {
141 	if (!state)
142 		return;
143 	if (atomic_dec_and_test(&state->refs)) {
144 #if LEAK_DEBUG
145 		unsigned long flags;
146 #endif
147 		WARN_ON(state->tree);
148 #if LEAK_DEBUG
149 		spin_lock_irqsave(&leak_lock, flags);
150 		list_del(&state->leak_list);
151 		spin_unlock_irqrestore(&leak_lock, flags);
152 #endif
153 		kmem_cache_free(extent_state_cache, state);
154 	}
155 }
156 
157 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
158 				   struct rb_node *node)
159 {
160 	struct rb_node **p = &root->rb_node;
161 	struct rb_node *parent = NULL;
162 	struct tree_entry *entry;
163 
164 	while (*p) {
165 		parent = *p;
166 		entry = rb_entry(parent, struct tree_entry, rb_node);
167 
168 		if (offset < entry->start)
169 			p = &(*p)->rb_left;
170 		else if (offset > entry->end)
171 			p = &(*p)->rb_right;
172 		else
173 			return parent;
174 	}
175 
176 	entry = rb_entry(node, struct tree_entry, rb_node);
177 	rb_link_node(node, parent, p);
178 	rb_insert_color(node, root);
179 	return NULL;
180 }
181 
182 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
183 				     struct rb_node **prev_ret,
184 				     struct rb_node **next_ret)
185 {
186 	struct rb_root *root = &tree->state;
187 	struct rb_node *n = root->rb_node;
188 	struct rb_node *prev = NULL;
189 	struct rb_node *orig_prev = NULL;
190 	struct tree_entry *entry;
191 	struct tree_entry *prev_entry = NULL;
192 
193 	while (n) {
194 		entry = rb_entry(n, struct tree_entry, rb_node);
195 		prev = n;
196 		prev_entry = entry;
197 
198 		if (offset < entry->start)
199 			n = n->rb_left;
200 		else if (offset > entry->end)
201 			n = n->rb_right;
202 		else
203 			return n;
204 	}
205 
206 	if (prev_ret) {
207 		orig_prev = prev;
208 		while (prev && offset > prev_entry->end) {
209 			prev = rb_next(prev);
210 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211 		}
212 		*prev_ret = prev;
213 		prev = orig_prev;
214 	}
215 
216 	if (next_ret) {
217 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
218 		while (prev && offset < prev_entry->start) {
219 			prev = rb_prev(prev);
220 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		}
222 		*next_ret = prev;
223 	}
224 	return NULL;
225 }
226 
227 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
228 					  u64 offset)
229 {
230 	struct rb_node *prev = NULL;
231 	struct rb_node *ret;
232 
233 	ret = __etree_search(tree, offset, &prev, NULL);
234 	if (!ret)
235 		return prev;
236 	return ret;
237 }
238 
239 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
240 					  u64 offset, struct rb_node *node)
241 {
242 	struct rb_root *root = &tree->buffer;
243 	struct rb_node **p = &root->rb_node;
244 	struct rb_node *parent = NULL;
245 	struct extent_buffer *eb;
246 
247 	while (*p) {
248 		parent = *p;
249 		eb = rb_entry(parent, struct extent_buffer, rb_node);
250 
251 		if (offset < eb->start)
252 			p = &(*p)->rb_left;
253 		else if (offset > eb->start)
254 			p = &(*p)->rb_right;
255 		else
256 			return eb;
257 	}
258 
259 	rb_link_node(node, parent, p);
260 	rb_insert_color(node, root);
261 	return NULL;
262 }
263 
264 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
265 					   u64 offset)
266 {
267 	struct rb_root *root = &tree->buffer;
268 	struct rb_node *n = root->rb_node;
269 	struct extent_buffer *eb;
270 
271 	while (n) {
272 		eb = rb_entry(n, struct extent_buffer, rb_node);
273 		if (offset < eb->start)
274 			n = n->rb_left;
275 		else if (offset > eb->start)
276 			n = n->rb_right;
277 		else
278 			return eb;
279 	}
280 	return NULL;
281 }
282 
283 /*
284  * utility function to look for merge candidates inside a given range.
285  * Any extents with matching state are merged together into a single
286  * extent in the tree.  Extents with EXTENT_IO in their state field
287  * are not merged because the end_io handlers need to be able to do
288  * operations on them without sleeping (or doing allocations/splits).
289  *
290  * This should be called with the tree lock held.
291  */
292 static int merge_state(struct extent_io_tree *tree,
293 		       struct extent_state *state)
294 {
295 	struct extent_state *other;
296 	struct rb_node *other_node;
297 
298 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
299 		return 0;
300 
301 	other_node = rb_prev(&state->rb_node);
302 	if (other_node) {
303 		other = rb_entry(other_node, struct extent_state, rb_node);
304 		if (other->end == state->start - 1 &&
305 		    other->state == state->state) {
306 			state->start = other->start;
307 			other->tree = NULL;
308 			rb_erase(&other->rb_node, &tree->state);
309 			free_extent_state(other);
310 		}
311 	}
312 	other_node = rb_next(&state->rb_node);
313 	if (other_node) {
314 		other = rb_entry(other_node, struct extent_state, rb_node);
315 		if (other->start == state->end + 1 &&
316 		    other->state == state->state) {
317 			other->start = state->start;
318 			state->tree = NULL;
319 			rb_erase(&state->rb_node, &tree->state);
320 			free_extent_state(state);
321 		}
322 	}
323 	return 0;
324 }
325 
326 static void set_state_cb(struct extent_io_tree *tree,
327 			 struct extent_state *state,
328 			 unsigned long bits)
329 {
330 	if (tree->ops && tree->ops->set_bit_hook) {
331 		tree->ops->set_bit_hook(tree->mapping->host, state->start,
332 					state->end, state->state, bits);
333 	}
334 }
335 
336 static void clear_state_cb(struct extent_io_tree *tree,
337 			   struct extent_state *state,
338 			   unsigned long bits)
339 {
340 	if (tree->ops && tree->ops->clear_bit_hook) {
341 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
342 					  state->end, state->state, bits);
343 	}
344 }
345 
346 /*
347  * insert an extent_state struct into the tree.  'bits' are set on the
348  * struct before it is inserted.
349  *
350  * This may return -EEXIST if the extent is already there, in which case the
351  * state struct is freed.
352  *
353  * The tree lock is not taken internally.  This is a utility function and
354  * probably isn't what you want to call (see set/clear_extent_bit).
355  */
356 static int insert_state(struct extent_io_tree *tree,
357 			struct extent_state *state, u64 start, u64 end,
358 			int bits)
359 {
360 	struct rb_node *node;
361 
362 	if (end < start) {
363 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
364 		       (unsigned long long)end,
365 		       (unsigned long long)start);
366 		WARN_ON(1);
367 	}
368 	if (bits & EXTENT_DIRTY)
369 		tree->dirty_bytes += end - start + 1;
370 	set_state_cb(tree, state, bits);
371 	state->state |= bits;
372 	state->start = start;
373 	state->end = end;
374 	node = tree_insert(&tree->state, end, &state->rb_node);
375 	if (node) {
376 		struct extent_state *found;
377 		found = rb_entry(node, struct extent_state, rb_node);
378 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
379 		       "%llu %llu\n", (unsigned long long)found->start,
380 		       (unsigned long long)found->end,
381 		       (unsigned long long)start, (unsigned long long)end);
382 		free_extent_state(state);
383 		return -EEXIST;
384 	}
385 	state->tree = tree;
386 	merge_state(tree, state);
387 	return 0;
388 }
389 
390 /*
391  * split a given extent state struct in two, inserting the preallocated
392  * struct 'prealloc' as the newly created second half.  'split' indicates an
393  * offset inside 'orig' where it should be split.
394  *
395  * Before calling,
396  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
397  * are two extent state structs in the tree:
398  * prealloc: [orig->start, split - 1]
399  * orig: [ split, orig->end ]
400  *
401  * The tree locks are not taken by this function. They need to be held
402  * by the caller.
403  */
404 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 		       struct extent_state *prealloc, u64 split)
406 {
407 	struct rb_node *node;
408 	prealloc->start = orig->start;
409 	prealloc->end = split - 1;
410 	prealloc->state = orig->state;
411 	orig->start = split;
412 
413 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
414 	if (node) {
415 		free_extent_state(prealloc);
416 		return -EEXIST;
417 	}
418 	prealloc->tree = tree;
419 	return 0;
420 }
421 
422 /*
423  * utility function to clear some bits in an extent state struct.
424  * it will optionally wake up any one waiting on this state (wake == 1), or
425  * forcibly remove the state from the tree (delete == 1).
426  *
427  * If no bits are set on the state struct after clearing things, the
428  * struct is freed and removed from the tree
429  */
430 static int clear_state_bit(struct extent_io_tree *tree,
431 			    struct extent_state *state, int bits, int wake,
432 			    int delete)
433 {
434 	int ret = state->state & bits;
435 
436 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437 		u64 range = state->end - state->start + 1;
438 		WARN_ON(range > tree->dirty_bytes);
439 		tree->dirty_bytes -= range;
440 	}
441 	clear_state_cb(tree, state, bits);
442 	state->state &= ~bits;
443 	if (wake)
444 		wake_up(&state->wq);
445 	if (delete || state->state == 0) {
446 		if (state->tree) {
447 			clear_state_cb(tree, state, state->state);
448 			rb_erase(&state->rb_node, &tree->state);
449 			state->tree = NULL;
450 			free_extent_state(state);
451 		} else {
452 			WARN_ON(1);
453 		}
454 	} else {
455 		merge_state(tree, state);
456 	}
457 	return ret;
458 }
459 
460 /*
461  * clear some bits on a range in the tree.  This may require splitting
462  * or inserting elements in the tree, so the gfp mask is used to
463  * indicate which allocations or sleeping are allowed.
464  *
465  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
466  * the given range from the tree regardless of state (ie for truncate).
467  *
468  * the range [start, end] is inclusive.
469  *
470  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
471  * bits were already set, or zero if none of the bits were already set.
472  */
473 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
474 		     int bits, int wake, int delete, gfp_t mask)
475 {
476 	struct extent_state *state;
477 	struct extent_state *prealloc = NULL;
478 	struct rb_node *node;
479 	int err;
480 	int set = 0;
481 
482 again:
483 	if (!prealloc && (mask & __GFP_WAIT)) {
484 		prealloc = alloc_extent_state(mask);
485 		if (!prealloc)
486 			return -ENOMEM;
487 	}
488 
489 	spin_lock(&tree->lock);
490 	/*
491 	 * this search will find the extents that end after
492 	 * our range starts
493 	 */
494 	node = tree_search(tree, start);
495 	if (!node)
496 		goto out;
497 	state = rb_entry(node, struct extent_state, rb_node);
498 	if (state->start > end)
499 		goto out;
500 	WARN_ON(state->end < start);
501 
502 	/*
503 	 *     | ---- desired range ---- |
504 	 *  | state | or
505 	 *  | ------------- state -------------- |
506 	 *
507 	 * We need to split the extent we found, and may flip
508 	 * bits on second half.
509 	 *
510 	 * If the extent we found extends past our range, we
511 	 * just split and search again.  It'll get split again
512 	 * the next time though.
513 	 *
514 	 * If the extent we found is inside our range, we clear
515 	 * the desired bit on it.
516 	 */
517 
518 	if (state->start < start) {
519 		if (!prealloc)
520 			prealloc = alloc_extent_state(GFP_ATOMIC);
521 		err = split_state(tree, state, prealloc, start);
522 		BUG_ON(err == -EEXIST);
523 		prealloc = NULL;
524 		if (err)
525 			goto out;
526 		if (state->end <= end) {
527 			start = state->end + 1;
528 			set |= clear_state_bit(tree, state, bits,
529 					wake, delete);
530 		} else {
531 			start = state->start;
532 		}
533 		goto search_again;
534 	}
535 	/*
536 	 * | ---- desired range ---- |
537 	 *                        | state |
538 	 * We need to split the extent, and clear the bit
539 	 * on the first half
540 	 */
541 	if (state->start <= end && state->end > end) {
542 		if (!prealloc)
543 			prealloc = alloc_extent_state(GFP_ATOMIC);
544 		err = split_state(tree, state, prealloc, end + 1);
545 		BUG_ON(err == -EEXIST);
546 
547 		if (wake)
548 			wake_up(&state->wq);
549 		set |= clear_state_bit(tree, prealloc, bits,
550 				       wake, delete);
551 		prealloc = NULL;
552 		goto out;
553 	}
554 
555 	start = state->end + 1;
556 	set |= clear_state_bit(tree, state, bits, wake, delete);
557 	goto search_again;
558 
559 out:
560 	spin_unlock(&tree->lock);
561 	if (prealloc)
562 		free_extent_state(prealloc);
563 
564 	return set;
565 
566 search_again:
567 	if (start > end)
568 		goto out;
569 	spin_unlock(&tree->lock);
570 	if (mask & __GFP_WAIT)
571 		cond_resched();
572 	goto again;
573 }
574 
575 static int wait_on_state(struct extent_io_tree *tree,
576 			 struct extent_state *state)
577 		__releases(tree->lock)
578 		__acquires(tree->lock)
579 {
580 	DEFINE_WAIT(wait);
581 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
582 	spin_unlock(&tree->lock);
583 	schedule();
584 	spin_lock(&tree->lock);
585 	finish_wait(&state->wq, &wait);
586 	return 0;
587 }
588 
589 /*
590  * waits for one or more bits to clear on a range in the state tree.
591  * The range [start, end] is inclusive.
592  * The tree lock is taken by this function
593  */
594 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
595 {
596 	struct extent_state *state;
597 	struct rb_node *node;
598 
599 	spin_lock(&tree->lock);
600 again:
601 	while (1) {
602 		/*
603 		 * this search will find all the extents that end after
604 		 * our range starts
605 		 */
606 		node = tree_search(tree, start);
607 		if (!node)
608 			break;
609 
610 		state = rb_entry(node, struct extent_state, rb_node);
611 
612 		if (state->start > end)
613 			goto out;
614 
615 		if (state->state & bits) {
616 			start = state->start;
617 			atomic_inc(&state->refs);
618 			wait_on_state(tree, state);
619 			free_extent_state(state);
620 			goto again;
621 		}
622 		start = state->end + 1;
623 
624 		if (start > end)
625 			break;
626 
627 		if (need_resched()) {
628 			spin_unlock(&tree->lock);
629 			cond_resched();
630 			spin_lock(&tree->lock);
631 		}
632 	}
633 out:
634 	spin_unlock(&tree->lock);
635 	return 0;
636 }
637 
638 static void set_state_bits(struct extent_io_tree *tree,
639 			   struct extent_state *state,
640 			   int bits)
641 {
642 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
643 		u64 range = state->end - state->start + 1;
644 		tree->dirty_bytes += range;
645 	}
646 	set_state_cb(tree, state, bits);
647 	state->state |= bits;
648 }
649 
650 /*
651  * set some bits on a range in the tree.  This may require allocations
652  * or sleeping, so the gfp mask is used to indicate what is allowed.
653  *
654  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
655  * range already has the desired bits set.  The start of the existing
656  * range is returned in failed_start in this case.
657  *
658  * [start, end] is inclusive
659  * This takes the tree lock.
660  */
661 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
662 			  int bits, int exclusive, u64 *failed_start,
663 			  gfp_t mask)
664 {
665 	struct extent_state *state;
666 	struct extent_state *prealloc = NULL;
667 	struct rb_node *node;
668 	int err = 0;
669 	int set;
670 	u64 last_start;
671 	u64 last_end;
672 again:
673 	if (!prealloc && (mask & __GFP_WAIT)) {
674 		prealloc = alloc_extent_state(mask);
675 		if (!prealloc)
676 			return -ENOMEM;
677 	}
678 
679 	spin_lock(&tree->lock);
680 	/*
681 	 * this search will find all the extents that end after
682 	 * our range starts.
683 	 */
684 	node = tree_search(tree, start);
685 	if (!node) {
686 		err = insert_state(tree, prealloc, start, end, bits);
687 		prealloc = NULL;
688 		BUG_ON(err == -EEXIST);
689 		goto out;
690 	}
691 
692 	state = rb_entry(node, struct extent_state, rb_node);
693 	last_start = state->start;
694 	last_end = state->end;
695 
696 	/*
697 	 * | ---- desired range ---- |
698 	 * | state |
699 	 *
700 	 * Just lock what we found and keep going
701 	 */
702 	if (state->start == start && state->end <= end) {
703 		set = state->state & bits;
704 		if (set && exclusive) {
705 			*failed_start = state->start;
706 			err = -EEXIST;
707 			goto out;
708 		}
709 		set_state_bits(tree, state, bits);
710 		start = state->end + 1;
711 		merge_state(tree, state);
712 		goto search_again;
713 	}
714 
715 	/*
716 	 *     | ---- desired range ---- |
717 	 * | state |
718 	 *   or
719 	 * | ------------- state -------------- |
720 	 *
721 	 * We need to split the extent we found, and may flip bits on
722 	 * second half.
723 	 *
724 	 * If the extent we found extends past our
725 	 * range, we just split and search again.  It'll get split
726 	 * again the next time though.
727 	 *
728 	 * If the extent we found is inside our range, we set the
729 	 * desired bit on it.
730 	 */
731 	if (state->start < start) {
732 		set = state->state & bits;
733 		if (exclusive && set) {
734 			*failed_start = start;
735 			err = -EEXIST;
736 			goto out;
737 		}
738 		err = split_state(tree, state, prealloc, start);
739 		BUG_ON(err == -EEXIST);
740 		prealloc = NULL;
741 		if (err)
742 			goto out;
743 		if (state->end <= end) {
744 			set_state_bits(tree, state, bits);
745 			start = state->end + 1;
746 			merge_state(tree, state);
747 		} else {
748 			start = state->start;
749 		}
750 		goto search_again;
751 	}
752 	/*
753 	 * | ---- desired range ---- |
754 	 *     | state | or               | state |
755 	 *
756 	 * There's a hole, we need to insert something in it and
757 	 * ignore the extent we found.
758 	 */
759 	if (state->start > start) {
760 		u64 this_end;
761 		if (end < last_start)
762 			this_end = end;
763 		else
764 			this_end = last_start - 1;
765 		err = insert_state(tree, prealloc, start, this_end,
766 				   bits);
767 		prealloc = NULL;
768 		BUG_ON(err == -EEXIST);
769 		if (err)
770 			goto out;
771 		start = this_end + 1;
772 		goto search_again;
773 	}
774 	/*
775 	 * | ---- desired range ---- |
776 	 *                        | state |
777 	 * We need to split the extent, and set the bit
778 	 * on the first half
779 	 */
780 	if (state->start <= end && state->end > end) {
781 		set = state->state & bits;
782 		if (exclusive && set) {
783 			*failed_start = start;
784 			err = -EEXIST;
785 			goto out;
786 		}
787 		err = split_state(tree, state, prealloc, end + 1);
788 		BUG_ON(err == -EEXIST);
789 
790 		set_state_bits(tree, prealloc, bits);
791 		merge_state(tree, prealloc);
792 		prealloc = NULL;
793 		goto out;
794 	}
795 
796 	goto search_again;
797 
798 out:
799 	spin_unlock(&tree->lock);
800 	if (prealloc)
801 		free_extent_state(prealloc);
802 
803 	return err;
804 
805 search_again:
806 	if (start > end)
807 		goto out;
808 	spin_unlock(&tree->lock);
809 	if (mask & __GFP_WAIT)
810 		cond_resched();
811 	goto again;
812 }
813 
814 /* wrappers around set/clear extent bit */
815 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
816 		     gfp_t mask)
817 {
818 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
819 			      mask);
820 }
821 
822 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
823 		       gfp_t mask)
824 {
825 	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
826 }
827 
828 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
829 		    int bits, gfp_t mask)
830 {
831 	return set_extent_bit(tree, start, end, bits, 0, NULL,
832 			      mask);
833 }
834 
835 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
836 		      int bits, gfp_t mask)
837 {
838 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
839 }
840 
841 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
842 		     gfp_t mask)
843 {
844 	return set_extent_bit(tree, start, end,
845 			      EXTENT_DELALLOC | EXTENT_DIRTY,
846 			      0, NULL, mask);
847 }
848 
849 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
850 		       gfp_t mask)
851 {
852 	return clear_extent_bit(tree, start, end,
853 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
854 }
855 
856 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
857 			 gfp_t mask)
858 {
859 	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
860 }
861 
862 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
863 		     gfp_t mask)
864 {
865 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
866 			      mask);
867 }
868 
869 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
870 		       gfp_t mask)
871 {
872 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
873 }
874 
875 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
876 			gfp_t mask)
877 {
878 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
879 			      mask);
880 }
881 
882 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
883 				 u64 end, gfp_t mask)
884 {
885 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
886 }
887 
888 static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
889 			 gfp_t mask)
890 {
891 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
892 			      0, NULL, mask);
893 }
894 
895 static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
896 				  u64 end, gfp_t mask)
897 {
898 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
899 }
900 
901 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
902 {
903 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
904 }
905 
906 /*
907  * either insert or lock state struct between start and end use mask to tell
908  * us if waiting is desired.
909  */
910 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
911 {
912 	int err;
913 	u64 failed_start;
914 	while (1) {
915 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
916 				     &failed_start, mask);
917 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
918 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
919 			start = failed_start;
920 		} else {
921 			break;
922 		}
923 		WARN_ON(start > end);
924 	}
925 	return err;
926 }
927 
928 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
929 		    gfp_t mask)
930 {
931 	int err;
932 	u64 failed_start;
933 
934 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
935 			     &failed_start, mask);
936 	if (err == -EEXIST) {
937 		if (failed_start > start)
938 			clear_extent_bit(tree, start, failed_start - 1,
939 					 EXTENT_LOCKED, 1, 0, mask);
940 		return 0;
941 	}
942 	return 1;
943 }
944 
945 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
946 		  gfp_t mask)
947 {
948 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
949 }
950 
951 /*
952  * helper function to set pages and extents in the tree dirty
953  */
954 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
955 {
956 	unsigned long index = start >> PAGE_CACHE_SHIFT;
957 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
958 	struct page *page;
959 
960 	while (index <= end_index) {
961 		page = find_get_page(tree->mapping, index);
962 		BUG_ON(!page);
963 		__set_page_dirty_nobuffers(page);
964 		page_cache_release(page);
965 		index++;
966 	}
967 	set_extent_dirty(tree, start, end, GFP_NOFS);
968 	return 0;
969 }
970 
971 /*
972  * helper function to set both pages and extents in the tree writeback
973  */
974 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
975 {
976 	unsigned long index = start >> PAGE_CACHE_SHIFT;
977 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
978 	struct page *page;
979 
980 	while (index <= end_index) {
981 		page = find_get_page(tree->mapping, index);
982 		BUG_ON(!page);
983 		set_page_writeback(page);
984 		page_cache_release(page);
985 		index++;
986 	}
987 	set_extent_writeback(tree, start, end, GFP_NOFS);
988 	return 0;
989 }
990 
991 /*
992  * find the first offset in the io tree with 'bits' set. zero is
993  * returned if we find something, and *start_ret and *end_ret are
994  * set to reflect the state struct that was found.
995  *
996  * If nothing was found, 1 is returned, < 0 on error
997  */
998 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
999 			  u64 *start_ret, u64 *end_ret, int bits)
1000 {
1001 	struct rb_node *node;
1002 	struct extent_state *state;
1003 	int ret = 1;
1004 
1005 	spin_lock(&tree->lock);
1006 	/*
1007 	 * this search will find all the extents that end after
1008 	 * our range starts.
1009 	 */
1010 	node = tree_search(tree, start);
1011 	if (!node)
1012 		goto out;
1013 
1014 	while (1) {
1015 		state = rb_entry(node, struct extent_state, rb_node);
1016 		if (state->end >= start && (state->state & bits)) {
1017 			*start_ret = state->start;
1018 			*end_ret = state->end;
1019 			ret = 0;
1020 			break;
1021 		}
1022 		node = rb_next(node);
1023 		if (!node)
1024 			break;
1025 	}
1026 out:
1027 	spin_unlock(&tree->lock);
1028 	return ret;
1029 }
1030 
1031 /* find the first state struct with 'bits' set after 'start', and
1032  * return it.  tree->lock must be held.  NULL will returned if
1033  * nothing was found after 'start'
1034  */
1035 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1036 						 u64 start, int bits)
1037 {
1038 	struct rb_node *node;
1039 	struct extent_state *state;
1040 
1041 	/*
1042 	 * this search will find all the extents that end after
1043 	 * our range starts.
1044 	 */
1045 	node = tree_search(tree, start);
1046 	if (!node)
1047 		goto out;
1048 
1049 	while (1) {
1050 		state = rb_entry(node, struct extent_state, rb_node);
1051 		if (state->end >= start && (state->state & bits))
1052 			return state;
1053 
1054 		node = rb_next(node);
1055 		if (!node)
1056 			break;
1057 	}
1058 out:
1059 	return NULL;
1060 }
1061 
1062 /*
1063  * find a contiguous range of bytes in the file marked as delalloc, not
1064  * more than 'max_bytes'.  start and end are used to return the range,
1065  *
1066  * 1 is returned if we find something, 0 if nothing was in the tree
1067  */
1068 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1069 					u64 *start, u64 *end, u64 max_bytes)
1070 {
1071 	struct rb_node *node;
1072 	struct extent_state *state;
1073 	u64 cur_start = *start;
1074 	u64 found = 0;
1075 	u64 total_bytes = 0;
1076 
1077 	spin_lock(&tree->lock);
1078 
1079 	/*
1080 	 * this search will find all the extents that end after
1081 	 * our range starts.
1082 	 */
1083 	node = tree_search(tree, cur_start);
1084 	if (!node) {
1085 		if (!found)
1086 			*end = (u64)-1;
1087 		goto out;
1088 	}
1089 
1090 	while (1) {
1091 		state = rb_entry(node, struct extent_state, rb_node);
1092 		if (found && (state->start != cur_start ||
1093 			      (state->state & EXTENT_BOUNDARY))) {
1094 			goto out;
1095 		}
1096 		if (!(state->state & EXTENT_DELALLOC)) {
1097 			if (!found)
1098 				*end = state->end;
1099 			goto out;
1100 		}
1101 		if (!found)
1102 			*start = state->start;
1103 		found++;
1104 		*end = state->end;
1105 		cur_start = state->end + 1;
1106 		node = rb_next(node);
1107 		if (!node)
1108 			break;
1109 		total_bytes += state->end - state->start + 1;
1110 		if (total_bytes >= max_bytes)
1111 			break;
1112 	}
1113 out:
1114 	spin_unlock(&tree->lock);
1115 	return found;
1116 }
1117 
1118 static noinline int __unlock_for_delalloc(struct inode *inode,
1119 					  struct page *locked_page,
1120 					  u64 start, u64 end)
1121 {
1122 	int ret;
1123 	struct page *pages[16];
1124 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1125 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1126 	unsigned long nr_pages = end_index - index + 1;
1127 	int i;
1128 
1129 	if (index == locked_page->index && end_index == index)
1130 		return 0;
1131 
1132 	while (nr_pages > 0) {
1133 		ret = find_get_pages_contig(inode->i_mapping, index,
1134 				     min_t(unsigned long, nr_pages,
1135 				     ARRAY_SIZE(pages)), pages);
1136 		for (i = 0; i < ret; i++) {
1137 			if (pages[i] != locked_page)
1138 				unlock_page(pages[i]);
1139 			page_cache_release(pages[i]);
1140 		}
1141 		nr_pages -= ret;
1142 		index += ret;
1143 		cond_resched();
1144 	}
1145 	return 0;
1146 }
1147 
1148 static noinline int lock_delalloc_pages(struct inode *inode,
1149 					struct page *locked_page,
1150 					u64 delalloc_start,
1151 					u64 delalloc_end)
1152 {
1153 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1154 	unsigned long start_index = index;
1155 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1156 	unsigned long pages_locked = 0;
1157 	struct page *pages[16];
1158 	unsigned long nrpages;
1159 	int ret;
1160 	int i;
1161 
1162 	/* the caller is responsible for locking the start index */
1163 	if (index == locked_page->index && index == end_index)
1164 		return 0;
1165 
1166 	/* skip the page at the start index */
1167 	nrpages = end_index - index + 1;
1168 	while (nrpages > 0) {
1169 		ret = find_get_pages_contig(inode->i_mapping, index,
1170 				     min_t(unsigned long,
1171 				     nrpages, ARRAY_SIZE(pages)), pages);
1172 		if (ret == 0) {
1173 			ret = -EAGAIN;
1174 			goto done;
1175 		}
1176 		/* now we have an array of pages, lock them all */
1177 		for (i = 0; i < ret; i++) {
1178 			/*
1179 			 * the caller is taking responsibility for
1180 			 * locked_page
1181 			 */
1182 			if (pages[i] != locked_page) {
1183 				lock_page(pages[i]);
1184 				if (!PageDirty(pages[i]) ||
1185 				    pages[i]->mapping != inode->i_mapping) {
1186 					ret = -EAGAIN;
1187 					unlock_page(pages[i]);
1188 					page_cache_release(pages[i]);
1189 					goto done;
1190 				}
1191 			}
1192 			page_cache_release(pages[i]);
1193 			pages_locked++;
1194 		}
1195 		nrpages -= ret;
1196 		index += ret;
1197 		cond_resched();
1198 	}
1199 	ret = 0;
1200 done:
1201 	if (ret && pages_locked) {
1202 		__unlock_for_delalloc(inode, locked_page,
1203 			      delalloc_start,
1204 			      ((u64)(start_index + pages_locked - 1)) <<
1205 			      PAGE_CACHE_SHIFT);
1206 	}
1207 	return ret;
1208 }
1209 
1210 /*
1211  * find a contiguous range of bytes in the file marked as delalloc, not
1212  * more than 'max_bytes'.  start and end are used to return the range,
1213  *
1214  * 1 is returned if we find something, 0 if nothing was in the tree
1215  */
1216 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1217 					     struct extent_io_tree *tree,
1218 					     struct page *locked_page,
1219 					     u64 *start, u64 *end,
1220 					     u64 max_bytes)
1221 {
1222 	u64 delalloc_start;
1223 	u64 delalloc_end;
1224 	u64 found;
1225 	int ret;
1226 	int loops = 0;
1227 
1228 again:
1229 	/* step one, find a bunch of delalloc bytes starting at start */
1230 	delalloc_start = *start;
1231 	delalloc_end = 0;
1232 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1233 				    max_bytes);
1234 	if (!found || delalloc_end <= *start) {
1235 		*start = delalloc_start;
1236 		*end = delalloc_end;
1237 		return found;
1238 	}
1239 
1240 	/*
1241 	 * start comes from the offset of locked_page.  We have to lock
1242 	 * pages in order, so we can't process delalloc bytes before
1243 	 * locked_page
1244 	 */
1245 	if (delalloc_start < *start)
1246 		delalloc_start = *start;
1247 
1248 	/*
1249 	 * make sure to limit the number of pages we try to lock down
1250 	 * if we're looping.
1251 	 */
1252 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1253 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1254 
1255 	/* step two, lock all the pages after the page that has start */
1256 	ret = lock_delalloc_pages(inode, locked_page,
1257 				  delalloc_start, delalloc_end);
1258 	if (ret == -EAGAIN) {
1259 		/* some of the pages are gone, lets avoid looping by
1260 		 * shortening the size of the delalloc range we're searching
1261 		 */
1262 		if (!loops) {
1263 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1264 			max_bytes = PAGE_CACHE_SIZE - offset;
1265 			loops = 1;
1266 			goto again;
1267 		} else {
1268 			found = 0;
1269 			goto out_failed;
1270 		}
1271 	}
1272 	BUG_ON(ret);
1273 
1274 	/* step three, lock the state bits for the whole range */
1275 	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1276 
1277 	/* then test to make sure it is all still delalloc */
1278 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1279 			     EXTENT_DELALLOC, 1);
1280 	if (!ret) {
1281 		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282 		__unlock_for_delalloc(inode, locked_page,
1283 			      delalloc_start, delalloc_end);
1284 		cond_resched();
1285 		goto again;
1286 	}
1287 	*start = delalloc_start;
1288 	*end = delalloc_end;
1289 out_failed:
1290 	return found;
1291 }
1292 
1293 int extent_clear_unlock_delalloc(struct inode *inode,
1294 				struct extent_io_tree *tree,
1295 				u64 start, u64 end, struct page *locked_page,
1296 				int unlock_pages,
1297 				int clear_unlock,
1298 				int clear_delalloc, int clear_dirty,
1299 				int set_writeback,
1300 				int end_writeback)
1301 {
1302 	int ret;
1303 	struct page *pages[16];
1304 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1305 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1306 	unsigned long nr_pages = end_index - index + 1;
1307 	int i;
1308 	int clear_bits = 0;
1309 
1310 	if (clear_unlock)
1311 		clear_bits |= EXTENT_LOCKED;
1312 	if (clear_dirty)
1313 		clear_bits |= EXTENT_DIRTY;
1314 
1315 	if (clear_delalloc)
1316 		clear_bits |= EXTENT_DELALLOC;
1317 
1318 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1319 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1320 		return 0;
1321 
1322 	while (nr_pages > 0) {
1323 		ret = find_get_pages_contig(inode->i_mapping, index,
1324 				     min_t(unsigned long,
1325 				     nr_pages, ARRAY_SIZE(pages)), pages);
1326 		for (i = 0; i < ret; i++) {
1327 			if (pages[i] == locked_page) {
1328 				page_cache_release(pages[i]);
1329 				continue;
1330 			}
1331 			if (clear_dirty)
1332 				clear_page_dirty_for_io(pages[i]);
1333 			if (set_writeback)
1334 				set_page_writeback(pages[i]);
1335 			if (end_writeback)
1336 				end_page_writeback(pages[i]);
1337 			if (unlock_pages)
1338 				unlock_page(pages[i]);
1339 			page_cache_release(pages[i]);
1340 		}
1341 		nr_pages -= ret;
1342 		index += ret;
1343 		cond_resched();
1344 	}
1345 	return 0;
1346 }
1347 
1348 /*
1349  * count the number of bytes in the tree that have a given bit(s)
1350  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1351  * cached.  The total number found is returned.
1352  */
1353 u64 count_range_bits(struct extent_io_tree *tree,
1354 		     u64 *start, u64 search_end, u64 max_bytes,
1355 		     unsigned long bits)
1356 {
1357 	struct rb_node *node;
1358 	struct extent_state *state;
1359 	u64 cur_start = *start;
1360 	u64 total_bytes = 0;
1361 	int found = 0;
1362 
1363 	if (search_end <= cur_start) {
1364 		WARN_ON(1);
1365 		return 0;
1366 	}
1367 
1368 	spin_lock(&tree->lock);
1369 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1370 		total_bytes = tree->dirty_bytes;
1371 		goto out;
1372 	}
1373 	/*
1374 	 * this search will find all the extents that end after
1375 	 * our range starts.
1376 	 */
1377 	node = tree_search(tree, cur_start);
1378 	if (!node)
1379 		goto out;
1380 
1381 	while (1) {
1382 		state = rb_entry(node, struct extent_state, rb_node);
1383 		if (state->start > search_end)
1384 			break;
1385 		if (state->end >= cur_start && (state->state & bits)) {
1386 			total_bytes += min(search_end, state->end) + 1 -
1387 				       max(cur_start, state->start);
1388 			if (total_bytes >= max_bytes)
1389 				break;
1390 			if (!found) {
1391 				*start = state->start;
1392 				found = 1;
1393 			}
1394 		}
1395 		node = rb_next(node);
1396 		if (!node)
1397 			break;
1398 	}
1399 out:
1400 	spin_unlock(&tree->lock);
1401 	return total_bytes;
1402 }
1403 
1404 #if 0
1405 /*
1406  * helper function to lock both pages and extents in the tree.
1407  * pages must be locked first.
1408  */
1409 static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1410 {
1411 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1412 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1413 	struct page *page;
1414 	int err;
1415 
1416 	while (index <= end_index) {
1417 		page = grab_cache_page(tree->mapping, index);
1418 		if (!page) {
1419 			err = -ENOMEM;
1420 			goto failed;
1421 		}
1422 		if (IS_ERR(page)) {
1423 			err = PTR_ERR(page);
1424 			goto failed;
1425 		}
1426 		index++;
1427 	}
1428 	lock_extent(tree, start, end, GFP_NOFS);
1429 	return 0;
1430 
1431 failed:
1432 	/*
1433 	 * we failed above in getting the page at 'index', so we undo here
1434 	 * up to but not including the page at 'index'
1435 	 */
1436 	end_index = index;
1437 	index = start >> PAGE_CACHE_SHIFT;
1438 	while (index < end_index) {
1439 		page = find_get_page(tree->mapping, index);
1440 		unlock_page(page);
1441 		page_cache_release(page);
1442 		index++;
1443 	}
1444 	return err;
1445 }
1446 
1447 /*
1448  * helper function to unlock both pages and extents in the tree.
1449  */
1450 static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1451 {
1452 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1453 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1454 	struct page *page;
1455 
1456 	while (index <= end_index) {
1457 		page = find_get_page(tree->mapping, index);
1458 		unlock_page(page);
1459 		page_cache_release(page);
1460 		index++;
1461 	}
1462 	unlock_extent(tree, start, end, GFP_NOFS);
1463 	return 0;
1464 }
1465 #endif
1466 
1467 /*
1468  * set the private field for a given byte offset in the tree.  If there isn't
1469  * an extent_state there already, this does nothing.
1470  */
1471 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1472 {
1473 	struct rb_node *node;
1474 	struct extent_state *state;
1475 	int ret = 0;
1476 
1477 	spin_lock(&tree->lock);
1478 	/*
1479 	 * this search will find all the extents that end after
1480 	 * our range starts.
1481 	 */
1482 	node = tree_search(tree, start);
1483 	if (!node) {
1484 		ret = -ENOENT;
1485 		goto out;
1486 	}
1487 	state = rb_entry(node, struct extent_state, rb_node);
1488 	if (state->start != start) {
1489 		ret = -ENOENT;
1490 		goto out;
1491 	}
1492 	state->private = private;
1493 out:
1494 	spin_unlock(&tree->lock);
1495 	return ret;
1496 }
1497 
1498 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1499 {
1500 	struct rb_node *node;
1501 	struct extent_state *state;
1502 	int ret = 0;
1503 
1504 	spin_lock(&tree->lock);
1505 	/*
1506 	 * this search will find all the extents that end after
1507 	 * our range starts.
1508 	 */
1509 	node = tree_search(tree, start);
1510 	if (!node) {
1511 		ret = -ENOENT;
1512 		goto out;
1513 	}
1514 	state = rb_entry(node, struct extent_state, rb_node);
1515 	if (state->start != start) {
1516 		ret = -ENOENT;
1517 		goto out;
1518 	}
1519 	*private = state->private;
1520 out:
1521 	spin_unlock(&tree->lock);
1522 	return ret;
1523 }
1524 
1525 /*
1526  * searches a range in the state tree for a given mask.
1527  * If 'filled' == 1, this returns 1 only if every extent in the tree
1528  * has the bits set.  Otherwise, 1 is returned if any bit in the
1529  * range is found set.
1530  */
1531 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1532 		   int bits, int filled)
1533 {
1534 	struct extent_state *state = NULL;
1535 	struct rb_node *node;
1536 	int bitset = 0;
1537 
1538 	spin_lock(&tree->lock);
1539 	node = tree_search(tree, start);
1540 	while (node && start <= end) {
1541 		state = rb_entry(node, struct extent_state, rb_node);
1542 
1543 		if (filled && state->start > start) {
1544 			bitset = 0;
1545 			break;
1546 		}
1547 
1548 		if (state->start > end)
1549 			break;
1550 
1551 		if (state->state & bits) {
1552 			bitset = 1;
1553 			if (!filled)
1554 				break;
1555 		} else if (filled) {
1556 			bitset = 0;
1557 			break;
1558 		}
1559 		start = state->end + 1;
1560 		if (start > end)
1561 			break;
1562 		node = rb_next(node);
1563 		if (!node) {
1564 			if (filled)
1565 				bitset = 0;
1566 			break;
1567 		}
1568 	}
1569 	spin_unlock(&tree->lock);
1570 	return bitset;
1571 }
1572 
1573 /*
1574  * helper function to set a given page up to date if all the
1575  * extents in the tree for that page are up to date
1576  */
1577 static int check_page_uptodate(struct extent_io_tree *tree,
1578 			       struct page *page)
1579 {
1580 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1581 	u64 end = start + PAGE_CACHE_SIZE - 1;
1582 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1583 		SetPageUptodate(page);
1584 	return 0;
1585 }
1586 
1587 /*
1588  * helper function to unlock a page if all the extents in the tree
1589  * for that page are unlocked
1590  */
1591 static int check_page_locked(struct extent_io_tree *tree,
1592 			     struct page *page)
1593 {
1594 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1595 	u64 end = start + PAGE_CACHE_SIZE - 1;
1596 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1597 		unlock_page(page);
1598 	return 0;
1599 }
1600 
1601 /*
1602  * helper function to end page writeback if all the extents
1603  * in the tree for that page are done with writeback
1604  */
1605 static int check_page_writeback(struct extent_io_tree *tree,
1606 			     struct page *page)
1607 {
1608 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1609 	u64 end = start + PAGE_CACHE_SIZE - 1;
1610 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1611 		end_page_writeback(page);
1612 	return 0;
1613 }
1614 
1615 /* lots and lots of room for performance fixes in the end_bio funcs */
1616 
1617 /*
1618  * after a writepage IO is done, we need to:
1619  * clear the uptodate bits on error
1620  * clear the writeback bits in the extent tree for this IO
1621  * end_page_writeback if the page has no more pending IO
1622  *
1623  * Scheduling is not allowed, so the extent state tree is expected
1624  * to have one and only one object corresponding to this IO.
1625  */
1626 static void end_bio_extent_writepage(struct bio *bio, int err)
1627 {
1628 	int uptodate = err == 0;
1629 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1630 	struct extent_io_tree *tree;
1631 	u64 start;
1632 	u64 end;
1633 	int whole_page;
1634 	int ret;
1635 
1636 	do {
1637 		struct page *page = bvec->bv_page;
1638 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1639 
1640 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1641 			 bvec->bv_offset;
1642 		end = start + bvec->bv_len - 1;
1643 
1644 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1645 			whole_page = 1;
1646 		else
1647 			whole_page = 0;
1648 
1649 		if (--bvec >= bio->bi_io_vec)
1650 			prefetchw(&bvec->bv_page->flags);
1651 		if (tree->ops && tree->ops->writepage_end_io_hook) {
1652 			ret = tree->ops->writepage_end_io_hook(page, start,
1653 						       end, NULL, uptodate);
1654 			if (ret)
1655 				uptodate = 0;
1656 		}
1657 
1658 		if (!uptodate && tree->ops &&
1659 		    tree->ops->writepage_io_failed_hook) {
1660 			ret = tree->ops->writepage_io_failed_hook(bio, page,
1661 							 start, end, NULL);
1662 			if (ret == 0) {
1663 				uptodate = (err == 0);
1664 				continue;
1665 			}
1666 		}
1667 
1668 		if (!uptodate) {
1669 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1670 			ClearPageUptodate(page);
1671 			SetPageError(page);
1672 		}
1673 
1674 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1675 
1676 		if (whole_page)
1677 			end_page_writeback(page);
1678 		else
1679 			check_page_writeback(tree, page);
1680 	} while (bvec >= bio->bi_io_vec);
1681 
1682 	bio_put(bio);
1683 }
1684 
1685 /*
1686  * after a readpage IO is done, we need to:
1687  * clear the uptodate bits on error
1688  * set the uptodate bits if things worked
1689  * set the page up to date if all extents in the tree are uptodate
1690  * clear the lock bit in the extent tree
1691  * unlock the page if there are no other extents locked for it
1692  *
1693  * Scheduling is not allowed, so the extent state tree is expected
1694  * to have one and only one object corresponding to this IO.
1695  */
1696 static void end_bio_extent_readpage(struct bio *bio, int err)
1697 {
1698 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1699 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1700 	struct extent_io_tree *tree;
1701 	u64 start;
1702 	u64 end;
1703 	int whole_page;
1704 	int ret;
1705 
1706 	if (err)
1707 		uptodate = 0;
1708 
1709 	do {
1710 		struct page *page = bvec->bv_page;
1711 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1712 
1713 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1714 			bvec->bv_offset;
1715 		end = start + bvec->bv_len - 1;
1716 
1717 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1718 			whole_page = 1;
1719 		else
1720 			whole_page = 0;
1721 
1722 		if (--bvec >= bio->bi_io_vec)
1723 			prefetchw(&bvec->bv_page->flags);
1724 
1725 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1726 			ret = tree->ops->readpage_end_io_hook(page, start, end,
1727 							      NULL);
1728 			if (ret)
1729 				uptodate = 0;
1730 		}
1731 		if (!uptodate && tree->ops &&
1732 		    tree->ops->readpage_io_failed_hook) {
1733 			ret = tree->ops->readpage_io_failed_hook(bio, page,
1734 							 start, end, NULL);
1735 			if (ret == 0) {
1736 				uptodate =
1737 					test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 				if (err)
1739 					uptodate = 0;
1740 				continue;
1741 			}
1742 		}
1743 
1744 		if (uptodate) {
1745 			set_extent_uptodate(tree, start, end,
1746 					    GFP_ATOMIC);
1747 		}
1748 		unlock_extent(tree, start, end, GFP_ATOMIC);
1749 
1750 		if (whole_page) {
1751 			if (uptodate) {
1752 				SetPageUptodate(page);
1753 			} else {
1754 				ClearPageUptodate(page);
1755 				SetPageError(page);
1756 			}
1757 			unlock_page(page);
1758 		} else {
1759 			if (uptodate) {
1760 				check_page_uptodate(tree, page);
1761 			} else {
1762 				ClearPageUptodate(page);
1763 				SetPageError(page);
1764 			}
1765 			check_page_locked(tree, page);
1766 		}
1767 	} while (bvec >= bio->bi_io_vec);
1768 
1769 	bio_put(bio);
1770 }
1771 
1772 /*
1773  * IO done from prepare_write is pretty simple, we just unlock
1774  * the structs in the extent tree when done, and set the uptodate bits
1775  * as appropriate.
1776  */
1777 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1778 {
1779 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1780 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1781 	struct extent_io_tree *tree;
1782 	u64 start;
1783 	u64 end;
1784 
1785 	do {
1786 		struct page *page = bvec->bv_page;
1787 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1788 
1789 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1790 			bvec->bv_offset;
1791 		end = start + bvec->bv_len - 1;
1792 
1793 		if (--bvec >= bio->bi_io_vec)
1794 			prefetchw(&bvec->bv_page->flags);
1795 
1796 		if (uptodate) {
1797 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1798 		} else {
1799 			ClearPageUptodate(page);
1800 			SetPageError(page);
1801 		}
1802 
1803 		unlock_extent(tree, start, end, GFP_ATOMIC);
1804 
1805 	} while (bvec >= bio->bi_io_vec);
1806 
1807 	bio_put(bio);
1808 }
1809 
1810 static struct bio *
1811 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1812 		 gfp_t gfp_flags)
1813 {
1814 	struct bio *bio;
1815 
1816 	bio = bio_alloc(gfp_flags, nr_vecs);
1817 
1818 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1819 		while (!bio && (nr_vecs /= 2))
1820 			bio = bio_alloc(gfp_flags, nr_vecs);
1821 	}
1822 
1823 	if (bio) {
1824 		bio->bi_size = 0;
1825 		bio->bi_bdev = bdev;
1826 		bio->bi_sector = first_sector;
1827 	}
1828 	return bio;
1829 }
1830 
1831 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1832 			  unsigned long bio_flags)
1833 {
1834 	int ret = 0;
1835 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1836 	struct page *page = bvec->bv_page;
1837 	struct extent_io_tree *tree = bio->bi_private;
1838 	u64 start;
1839 	u64 end;
1840 
1841 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1842 	end = start + bvec->bv_len - 1;
1843 
1844 	bio->bi_private = NULL;
1845 
1846 	bio_get(bio);
1847 
1848 	if (tree->ops && tree->ops->submit_bio_hook)
1849 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1850 					   mirror_num, bio_flags);
1851 	else
1852 		submit_bio(rw, bio);
1853 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1854 		ret = -EOPNOTSUPP;
1855 	bio_put(bio);
1856 	return ret;
1857 }
1858 
1859 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1860 			      struct page *page, sector_t sector,
1861 			      size_t size, unsigned long offset,
1862 			      struct block_device *bdev,
1863 			      struct bio **bio_ret,
1864 			      unsigned long max_pages,
1865 			      bio_end_io_t end_io_func,
1866 			      int mirror_num,
1867 			      unsigned long prev_bio_flags,
1868 			      unsigned long bio_flags)
1869 {
1870 	int ret = 0;
1871 	struct bio *bio;
1872 	int nr;
1873 	int contig = 0;
1874 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1875 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1876 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1877 
1878 	if (bio_ret && *bio_ret) {
1879 		bio = *bio_ret;
1880 		if (old_compressed)
1881 			contig = bio->bi_sector == sector;
1882 		else
1883 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1884 				sector;
1885 
1886 		if (prev_bio_flags != bio_flags || !contig ||
1887 		    (tree->ops && tree->ops->merge_bio_hook &&
1888 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1889 					       bio_flags)) ||
1890 		    bio_add_page(bio, page, page_size, offset) < page_size) {
1891 			ret = submit_one_bio(rw, bio, mirror_num,
1892 					     prev_bio_flags);
1893 			bio = NULL;
1894 		} else {
1895 			return 0;
1896 		}
1897 	}
1898 	if (this_compressed)
1899 		nr = BIO_MAX_PAGES;
1900 	else
1901 		nr = bio_get_nr_vecs(bdev);
1902 
1903 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1904 
1905 	bio_add_page(bio, page, page_size, offset);
1906 	bio->bi_end_io = end_io_func;
1907 	bio->bi_private = tree;
1908 
1909 	if (bio_ret)
1910 		*bio_ret = bio;
1911 	else
1912 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1913 
1914 	return ret;
1915 }
1916 
1917 void set_page_extent_mapped(struct page *page)
1918 {
1919 	if (!PagePrivate(page)) {
1920 		SetPagePrivate(page);
1921 		page_cache_get(page);
1922 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1923 	}
1924 }
1925 
1926 static void set_page_extent_head(struct page *page, unsigned long len)
1927 {
1928 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1929 }
1930 
1931 /*
1932  * basic readpage implementation.  Locked extent state structs are inserted
1933  * into the tree that are removed when the IO is done (by the end_io
1934  * handlers)
1935  */
1936 static int __extent_read_full_page(struct extent_io_tree *tree,
1937 				   struct page *page,
1938 				   get_extent_t *get_extent,
1939 				   struct bio **bio, int mirror_num,
1940 				   unsigned long *bio_flags)
1941 {
1942 	struct inode *inode = page->mapping->host;
1943 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1944 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1945 	u64 end;
1946 	u64 cur = start;
1947 	u64 extent_offset;
1948 	u64 last_byte = i_size_read(inode);
1949 	u64 block_start;
1950 	u64 cur_end;
1951 	sector_t sector;
1952 	struct extent_map *em;
1953 	struct block_device *bdev;
1954 	int ret;
1955 	int nr = 0;
1956 	size_t page_offset = 0;
1957 	size_t iosize;
1958 	size_t disk_io_size;
1959 	size_t blocksize = inode->i_sb->s_blocksize;
1960 	unsigned long this_bio_flag = 0;
1961 
1962 	set_page_extent_mapped(page);
1963 
1964 	end = page_end;
1965 	lock_extent(tree, start, end, GFP_NOFS);
1966 
1967 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1968 		char *userpage;
1969 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1970 
1971 		if (zero_offset) {
1972 			iosize = PAGE_CACHE_SIZE - zero_offset;
1973 			userpage = kmap_atomic(page, KM_USER0);
1974 			memset(userpage + zero_offset, 0, iosize);
1975 			flush_dcache_page(page);
1976 			kunmap_atomic(userpage, KM_USER0);
1977 		}
1978 	}
1979 	while (cur <= end) {
1980 		if (cur >= last_byte) {
1981 			char *userpage;
1982 			iosize = PAGE_CACHE_SIZE - page_offset;
1983 			userpage = kmap_atomic(page, KM_USER0);
1984 			memset(userpage + page_offset, 0, iosize);
1985 			flush_dcache_page(page);
1986 			kunmap_atomic(userpage, KM_USER0);
1987 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1988 					    GFP_NOFS);
1989 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1990 			break;
1991 		}
1992 		em = get_extent(inode, page, page_offset, cur,
1993 				end - cur + 1, 0);
1994 		if (IS_ERR(em) || !em) {
1995 			SetPageError(page);
1996 			unlock_extent(tree, cur, end, GFP_NOFS);
1997 			break;
1998 		}
1999 		extent_offset = cur - em->start;
2000 		BUG_ON(extent_map_end(em) <= cur);
2001 		BUG_ON(end < cur);
2002 
2003 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2004 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2005 
2006 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2007 		cur_end = min(extent_map_end(em) - 1, end);
2008 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2009 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2010 			disk_io_size = em->block_len;
2011 			sector = em->block_start >> 9;
2012 		} else {
2013 			sector = (em->block_start + extent_offset) >> 9;
2014 			disk_io_size = iosize;
2015 		}
2016 		bdev = em->bdev;
2017 		block_start = em->block_start;
2018 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2019 			block_start = EXTENT_MAP_HOLE;
2020 		free_extent_map(em);
2021 		em = NULL;
2022 
2023 		/* we've found a hole, just zero and go on */
2024 		if (block_start == EXTENT_MAP_HOLE) {
2025 			char *userpage;
2026 			userpage = kmap_atomic(page, KM_USER0);
2027 			memset(userpage + page_offset, 0, iosize);
2028 			flush_dcache_page(page);
2029 			kunmap_atomic(userpage, KM_USER0);
2030 
2031 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2032 					    GFP_NOFS);
2033 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2034 			cur = cur + iosize;
2035 			page_offset += iosize;
2036 			continue;
2037 		}
2038 		/* the get_extent function already copied into the page */
2039 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2040 			check_page_uptodate(tree, page);
2041 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2042 			cur = cur + iosize;
2043 			page_offset += iosize;
2044 			continue;
2045 		}
2046 		/* we have an inline extent but it didn't get marked up
2047 		 * to date.  Error out
2048 		 */
2049 		if (block_start == EXTENT_MAP_INLINE) {
2050 			SetPageError(page);
2051 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2052 			cur = cur + iosize;
2053 			page_offset += iosize;
2054 			continue;
2055 		}
2056 
2057 		ret = 0;
2058 		if (tree->ops && tree->ops->readpage_io_hook) {
2059 			ret = tree->ops->readpage_io_hook(page, cur,
2060 							  cur + iosize - 1);
2061 		}
2062 		if (!ret) {
2063 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2064 			pnr -= page->index;
2065 			ret = submit_extent_page(READ, tree, page,
2066 					 sector, disk_io_size, page_offset,
2067 					 bdev, bio, pnr,
2068 					 end_bio_extent_readpage, mirror_num,
2069 					 *bio_flags,
2070 					 this_bio_flag);
2071 			nr++;
2072 			*bio_flags = this_bio_flag;
2073 		}
2074 		if (ret)
2075 			SetPageError(page);
2076 		cur = cur + iosize;
2077 		page_offset += iosize;
2078 	}
2079 	if (!nr) {
2080 		if (!PageError(page))
2081 			SetPageUptodate(page);
2082 		unlock_page(page);
2083 	}
2084 	return 0;
2085 }
2086 
2087 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2088 			    get_extent_t *get_extent)
2089 {
2090 	struct bio *bio = NULL;
2091 	unsigned long bio_flags = 0;
2092 	int ret;
2093 
2094 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2095 				      &bio_flags);
2096 	if (bio)
2097 		submit_one_bio(READ, bio, 0, bio_flags);
2098 	return ret;
2099 }
2100 
2101 static noinline void update_nr_written(struct page *page,
2102 				      struct writeback_control *wbc,
2103 				      unsigned long nr_written)
2104 {
2105 	wbc->nr_to_write -= nr_written;
2106 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2107 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2108 		page->mapping->writeback_index = page->index + nr_written;
2109 }
2110 
2111 /*
2112  * the writepage semantics are similar to regular writepage.  extent
2113  * records are inserted to lock ranges in the tree, and as dirty areas
2114  * are found, they are marked writeback.  Then the lock bits are removed
2115  * and the end_io handler clears the writeback ranges
2116  */
2117 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2118 			      void *data)
2119 {
2120 	struct inode *inode = page->mapping->host;
2121 	struct extent_page_data *epd = data;
2122 	struct extent_io_tree *tree = epd->tree;
2123 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2124 	u64 delalloc_start;
2125 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2126 	u64 end;
2127 	u64 cur = start;
2128 	u64 extent_offset;
2129 	u64 last_byte = i_size_read(inode);
2130 	u64 block_start;
2131 	u64 iosize;
2132 	u64 unlock_start;
2133 	sector_t sector;
2134 	struct extent_map *em;
2135 	struct block_device *bdev;
2136 	int ret;
2137 	int nr = 0;
2138 	size_t pg_offset = 0;
2139 	size_t blocksize;
2140 	loff_t i_size = i_size_read(inode);
2141 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2142 	u64 nr_delalloc;
2143 	u64 delalloc_end;
2144 	int page_started;
2145 	int compressed;
2146 	int write_flags;
2147 	unsigned long nr_written = 0;
2148 
2149 	if (wbc->sync_mode == WB_SYNC_ALL)
2150 		write_flags = WRITE_SYNC_PLUG;
2151 	else
2152 		write_flags = WRITE;
2153 
2154 	WARN_ON(!PageLocked(page));
2155 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2156 	if (page->index > end_index ||
2157 	   (page->index == end_index && !pg_offset)) {
2158 		page->mapping->a_ops->invalidatepage(page, 0);
2159 		unlock_page(page);
2160 		return 0;
2161 	}
2162 
2163 	if (page->index == end_index) {
2164 		char *userpage;
2165 
2166 		userpage = kmap_atomic(page, KM_USER0);
2167 		memset(userpage + pg_offset, 0,
2168 		       PAGE_CACHE_SIZE - pg_offset);
2169 		kunmap_atomic(userpage, KM_USER0);
2170 		flush_dcache_page(page);
2171 	}
2172 	pg_offset = 0;
2173 
2174 	set_page_extent_mapped(page);
2175 
2176 	delalloc_start = start;
2177 	delalloc_end = 0;
2178 	page_started = 0;
2179 	if (!epd->extent_locked) {
2180 		/*
2181 		 * make sure the wbc mapping index is at least updated
2182 		 * to this page.
2183 		 */
2184 		update_nr_written(page, wbc, 0);
2185 
2186 		while (delalloc_end < page_end) {
2187 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2188 						       page,
2189 						       &delalloc_start,
2190 						       &delalloc_end,
2191 						       128 * 1024 * 1024);
2192 			if (nr_delalloc == 0) {
2193 				delalloc_start = delalloc_end + 1;
2194 				continue;
2195 			}
2196 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2197 						 delalloc_end, &page_started,
2198 						 &nr_written);
2199 			delalloc_start = delalloc_end + 1;
2200 		}
2201 
2202 		/* did the fill delalloc function already unlock and start
2203 		 * the IO?
2204 		 */
2205 		if (page_started) {
2206 			ret = 0;
2207 			/*
2208 			 * we've unlocked the page, so we can't update
2209 			 * the mapping's writeback index, just update
2210 			 * nr_to_write.
2211 			 */
2212 			wbc->nr_to_write -= nr_written;
2213 			goto done_unlocked;
2214 		}
2215 	}
2216 	lock_extent(tree, start, page_end, GFP_NOFS);
2217 
2218 	unlock_start = start;
2219 
2220 	if (tree->ops && tree->ops->writepage_start_hook) {
2221 		ret = tree->ops->writepage_start_hook(page, start,
2222 						      page_end);
2223 		if (ret == -EAGAIN) {
2224 			unlock_extent(tree, start, page_end, GFP_NOFS);
2225 			redirty_page_for_writepage(wbc, page);
2226 			update_nr_written(page, wbc, nr_written);
2227 			unlock_page(page);
2228 			ret = 0;
2229 			goto done_unlocked;
2230 		}
2231 	}
2232 
2233 	/*
2234 	 * we don't want to touch the inode after unlocking the page,
2235 	 * so we update the mapping writeback index now
2236 	 */
2237 	update_nr_written(page, wbc, nr_written + 1);
2238 
2239 	end = page_end;
2240 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2241 		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2242 
2243 	if (last_byte <= start) {
2244 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2245 		unlock_extent(tree, start, page_end, GFP_NOFS);
2246 		if (tree->ops && tree->ops->writepage_end_io_hook)
2247 			tree->ops->writepage_end_io_hook(page, start,
2248 							 page_end, NULL, 1);
2249 		unlock_start = page_end + 1;
2250 		goto done;
2251 	}
2252 
2253 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2254 	blocksize = inode->i_sb->s_blocksize;
2255 
2256 	while (cur <= end) {
2257 		if (cur >= last_byte) {
2258 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2259 			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2260 			if (tree->ops && tree->ops->writepage_end_io_hook)
2261 				tree->ops->writepage_end_io_hook(page, cur,
2262 							 page_end, NULL, 1);
2263 			unlock_start = page_end + 1;
2264 			break;
2265 		}
2266 		em = epd->get_extent(inode, page, pg_offset, cur,
2267 				     end - cur + 1, 1);
2268 		if (IS_ERR(em) || !em) {
2269 			SetPageError(page);
2270 			break;
2271 		}
2272 
2273 		extent_offset = cur - em->start;
2274 		BUG_ON(extent_map_end(em) <= cur);
2275 		BUG_ON(end < cur);
2276 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2277 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2278 		sector = (em->block_start + extent_offset) >> 9;
2279 		bdev = em->bdev;
2280 		block_start = em->block_start;
2281 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2282 		free_extent_map(em);
2283 		em = NULL;
2284 
2285 		/*
2286 		 * compressed and inline extents are written through other
2287 		 * paths in the FS
2288 		 */
2289 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2290 		    block_start == EXTENT_MAP_INLINE) {
2291 			clear_extent_dirty(tree, cur,
2292 					   cur + iosize - 1, GFP_NOFS);
2293 
2294 			unlock_extent(tree, unlock_start, cur + iosize - 1,
2295 				      GFP_NOFS);
2296 
2297 			/*
2298 			 * end_io notification does not happen here for
2299 			 * compressed extents
2300 			 */
2301 			if (!compressed && tree->ops &&
2302 			    tree->ops->writepage_end_io_hook)
2303 				tree->ops->writepage_end_io_hook(page, cur,
2304 							 cur + iosize - 1,
2305 							 NULL, 1);
2306 			else if (compressed) {
2307 				/* we don't want to end_page_writeback on
2308 				 * a compressed extent.  this happens
2309 				 * elsewhere
2310 				 */
2311 				nr++;
2312 			}
2313 
2314 			cur += iosize;
2315 			pg_offset += iosize;
2316 			unlock_start = cur;
2317 			continue;
2318 		}
2319 		/* leave this out until we have a page_mkwrite call */
2320 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2321 				   EXTENT_DIRTY, 0)) {
2322 			cur = cur + iosize;
2323 			pg_offset += iosize;
2324 			continue;
2325 		}
2326 
2327 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2328 		if (tree->ops && tree->ops->writepage_io_hook) {
2329 			ret = tree->ops->writepage_io_hook(page, cur,
2330 						cur + iosize - 1);
2331 		} else {
2332 			ret = 0;
2333 		}
2334 		if (ret) {
2335 			SetPageError(page);
2336 		} else {
2337 			unsigned long max_nr = end_index + 1;
2338 
2339 			set_range_writeback(tree, cur, cur + iosize - 1);
2340 			if (!PageWriteback(page)) {
2341 				printk(KERN_ERR "btrfs warning page %lu not "
2342 				       "writeback, cur %llu end %llu\n",
2343 				       page->index, (unsigned long long)cur,
2344 				       (unsigned long long)end);
2345 			}
2346 
2347 			ret = submit_extent_page(write_flags, tree, page,
2348 						 sector, iosize, pg_offset,
2349 						 bdev, &epd->bio, max_nr,
2350 						 end_bio_extent_writepage,
2351 						 0, 0, 0);
2352 			if (ret)
2353 				SetPageError(page);
2354 		}
2355 		cur = cur + iosize;
2356 		pg_offset += iosize;
2357 		nr++;
2358 	}
2359 done:
2360 	if (nr == 0) {
2361 		/* make sure the mapping tag for page dirty gets cleared */
2362 		set_page_writeback(page);
2363 		end_page_writeback(page);
2364 	}
2365 	if (unlock_start <= page_end)
2366 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2367 	unlock_page(page);
2368 
2369 done_unlocked:
2370 
2371 	return 0;
2372 }
2373 
2374 /**
2375  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2376  * @mapping: address space structure to write
2377  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2378  * @writepage: function called for each page
2379  * @data: data passed to writepage function
2380  *
2381  * If a page is already under I/O, write_cache_pages() skips it, even
2382  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2383  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2384  * and msync() need to guarantee that all the data which was dirty at the time
2385  * the call was made get new I/O started against them.  If wbc->sync_mode is
2386  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2387  * existing IO to complete.
2388  */
2389 static int extent_write_cache_pages(struct extent_io_tree *tree,
2390 			     struct address_space *mapping,
2391 			     struct writeback_control *wbc,
2392 			     writepage_t writepage, void *data,
2393 			     void (*flush_fn)(void *))
2394 {
2395 	struct backing_dev_info *bdi = mapping->backing_dev_info;
2396 	int ret = 0;
2397 	int done = 0;
2398 	struct pagevec pvec;
2399 	int nr_pages;
2400 	pgoff_t index;
2401 	pgoff_t end;		/* Inclusive */
2402 	int scanned = 0;
2403 	int range_whole = 0;
2404 
2405 	pagevec_init(&pvec, 0);
2406 	if (wbc->range_cyclic) {
2407 		index = mapping->writeback_index; /* Start from prev offset */
2408 		end = -1;
2409 	} else {
2410 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2411 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2412 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2413 			range_whole = 1;
2414 		scanned = 1;
2415 	}
2416 retry:
2417 	while (!done && (index <= end) &&
2418 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2419 			      PAGECACHE_TAG_DIRTY, min(end - index,
2420 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2421 		unsigned i;
2422 
2423 		scanned = 1;
2424 		for (i = 0; i < nr_pages; i++) {
2425 			struct page *page = pvec.pages[i];
2426 
2427 			/*
2428 			 * At this point we hold neither mapping->tree_lock nor
2429 			 * lock on the page itself: the page may be truncated or
2430 			 * invalidated (changing page->mapping to NULL), or even
2431 			 * swizzled back from swapper_space to tmpfs file
2432 			 * mapping
2433 			 */
2434 			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2435 				tree->ops->write_cache_pages_lock_hook(page);
2436 			else
2437 				lock_page(page);
2438 
2439 			if (unlikely(page->mapping != mapping)) {
2440 				unlock_page(page);
2441 				continue;
2442 			}
2443 
2444 			if (!wbc->range_cyclic && page->index > end) {
2445 				done = 1;
2446 				unlock_page(page);
2447 				continue;
2448 			}
2449 
2450 			if (wbc->sync_mode != WB_SYNC_NONE) {
2451 				if (PageWriteback(page))
2452 					flush_fn(data);
2453 				wait_on_page_writeback(page);
2454 			}
2455 
2456 			if (PageWriteback(page) ||
2457 			    !clear_page_dirty_for_io(page)) {
2458 				unlock_page(page);
2459 				continue;
2460 			}
2461 
2462 			ret = (*writepage)(page, wbc, data);
2463 
2464 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2465 				unlock_page(page);
2466 				ret = 0;
2467 			}
2468 			if (ret || wbc->nr_to_write <= 0)
2469 				done = 1;
2470 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
2471 				wbc->encountered_congestion = 1;
2472 				done = 1;
2473 			}
2474 		}
2475 		pagevec_release(&pvec);
2476 		cond_resched();
2477 	}
2478 	if (!scanned && !done) {
2479 		/*
2480 		 * We hit the last page and there is more work to be done: wrap
2481 		 * back to the start of the file
2482 		 */
2483 		scanned = 1;
2484 		index = 0;
2485 		goto retry;
2486 	}
2487 	return ret;
2488 }
2489 
2490 static void flush_epd_write_bio(struct extent_page_data *epd)
2491 {
2492 	if (epd->bio) {
2493 		if (epd->sync_io)
2494 			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2495 		else
2496 			submit_one_bio(WRITE, epd->bio, 0, 0);
2497 		epd->bio = NULL;
2498 	}
2499 }
2500 
2501 static noinline void flush_write_bio(void *data)
2502 {
2503 	struct extent_page_data *epd = data;
2504 	flush_epd_write_bio(epd);
2505 }
2506 
2507 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2508 			  get_extent_t *get_extent,
2509 			  struct writeback_control *wbc)
2510 {
2511 	int ret;
2512 	struct address_space *mapping = page->mapping;
2513 	struct extent_page_data epd = {
2514 		.bio = NULL,
2515 		.tree = tree,
2516 		.get_extent = get_extent,
2517 		.extent_locked = 0,
2518 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2519 	};
2520 	struct writeback_control wbc_writepages = {
2521 		.bdi		= wbc->bdi,
2522 		.sync_mode	= wbc->sync_mode,
2523 		.older_than_this = NULL,
2524 		.nr_to_write	= 64,
2525 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2526 		.range_end	= (loff_t)-1,
2527 	};
2528 
2529 	ret = __extent_writepage(page, wbc, &epd);
2530 
2531 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2532 				 __extent_writepage, &epd, flush_write_bio);
2533 	flush_epd_write_bio(&epd);
2534 	return ret;
2535 }
2536 
2537 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2538 			      u64 start, u64 end, get_extent_t *get_extent,
2539 			      int mode)
2540 {
2541 	int ret = 0;
2542 	struct address_space *mapping = inode->i_mapping;
2543 	struct page *page;
2544 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2545 		PAGE_CACHE_SHIFT;
2546 
2547 	struct extent_page_data epd = {
2548 		.bio = NULL,
2549 		.tree = tree,
2550 		.get_extent = get_extent,
2551 		.extent_locked = 1,
2552 		.sync_io = mode == WB_SYNC_ALL,
2553 	};
2554 	struct writeback_control wbc_writepages = {
2555 		.bdi		= inode->i_mapping->backing_dev_info,
2556 		.sync_mode	= mode,
2557 		.older_than_this = NULL,
2558 		.nr_to_write	= nr_pages * 2,
2559 		.range_start	= start,
2560 		.range_end	= end + 1,
2561 	};
2562 
2563 	while (start <= end) {
2564 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2565 		if (clear_page_dirty_for_io(page))
2566 			ret = __extent_writepage(page, &wbc_writepages, &epd);
2567 		else {
2568 			if (tree->ops && tree->ops->writepage_end_io_hook)
2569 				tree->ops->writepage_end_io_hook(page, start,
2570 						 start + PAGE_CACHE_SIZE - 1,
2571 						 NULL, 1);
2572 			unlock_page(page);
2573 		}
2574 		page_cache_release(page);
2575 		start += PAGE_CACHE_SIZE;
2576 	}
2577 
2578 	flush_epd_write_bio(&epd);
2579 	return ret;
2580 }
2581 
2582 int extent_writepages(struct extent_io_tree *tree,
2583 		      struct address_space *mapping,
2584 		      get_extent_t *get_extent,
2585 		      struct writeback_control *wbc)
2586 {
2587 	int ret = 0;
2588 	struct extent_page_data epd = {
2589 		.bio = NULL,
2590 		.tree = tree,
2591 		.get_extent = get_extent,
2592 		.extent_locked = 0,
2593 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2594 	};
2595 
2596 	ret = extent_write_cache_pages(tree, mapping, wbc,
2597 				       __extent_writepage, &epd,
2598 				       flush_write_bio);
2599 	flush_epd_write_bio(&epd);
2600 	return ret;
2601 }
2602 
2603 int extent_readpages(struct extent_io_tree *tree,
2604 		     struct address_space *mapping,
2605 		     struct list_head *pages, unsigned nr_pages,
2606 		     get_extent_t get_extent)
2607 {
2608 	struct bio *bio = NULL;
2609 	unsigned page_idx;
2610 	struct pagevec pvec;
2611 	unsigned long bio_flags = 0;
2612 
2613 	pagevec_init(&pvec, 0);
2614 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2615 		struct page *page = list_entry(pages->prev, struct page, lru);
2616 
2617 		prefetchw(&page->flags);
2618 		list_del(&page->lru);
2619 		/*
2620 		 * what we want to do here is call add_to_page_cache_lru,
2621 		 * but that isn't exported, so we reproduce it here
2622 		 */
2623 		if (!add_to_page_cache(page, mapping,
2624 					page->index, GFP_KERNEL)) {
2625 
2626 			/* open coding of lru_cache_add, also not exported */
2627 			page_cache_get(page);
2628 			if (!pagevec_add(&pvec, page))
2629 				__pagevec_lru_add_file(&pvec);
2630 			__extent_read_full_page(tree, page, get_extent,
2631 						&bio, 0, &bio_flags);
2632 		}
2633 		page_cache_release(page);
2634 	}
2635 	if (pagevec_count(&pvec))
2636 		__pagevec_lru_add_file(&pvec);
2637 	BUG_ON(!list_empty(pages));
2638 	if (bio)
2639 		submit_one_bio(READ, bio, 0, bio_flags);
2640 	return 0;
2641 }
2642 
2643 /*
2644  * basic invalidatepage code, this waits on any locked or writeback
2645  * ranges corresponding to the page, and then deletes any extent state
2646  * records from the tree
2647  */
2648 int extent_invalidatepage(struct extent_io_tree *tree,
2649 			  struct page *page, unsigned long offset)
2650 {
2651 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2652 	u64 end = start + PAGE_CACHE_SIZE - 1;
2653 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2654 
2655 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2656 	if (start > end)
2657 		return 0;
2658 
2659 	lock_extent(tree, start, end, GFP_NOFS);
2660 	wait_on_extent_writeback(tree, start, end);
2661 	clear_extent_bit(tree, start, end,
2662 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2663 			 1, 1, GFP_NOFS);
2664 	return 0;
2665 }
2666 
2667 /*
2668  * simple commit_write call, set_range_dirty is used to mark both
2669  * the pages and the extent records as dirty
2670  */
2671 int extent_commit_write(struct extent_io_tree *tree,
2672 			struct inode *inode, struct page *page,
2673 			unsigned from, unsigned to)
2674 {
2675 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2676 
2677 	set_page_extent_mapped(page);
2678 	set_page_dirty(page);
2679 
2680 	if (pos > inode->i_size) {
2681 		i_size_write(inode, pos);
2682 		mark_inode_dirty(inode);
2683 	}
2684 	return 0;
2685 }
2686 
2687 int extent_prepare_write(struct extent_io_tree *tree,
2688 			 struct inode *inode, struct page *page,
2689 			 unsigned from, unsigned to, get_extent_t *get_extent)
2690 {
2691 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2692 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2693 	u64 block_start;
2694 	u64 orig_block_start;
2695 	u64 block_end;
2696 	u64 cur_end;
2697 	struct extent_map *em;
2698 	unsigned blocksize = 1 << inode->i_blkbits;
2699 	size_t page_offset = 0;
2700 	size_t block_off_start;
2701 	size_t block_off_end;
2702 	int err = 0;
2703 	int iocount = 0;
2704 	int ret = 0;
2705 	int isnew;
2706 
2707 	set_page_extent_mapped(page);
2708 
2709 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2710 	block_end = (page_start + to - 1) | (blocksize - 1);
2711 	orig_block_start = block_start;
2712 
2713 	lock_extent(tree, page_start, page_end, GFP_NOFS);
2714 	while (block_start <= block_end) {
2715 		em = get_extent(inode, page, page_offset, block_start,
2716 				block_end - block_start + 1, 1);
2717 		if (IS_ERR(em) || !em)
2718 			goto err;
2719 
2720 		cur_end = min(block_end, extent_map_end(em) - 1);
2721 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2722 		block_off_end = block_off_start + blocksize;
2723 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2724 
2725 		if (!PageUptodate(page) && isnew &&
2726 		    (block_off_end > to || block_off_start < from)) {
2727 			void *kaddr;
2728 
2729 			kaddr = kmap_atomic(page, KM_USER0);
2730 			if (block_off_end > to)
2731 				memset(kaddr + to, 0, block_off_end - to);
2732 			if (block_off_start < from)
2733 				memset(kaddr + block_off_start, 0,
2734 				       from - block_off_start);
2735 			flush_dcache_page(page);
2736 			kunmap_atomic(kaddr, KM_USER0);
2737 		}
2738 		if ((em->block_start != EXTENT_MAP_HOLE &&
2739 		     em->block_start != EXTENT_MAP_INLINE) &&
2740 		    !isnew && !PageUptodate(page) &&
2741 		    (block_off_end > to || block_off_start < from) &&
2742 		    !test_range_bit(tree, block_start, cur_end,
2743 				    EXTENT_UPTODATE, 1)) {
2744 			u64 sector;
2745 			u64 extent_offset = block_start - em->start;
2746 			size_t iosize;
2747 			sector = (em->block_start + extent_offset) >> 9;
2748 			iosize = (cur_end - block_start + blocksize) &
2749 				~((u64)blocksize - 1);
2750 			/*
2751 			 * we've already got the extent locked, but we
2752 			 * need to split the state such that our end_bio
2753 			 * handler can clear the lock.
2754 			 */
2755 			set_extent_bit(tree, block_start,
2756 				       block_start + iosize - 1,
2757 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2758 			ret = submit_extent_page(READ, tree, page,
2759 					 sector, iosize, page_offset, em->bdev,
2760 					 NULL, 1,
2761 					 end_bio_extent_preparewrite, 0,
2762 					 0, 0);
2763 			iocount++;
2764 			block_start = block_start + iosize;
2765 		} else {
2766 			set_extent_uptodate(tree, block_start, cur_end,
2767 					    GFP_NOFS);
2768 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2769 			block_start = cur_end + 1;
2770 		}
2771 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2772 		free_extent_map(em);
2773 	}
2774 	if (iocount) {
2775 		wait_extent_bit(tree, orig_block_start,
2776 				block_end, EXTENT_LOCKED);
2777 	}
2778 	check_page_uptodate(tree, page);
2779 err:
2780 	/* FIXME, zero out newly allocated blocks on error */
2781 	return err;
2782 }
2783 
2784 /*
2785  * a helper for releasepage, this tests for areas of the page that
2786  * are locked or under IO and drops the related state bits if it is safe
2787  * to drop the page.
2788  */
2789 int try_release_extent_state(struct extent_map_tree *map,
2790 			     struct extent_io_tree *tree, struct page *page,
2791 			     gfp_t mask)
2792 {
2793 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 	u64 end = start + PAGE_CACHE_SIZE - 1;
2795 	int ret = 1;
2796 
2797 	if (test_range_bit(tree, start, end,
2798 			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
2799 		ret = 0;
2800 	else {
2801 		if ((mask & GFP_NOFS) == GFP_NOFS)
2802 			mask = GFP_NOFS;
2803 		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2804 				 1, 1, mask);
2805 	}
2806 	return ret;
2807 }
2808 
2809 /*
2810  * a helper for releasepage.  As long as there are no locked extents
2811  * in the range corresponding to the page, both state records and extent
2812  * map records are removed
2813  */
2814 int try_release_extent_mapping(struct extent_map_tree *map,
2815 			       struct extent_io_tree *tree, struct page *page,
2816 			       gfp_t mask)
2817 {
2818 	struct extent_map *em;
2819 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2820 	u64 end = start + PAGE_CACHE_SIZE - 1;
2821 
2822 	if ((mask & __GFP_WAIT) &&
2823 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2824 		u64 len;
2825 		while (start <= end) {
2826 			len = end - start + 1;
2827 			spin_lock(&map->lock);
2828 			em = lookup_extent_mapping(map, start, len);
2829 			if (!em || IS_ERR(em)) {
2830 				spin_unlock(&map->lock);
2831 				break;
2832 			}
2833 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2834 			    em->start != start) {
2835 				spin_unlock(&map->lock);
2836 				free_extent_map(em);
2837 				break;
2838 			}
2839 			if (!test_range_bit(tree, em->start,
2840 					    extent_map_end(em) - 1,
2841 					    EXTENT_LOCKED | EXTENT_WRITEBACK |
2842 					    EXTENT_ORDERED,
2843 					    0)) {
2844 				remove_extent_mapping(map, em);
2845 				/* once for the rb tree */
2846 				free_extent_map(em);
2847 			}
2848 			start = extent_map_end(em);
2849 			spin_unlock(&map->lock);
2850 
2851 			/* once for us */
2852 			free_extent_map(em);
2853 		}
2854 	}
2855 	return try_release_extent_state(map, tree, page, mask);
2856 }
2857 
2858 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2859 		get_extent_t *get_extent)
2860 {
2861 	struct inode *inode = mapping->host;
2862 	u64 start = iblock << inode->i_blkbits;
2863 	sector_t sector = 0;
2864 	size_t blksize = (1 << inode->i_blkbits);
2865 	struct extent_map *em;
2866 
2867 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2868 		    GFP_NOFS);
2869 	em = get_extent(inode, NULL, 0, start, blksize, 0);
2870 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2871 		      GFP_NOFS);
2872 	if (!em || IS_ERR(em))
2873 		return 0;
2874 
2875 	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2876 		goto out;
2877 
2878 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2879 out:
2880 	free_extent_map(em);
2881 	return sector;
2882 }
2883 
2884 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2885 		__u64 start, __u64 len, get_extent_t *get_extent)
2886 {
2887 	int ret;
2888 	u64 off = start;
2889 	u64 max = start + len;
2890 	u32 flags = 0;
2891 	u64 disko = 0;
2892 	struct extent_map *em = NULL;
2893 	int end = 0;
2894 	u64 em_start = 0, em_len = 0;
2895 	unsigned long emflags;
2896 	ret = 0;
2897 
2898 	if (len == 0)
2899 		return -EINVAL;
2900 
2901 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2902 		GFP_NOFS);
2903 	em = get_extent(inode, NULL, 0, off, max - off, 0);
2904 	if (!em)
2905 		goto out;
2906 	if (IS_ERR(em)) {
2907 		ret = PTR_ERR(em);
2908 		goto out;
2909 	}
2910 	while (!end) {
2911 		off = em->start + em->len;
2912 		if (off >= max)
2913 			end = 1;
2914 
2915 		em_start = em->start;
2916 		em_len = em->len;
2917 
2918 		disko = 0;
2919 		flags = 0;
2920 
2921 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2922 			end = 1;
2923 			flags |= FIEMAP_EXTENT_LAST;
2924 		} else if (em->block_start == EXTENT_MAP_HOLE) {
2925 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2926 		} else if (em->block_start == EXTENT_MAP_INLINE) {
2927 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2928 				  FIEMAP_EXTENT_NOT_ALIGNED);
2929 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2930 			flags |= (FIEMAP_EXTENT_DELALLOC |
2931 				  FIEMAP_EXTENT_UNKNOWN);
2932 		} else {
2933 			disko = em->block_start;
2934 		}
2935 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2936 			flags |= FIEMAP_EXTENT_ENCODED;
2937 
2938 		emflags = em->flags;
2939 		free_extent_map(em);
2940 		em = NULL;
2941 
2942 		if (!end) {
2943 			em = get_extent(inode, NULL, 0, off, max - off, 0);
2944 			if (!em)
2945 				goto out;
2946 			if (IS_ERR(em)) {
2947 				ret = PTR_ERR(em);
2948 				goto out;
2949 			}
2950 			emflags = em->flags;
2951 		}
2952 		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2953 			flags |= FIEMAP_EXTENT_LAST;
2954 			end = 1;
2955 		}
2956 
2957 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2958 					em_len, flags);
2959 		if (ret)
2960 			goto out_free;
2961 	}
2962 out_free:
2963 	free_extent_map(em);
2964 out:
2965 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2966 			GFP_NOFS);
2967 	return ret;
2968 }
2969 
2970 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2971 					      unsigned long i)
2972 {
2973 	struct page *p;
2974 	struct address_space *mapping;
2975 
2976 	if (i == 0)
2977 		return eb->first_page;
2978 	i += eb->start >> PAGE_CACHE_SHIFT;
2979 	mapping = eb->first_page->mapping;
2980 	if (!mapping)
2981 		return NULL;
2982 
2983 	/*
2984 	 * extent_buffer_page is only called after pinning the page
2985 	 * by increasing the reference count.  So we know the page must
2986 	 * be in the radix tree.
2987 	 */
2988 	rcu_read_lock();
2989 	p = radix_tree_lookup(&mapping->page_tree, i);
2990 	rcu_read_unlock();
2991 
2992 	return p;
2993 }
2994 
2995 static inline unsigned long num_extent_pages(u64 start, u64 len)
2996 {
2997 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2998 		(start >> PAGE_CACHE_SHIFT);
2999 }
3000 
3001 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3002 						   u64 start,
3003 						   unsigned long len,
3004 						   gfp_t mask)
3005 {
3006 	struct extent_buffer *eb = NULL;
3007 #if LEAK_DEBUG
3008 	unsigned long flags;
3009 #endif
3010 
3011 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3012 	eb->start = start;
3013 	eb->len = len;
3014 	spin_lock_init(&eb->lock);
3015 	init_waitqueue_head(&eb->lock_wq);
3016 
3017 #if LEAK_DEBUG
3018 	spin_lock_irqsave(&leak_lock, flags);
3019 	list_add(&eb->leak_list, &buffers);
3020 	spin_unlock_irqrestore(&leak_lock, flags);
3021 #endif
3022 	atomic_set(&eb->refs, 1);
3023 
3024 	return eb;
3025 }
3026 
3027 static void __free_extent_buffer(struct extent_buffer *eb)
3028 {
3029 #if LEAK_DEBUG
3030 	unsigned long flags;
3031 	spin_lock_irqsave(&leak_lock, flags);
3032 	list_del(&eb->leak_list);
3033 	spin_unlock_irqrestore(&leak_lock, flags);
3034 #endif
3035 	kmem_cache_free(extent_buffer_cache, eb);
3036 }
3037 
3038 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3039 					  u64 start, unsigned long len,
3040 					  struct page *page0,
3041 					  gfp_t mask)
3042 {
3043 	unsigned long num_pages = num_extent_pages(start, len);
3044 	unsigned long i;
3045 	unsigned long index = start >> PAGE_CACHE_SHIFT;
3046 	struct extent_buffer *eb;
3047 	struct extent_buffer *exists = NULL;
3048 	struct page *p;
3049 	struct address_space *mapping = tree->mapping;
3050 	int uptodate = 1;
3051 
3052 	spin_lock(&tree->buffer_lock);
3053 	eb = buffer_search(tree, start);
3054 	if (eb) {
3055 		atomic_inc(&eb->refs);
3056 		spin_unlock(&tree->buffer_lock);
3057 		mark_page_accessed(eb->first_page);
3058 		return eb;
3059 	}
3060 	spin_unlock(&tree->buffer_lock);
3061 
3062 	eb = __alloc_extent_buffer(tree, start, len, mask);
3063 	if (!eb)
3064 		return NULL;
3065 
3066 	if (page0) {
3067 		eb->first_page = page0;
3068 		i = 1;
3069 		index++;
3070 		page_cache_get(page0);
3071 		mark_page_accessed(page0);
3072 		set_page_extent_mapped(page0);
3073 		set_page_extent_head(page0, len);
3074 		uptodate = PageUptodate(page0);
3075 	} else {
3076 		i = 0;
3077 	}
3078 	for (; i < num_pages; i++, index++) {
3079 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3080 		if (!p) {
3081 			WARN_ON(1);
3082 			goto free_eb;
3083 		}
3084 		set_page_extent_mapped(p);
3085 		mark_page_accessed(p);
3086 		if (i == 0) {
3087 			eb->first_page = p;
3088 			set_page_extent_head(p, len);
3089 		} else {
3090 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3091 		}
3092 		if (!PageUptodate(p))
3093 			uptodate = 0;
3094 		unlock_page(p);
3095 	}
3096 	if (uptodate)
3097 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3098 
3099 	spin_lock(&tree->buffer_lock);
3100 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3101 	if (exists) {
3102 		/* add one reference for the caller */
3103 		atomic_inc(&exists->refs);
3104 		spin_unlock(&tree->buffer_lock);
3105 		goto free_eb;
3106 	}
3107 	spin_unlock(&tree->buffer_lock);
3108 
3109 	/* add one reference for the tree */
3110 	atomic_inc(&eb->refs);
3111 	return eb;
3112 
3113 free_eb:
3114 	if (!atomic_dec_and_test(&eb->refs))
3115 		return exists;
3116 	for (index = 1; index < i; index++)
3117 		page_cache_release(extent_buffer_page(eb, index));
3118 	page_cache_release(extent_buffer_page(eb, 0));
3119 	__free_extent_buffer(eb);
3120 	return exists;
3121 }
3122 
3123 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3124 					 u64 start, unsigned long len,
3125 					  gfp_t mask)
3126 {
3127 	struct extent_buffer *eb;
3128 
3129 	spin_lock(&tree->buffer_lock);
3130 	eb = buffer_search(tree, start);
3131 	if (eb)
3132 		atomic_inc(&eb->refs);
3133 	spin_unlock(&tree->buffer_lock);
3134 
3135 	if (eb)
3136 		mark_page_accessed(eb->first_page);
3137 
3138 	return eb;
3139 }
3140 
3141 void free_extent_buffer(struct extent_buffer *eb)
3142 {
3143 	if (!eb)
3144 		return;
3145 
3146 	if (!atomic_dec_and_test(&eb->refs))
3147 		return;
3148 
3149 	WARN_ON(1);
3150 }
3151 
3152 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3153 			      struct extent_buffer *eb)
3154 {
3155 	unsigned long i;
3156 	unsigned long num_pages;
3157 	struct page *page;
3158 
3159 	num_pages = num_extent_pages(eb->start, eb->len);
3160 
3161 	for (i = 0; i < num_pages; i++) {
3162 		page = extent_buffer_page(eb, i);
3163 		if (!PageDirty(page))
3164 			continue;
3165 
3166 		lock_page(page);
3167 		if (i == 0)
3168 			set_page_extent_head(page, eb->len);
3169 		else
3170 			set_page_private(page, EXTENT_PAGE_PRIVATE);
3171 
3172 		clear_page_dirty_for_io(page);
3173 		spin_lock_irq(&page->mapping->tree_lock);
3174 		if (!PageDirty(page)) {
3175 			radix_tree_tag_clear(&page->mapping->page_tree,
3176 						page_index(page),
3177 						PAGECACHE_TAG_DIRTY);
3178 		}
3179 		spin_unlock_irq(&page->mapping->tree_lock);
3180 		unlock_page(page);
3181 	}
3182 	return 0;
3183 }
3184 
3185 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3186 				    struct extent_buffer *eb)
3187 {
3188 	return wait_on_extent_writeback(tree, eb->start,
3189 					eb->start + eb->len - 1);
3190 }
3191 
3192 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3193 			     struct extent_buffer *eb)
3194 {
3195 	unsigned long i;
3196 	unsigned long num_pages;
3197 	int was_dirty = 0;
3198 
3199 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3200 	num_pages = num_extent_pages(eb->start, eb->len);
3201 	for (i = 0; i < num_pages; i++)
3202 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3203 	return was_dirty;
3204 }
3205 
3206 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3207 				struct extent_buffer *eb)
3208 {
3209 	unsigned long i;
3210 	struct page *page;
3211 	unsigned long num_pages;
3212 
3213 	num_pages = num_extent_pages(eb->start, eb->len);
3214 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3215 
3216 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3217 			      GFP_NOFS);
3218 	for (i = 0; i < num_pages; i++) {
3219 		page = extent_buffer_page(eb, i);
3220 		if (page)
3221 			ClearPageUptodate(page);
3222 	}
3223 	return 0;
3224 }
3225 
3226 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3227 				struct extent_buffer *eb)
3228 {
3229 	unsigned long i;
3230 	struct page *page;
3231 	unsigned long num_pages;
3232 
3233 	num_pages = num_extent_pages(eb->start, eb->len);
3234 
3235 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3236 			    GFP_NOFS);
3237 	for (i = 0; i < num_pages; i++) {
3238 		page = extent_buffer_page(eb, i);
3239 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3240 		    ((i == num_pages - 1) &&
3241 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3242 			check_page_uptodate(tree, page);
3243 			continue;
3244 		}
3245 		SetPageUptodate(page);
3246 	}
3247 	return 0;
3248 }
3249 
3250 int extent_range_uptodate(struct extent_io_tree *tree,
3251 			  u64 start, u64 end)
3252 {
3253 	struct page *page;
3254 	int ret;
3255 	int pg_uptodate = 1;
3256 	int uptodate;
3257 	unsigned long index;
3258 
3259 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3260 	if (ret)
3261 		return 1;
3262 	while (start <= end) {
3263 		index = start >> PAGE_CACHE_SHIFT;
3264 		page = find_get_page(tree->mapping, index);
3265 		uptodate = PageUptodate(page);
3266 		page_cache_release(page);
3267 		if (!uptodate) {
3268 			pg_uptodate = 0;
3269 			break;
3270 		}
3271 		start += PAGE_CACHE_SIZE;
3272 	}
3273 	return pg_uptodate;
3274 }
3275 
3276 int extent_buffer_uptodate(struct extent_io_tree *tree,
3277 			   struct extent_buffer *eb)
3278 {
3279 	int ret = 0;
3280 	unsigned long num_pages;
3281 	unsigned long i;
3282 	struct page *page;
3283 	int pg_uptodate = 1;
3284 
3285 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3286 		return 1;
3287 
3288 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3289 			   EXTENT_UPTODATE, 1);
3290 	if (ret)
3291 		return ret;
3292 
3293 	num_pages = num_extent_pages(eb->start, eb->len);
3294 	for (i = 0; i < num_pages; i++) {
3295 		page = extent_buffer_page(eb, i);
3296 		if (!PageUptodate(page)) {
3297 			pg_uptodate = 0;
3298 			break;
3299 		}
3300 	}
3301 	return pg_uptodate;
3302 }
3303 
3304 int read_extent_buffer_pages(struct extent_io_tree *tree,
3305 			     struct extent_buffer *eb,
3306 			     u64 start, int wait,
3307 			     get_extent_t *get_extent, int mirror_num)
3308 {
3309 	unsigned long i;
3310 	unsigned long start_i;
3311 	struct page *page;
3312 	int err;
3313 	int ret = 0;
3314 	int locked_pages = 0;
3315 	int all_uptodate = 1;
3316 	int inc_all_pages = 0;
3317 	unsigned long num_pages;
3318 	struct bio *bio = NULL;
3319 	unsigned long bio_flags = 0;
3320 
3321 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3322 		return 0;
3323 
3324 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3325 			   EXTENT_UPTODATE, 1)) {
3326 		return 0;
3327 	}
3328 
3329 	if (start) {
3330 		WARN_ON(start < eb->start);
3331 		start_i = (start >> PAGE_CACHE_SHIFT) -
3332 			(eb->start >> PAGE_CACHE_SHIFT);
3333 	} else {
3334 		start_i = 0;
3335 	}
3336 
3337 	num_pages = num_extent_pages(eb->start, eb->len);
3338 	for (i = start_i; i < num_pages; i++) {
3339 		page = extent_buffer_page(eb, i);
3340 		if (!wait) {
3341 			if (!trylock_page(page))
3342 				goto unlock_exit;
3343 		} else {
3344 			lock_page(page);
3345 		}
3346 		locked_pages++;
3347 		if (!PageUptodate(page))
3348 			all_uptodate = 0;
3349 	}
3350 	if (all_uptodate) {
3351 		if (start_i == 0)
3352 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3353 		goto unlock_exit;
3354 	}
3355 
3356 	for (i = start_i; i < num_pages; i++) {
3357 		page = extent_buffer_page(eb, i);
3358 		if (inc_all_pages)
3359 			page_cache_get(page);
3360 		if (!PageUptodate(page)) {
3361 			if (start_i == 0)
3362 				inc_all_pages = 1;
3363 			ClearPageError(page);
3364 			err = __extent_read_full_page(tree, page,
3365 						      get_extent, &bio,
3366 						      mirror_num, &bio_flags);
3367 			if (err)
3368 				ret = err;
3369 		} else {
3370 			unlock_page(page);
3371 		}
3372 	}
3373 
3374 	if (bio)
3375 		submit_one_bio(READ, bio, mirror_num, bio_flags);
3376 
3377 	if (ret || !wait)
3378 		return ret;
3379 
3380 	for (i = start_i; i < num_pages; i++) {
3381 		page = extent_buffer_page(eb, i);
3382 		wait_on_page_locked(page);
3383 		if (!PageUptodate(page))
3384 			ret = -EIO;
3385 	}
3386 
3387 	if (!ret)
3388 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3389 	return ret;
3390 
3391 unlock_exit:
3392 	i = start_i;
3393 	while (locked_pages > 0) {
3394 		page = extent_buffer_page(eb, i);
3395 		i++;
3396 		unlock_page(page);
3397 		locked_pages--;
3398 	}
3399 	return ret;
3400 }
3401 
3402 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3403 			unsigned long start,
3404 			unsigned long len)
3405 {
3406 	size_t cur;
3407 	size_t offset;
3408 	struct page *page;
3409 	char *kaddr;
3410 	char *dst = (char *)dstv;
3411 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3412 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3413 
3414 	WARN_ON(start > eb->len);
3415 	WARN_ON(start + len > eb->start + eb->len);
3416 
3417 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3418 
3419 	while (len > 0) {
3420 		page = extent_buffer_page(eb, i);
3421 
3422 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3423 		kaddr = kmap_atomic(page, KM_USER1);
3424 		memcpy(dst, kaddr + offset, cur);
3425 		kunmap_atomic(kaddr, KM_USER1);
3426 
3427 		dst += cur;
3428 		len -= cur;
3429 		offset = 0;
3430 		i++;
3431 	}
3432 }
3433 
3434 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3435 			       unsigned long min_len, char **token, char **map,
3436 			       unsigned long *map_start,
3437 			       unsigned long *map_len, int km)
3438 {
3439 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3440 	char *kaddr;
3441 	struct page *p;
3442 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3443 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3444 	unsigned long end_i = (start_offset + start + min_len - 1) >>
3445 		PAGE_CACHE_SHIFT;
3446 
3447 	if (i != end_i)
3448 		return -EINVAL;
3449 
3450 	if (i == 0) {
3451 		offset = start_offset;
3452 		*map_start = 0;
3453 	} else {
3454 		offset = 0;
3455 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3456 	}
3457 
3458 	if (start + min_len > eb->len) {
3459 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3460 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3461 		       eb->len, start, min_len);
3462 		WARN_ON(1);
3463 	}
3464 
3465 	p = extent_buffer_page(eb, i);
3466 	kaddr = kmap_atomic(p, km);
3467 	*token = kaddr;
3468 	*map = kaddr + offset;
3469 	*map_len = PAGE_CACHE_SIZE - offset;
3470 	return 0;
3471 }
3472 
3473 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3474 		      unsigned long min_len,
3475 		      char **token, char **map,
3476 		      unsigned long *map_start,
3477 		      unsigned long *map_len, int km)
3478 {
3479 	int err;
3480 	int save = 0;
3481 	if (eb->map_token) {
3482 		unmap_extent_buffer(eb, eb->map_token, km);
3483 		eb->map_token = NULL;
3484 		save = 1;
3485 	}
3486 	err = map_private_extent_buffer(eb, start, min_len, token, map,
3487 				       map_start, map_len, km);
3488 	if (!err && save) {
3489 		eb->map_token = *token;
3490 		eb->kaddr = *map;
3491 		eb->map_start = *map_start;
3492 		eb->map_len = *map_len;
3493 	}
3494 	return err;
3495 }
3496 
3497 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3498 {
3499 	kunmap_atomic(token, km);
3500 }
3501 
3502 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3503 			  unsigned long start,
3504 			  unsigned long len)
3505 {
3506 	size_t cur;
3507 	size_t offset;
3508 	struct page *page;
3509 	char *kaddr;
3510 	char *ptr = (char *)ptrv;
3511 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3512 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3513 	int ret = 0;
3514 
3515 	WARN_ON(start > eb->len);
3516 	WARN_ON(start + len > eb->start + eb->len);
3517 
3518 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3519 
3520 	while (len > 0) {
3521 		page = extent_buffer_page(eb, i);
3522 
3523 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3524 
3525 		kaddr = kmap_atomic(page, KM_USER0);
3526 		ret = memcmp(ptr, kaddr + offset, cur);
3527 		kunmap_atomic(kaddr, KM_USER0);
3528 		if (ret)
3529 			break;
3530 
3531 		ptr += cur;
3532 		len -= cur;
3533 		offset = 0;
3534 		i++;
3535 	}
3536 	return ret;
3537 }
3538 
3539 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3540 			 unsigned long start, unsigned long len)
3541 {
3542 	size_t cur;
3543 	size_t offset;
3544 	struct page *page;
3545 	char *kaddr;
3546 	char *src = (char *)srcv;
3547 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3548 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3549 
3550 	WARN_ON(start > eb->len);
3551 	WARN_ON(start + len > eb->start + eb->len);
3552 
3553 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3554 
3555 	while (len > 0) {
3556 		page = extent_buffer_page(eb, i);
3557 		WARN_ON(!PageUptodate(page));
3558 
3559 		cur = min(len, PAGE_CACHE_SIZE - offset);
3560 		kaddr = kmap_atomic(page, KM_USER1);
3561 		memcpy(kaddr + offset, src, cur);
3562 		kunmap_atomic(kaddr, KM_USER1);
3563 
3564 		src += cur;
3565 		len -= cur;
3566 		offset = 0;
3567 		i++;
3568 	}
3569 }
3570 
3571 void memset_extent_buffer(struct extent_buffer *eb, char c,
3572 			  unsigned long start, unsigned long len)
3573 {
3574 	size_t cur;
3575 	size_t offset;
3576 	struct page *page;
3577 	char *kaddr;
3578 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3579 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3580 
3581 	WARN_ON(start > eb->len);
3582 	WARN_ON(start + len > eb->start + eb->len);
3583 
3584 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3585 
3586 	while (len > 0) {
3587 		page = extent_buffer_page(eb, i);
3588 		WARN_ON(!PageUptodate(page));
3589 
3590 		cur = min(len, PAGE_CACHE_SIZE - offset);
3591 		kaddr = kmap_atomic(page, KM_USER0);
3592 		memset(kaddr + offset, c, cur);
3593 		kunmap_atomic(kaddr, KM_USER0);
3594 
3595 		len -= cur;
3596 		offset = 0;
3597 		i++;
3598 	}
3599 }
3600 
3601 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3602 			unsigned long dst_offset, unsigned long src_offset,
3603 			unsigned long len)
3604 {
3605 	u64 dst_len = dst->len;
3606 	size_t cur;
3607 	size_t offset;
3608 	struct page *page;
3609 	char *kaddr;
3610 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3611 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3612 
3613 	WARN_ON(src->len != dst_len);
3614 
3615 	offset = (start_offset + dst_offset) &
3616 		((unsigned long)PAGE_CACHE_SIZE - 1);
3617 
3618 	while (len > 0) {
3619 		page = extent_buffer_page(dst, i);
3620 		WARN_ON(!PageUptodate(page));
3621 
3622 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3623 
3624 		kaddr = kmap_atomic(page, KM_USER0);
3625 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3626 		kunmap_atomic(kaddr, KM_USER0);
3627 
3628 		src_offset += cur;
3629 		len -= cur;
3630 		offset = 0;
3631 		i++;
3632 	}
3633 }
3634 
3635 static void move_pages(struct page *dst_page, struct page *src_page,
3636 		       unsigned long dst_off, unsigned long src_off,
3637 		       unsigned long len)
3638 {
3639 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3640 	if (dst_page == src_page) {
3641 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3642 	} else {
3643 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3644 		char *p = dst_kaddr + dst_off + len;
3645 		char *s = src_kaddr + src_off + len;
3646 
3647 		while (len--)
3648 			*--p = *--s;
3649 
3650 		kunmap_atomic(src_kaddr, KM_USER1);
3651 	}
3652 	kunmap_atomic(dst_kaddr, KM_USER0);
3653 }
3654 
3655 static void copy_pages(struct page *dst_page, struct page *src_page,
3656 		       unsigned long dst_off, unsigned long src_off,
3657 		       unsigned long len)
3658 {
3659 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3660 	char *src_kaddr;
3661 
3662 	if (dst_page != src_page)
3663 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3664 	else
3665 		src_kaddr = dst_kaddr;
3666 
3667 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3668 	kunmap_atomic(dst_kaddr, KM_USER0);
3669 	if (dst_page != src_page)
3670 		kunmap_atomic(src_kaddr, KM_USER1);
3671 }
3672 
3673 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3674 			   unsigned long src_offset, unsigned long len)
3675 {
3676 	size_t cur;
3677 	size_t dst_off_in_page;
3678 	size_t src_off_in_page;
3679 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3680 	unsigned long dst_i;
3681 	unsigned long src_i;
3682 
3683 	if (src_offset + len > dst->len) {
3684 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3685 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3686 		BUG_ON(1);
3687 	}
3688 	if (dst_offset + len > dst->len) {
3689 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3690 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3691 		BUG_ON(1);
3692 	}
3693 
3694 	while (len > 0) {
3695 		dst_off_in_page = (start_offset + dst_offset) &
3696 			((unsigned long)PAGE_CACHE_SIZE - 1);
3697 		src_off_in_page = (start_offset + src_offset) &
3698 			((unsigned long)PAGE_CACHE_SIZE - 1);
3699 
3700 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3701 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3702 
3703 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3704 					       src_off_in_page));
3705 		cur = min_t(unsigned long, cur,
3706 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3707 
3708 		copy_pages(extent_buffer_page(dst, dst_i),
3709 			   extent_buffer_page(dst, src_i),
3710 			   dst_off_in_page, src_off_in_page, cur);
3711 
3712 		src_offset += cur;
3713 		dst_offset += cur;
3714 		len -= cur;
3715 	}
3716 }
3717 
3718 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3719 			   unsigned long src_offset, unsigned long len)
3720 {
3721 	size_t cur;
3722 	size_t dst_off_in_page;
3723 	size_t src_off_in_page;
3724 	unsigned long dst_end = dst_offset + len - 1;
3725 	unsigned long src_end = src_offset + len - 1;
3726 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3727 	unsigned long dst_i;
3728 	unsigned long src_i;
3729 
3730 	if (src_offset + len > dst->len) {
3731 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3732 		       "len %lu len %lu\n", src_offset, len, dst->len);
3733 		BUG_ON(1);
3734 	}
3735 	if (dst_offset + len > dst->len) {
3736 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3737 		       "len %lu len %lu\n", dst_offset, len, dst->len);
3738 		BUG_ON(1);
3739 	}
3740 	if (dst_offset < src_offset) {
3741 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3742 		return;
3743 	}
3744 	while (len > 0) {
3745 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3746 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3747 
3748 		dst_off_in_page = (start_offset + dst_end) &
3749 			((unsigned long)PAGE_CACHE_SIZE - 1);
3750 		src_off_in_page = (start_offset + src_end) &
3751 			((unsigned long)PAGE_CACHE_SIZE - 1);
3752 
3753 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3754 		cur = min(cur, dst_off_in_page + 1);
3755 		move_pages(extent_buffer_page(dst, dst_i),
3756 			   extent_buffer_page(dst, src_i),
3757 			   dst_off_in_page - cur + 1,
3758 			   src_off_in_page - cur + 1, cur);
3759 
3760 		dst_end -= cur;
3761 		src_end -= cur;
3762 		len -= cur;
3763 	}
3764 }
3765 
3766 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3767 {
3768 	u64 start = page_offset(page);
3769 	struct extent_buffer *eb;
3770 	int ret = 1;
3771 	unsigned long i;
3772 	unsigned long num_pages;
3773 
3774 	spin_lock(&tree->buffer_lock);
3775 	eb = buffer_search(tree, start);
3776 	if (!eb)
3777 		goto out;
3778 
3779 	if (atomic_read(&eb->refs) > 1) {
3780 		ret = 0;
3781 		goto out;
3782 	}
3783 	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3784 		ret = 0;
3785 		goto out;
3786 	}
3787 	/* at this point we can safely release the extent buffer */
3788 	num_pages = num_extent_pages(eb->start, eb->len);
3789 	for (i = 0; i < num_pages; i++)
3790 		page_cache_release(extent_buffer_page(eb, i));
3791 	rb_erase(&eb->rb_node, &tree->buffer);
3792 	__free_extent_buffer(eb);
3793 out:
3794 	spin_unlock(&tree->buffer_lock);
3795 	return ret;
3796 }
3797