xref: /linux/fs/btrfs/extent-tree.c (revision e2fc4d19292ef2eb208f76976ddc3320cc5839b6)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include "compat.h"
24 #include "hash.h"
25 #include "crc32c.h"
26 #include "ctree.h"
27 #include "disk-io.h"
28 #include "print-tree.h"
29 #include "transaction.h"
30 #include "volumes.h"
31 #include "locking.h"
32 #include "ref-cache.h"
33 
34 #define PENDING_EXTENT_INSERT 0
35 #define PENDING_EXTENT_DELETE 1
36 #define PENDING_BACKREF_UPDATE 2
37 
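/*
 * a pending_extent_op describes one deferred extent tree change: an
 * extent insert, an extent delete, or a backref update.  The ops are
 * stashed in the extent_ins/pending_del extent_io trees and then
 * processed in batches by finish_current_insert, del_pending_extents
 * and friends.
 */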
38 struct pending_extent_op {
39 	int type;
40 	u64 bytenr;
41 	u64 num_bytes;
42 	u64 parent;
43 	u64 orig_parent;
44 	u64 generation;
45 	u64 orig_generation;
46 	int level;
47 	struct list_head list;
48 	int del;
49 };
50 
51 static int finish_current_insert(struct btrfs_trans_handle *trans,
52 				 struct btrfs_root *extent_root, int all);
53 static int del_pending_extents(struct btrfs_trans_handle *trans,
54 			       struct btrfs_root *extent_root, int all);
55 static int pin_down_bytes(struct btrfs_trans_handle *trans,
56 			  struct btrfs_root *root,
57 			  u64 bytenr, u64 num_bytes, int is_data);
58 static int update_block_group(struct btrfs_trans_handle *trans,
59 			      struct btrfs_root *root,
60 			      u64 bytenr, u64 num_bytes, int alloc,
61 			      int mark_free);
62 
63 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
64 			  struct btrfs_root *extent_root, u64 alloc_bytes,
65 			  u64 flags, int force);
66 
67 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
68 {
69 	return (cache->flags & bits) == bits;
70 }
71 
72 /*
73  * this adds the block group to the fs_info rb tree for the block group
74  * cache
75  */
76 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
77 				struct btrfs_block_group_cache *block_group)
78 {
79 	struct rb_node **p;
80 	struct rb_node *parent = NULL;
81 	struct btrfs_block_group_cache *cache;
82 
83 	spin_lock(&info->block_group_cache_lock);
84 	p = &info->block_group_cache_tree.rb_node;
85 
86 	while (*p) {
87 		parent = *p;
88 		cache = rb_entry(parent, struct btrfs_block_group_cache,
89 				 cache_node);
90 		if (block_group->key.objectid < cache->key.objectid) {
91 			p = &(*p)->rb_left;
92 		} else if (block_group->key.objectid > cache->key.objectid) {
93 			p = &(*p)->rb_right;
94 		} else {
95 			spin_unlock(&info->block_group_cache_lock);
96 			return -EEXIST;
97 		}
98 	}
99 
100 	rb_link_node(&block_group->cache_node, parent, p);
101 	rb_insert_color(&block_group->cache_node,
102 			&info->block_group_cache_tree);
103 	spin_unlock(&info->block_group_cache_lock);
104 
105 	return 0;
106 }
107 
108 /*
109  * This will return the block group at or after bytenr if contains is 0, else
110  * it will return the block group that contains the bytenr
111  */
112 static struct btrfs_block_group_cache *
113 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
114 			      int contains)
115 {
116 	struct btrfs_block_group_cache *cache, *ret = NULL;
117 	struct rb_node *n;
118 	u64 end, start;
119 
120 	spin_lock(&info->block_group_cache_lock);
121 	n = info->block_group_cache_tree.rb_node;
122 
123 	while (n) {
124 		cache = rb_entry(n, struct btrfs_block_group_cache,
125 				 cache_node);
126 		end = cache->key.objectid + cache->key.offset - 1;
127 		start = cache->key.objectid;
128 
129 		if (bytenr < start) {
130 			if (!contains && (!ret || start < ret->key.objectid))
131 				ret = cache;
132 			n = n->rb_left;
133 		} else if (bytenr > start) {
134 			if (contains && bytenr <= end) {
135 				ret = cache;
136 				break;
137 			}
138 			n = n->rb_right;
139 		} else {
140 			ret = cache;
141 			break;
142 		}
143 	}
144 	if (ret)
145 		atomic_inc(&ret->count);
146 	spin_unlock(&info->block_group_cache_lock);
147 
148 	return ret;
149 }
150 
151 /*
152  * this is only called by cache_block_group.  Since we could have freed
153  * extents, we need to check the pinned_extents for any extents that can't
154  * be used yet, because their free space is held until the transaction commits.
155  */
156 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
157 			      struct btrfs_fs_info *info, u64 start, u64 end)
158 {
159 	u64 extent_start, extent_end, size;
160 	int ret;
161 
162 	mutex_lock(&info->pinned_mutex);
163 	while (start < end) {
164 		ret = find_first_extent_bit(&info->pinned_extents, start,
165 					    &extent_start, &extent_end,
166 					    EXTENT_DIRTY);
167 		if (ret)
168 			break;
169 
170 		if (extent_start == start) {
171 			start = extent_end + 1;
172 		} else if (extent_start > start && extent_start < end) {
173 			size = extent_start - start;
174 			ret = btrfs_add_free_space(block_group, start,
175 						   size);
176 			BUG_ON(ret);
177 			start = extent_end + 1;
178 		} else {
179 			break;
180 		}
181 	}
182 
183 	if (start < end) {
184 		size = end - start;
185 		ret = btrfs_add_free_space(block_group, start, size);
186 		BUG_ON(ret);
187 	}
188 	mutex_unlock(&info->pinned_mutex);
189 
190 	return 0;
191 }
192 
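/*
 * the super block mirrors sit at fixed device offsets.  Use
 * btrfs_rmap_block to find the logical ranges they occupy in this block
 * group and remove those ranges from the free space cache so the
 * allocator never hands them out.
 */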
193 static int remove_sb_from_cache(struct btrfs_root *root,
194 				struct btrfs_block_group_cache *cache)
195 {
196 	u64 bytenr;
197 	u64 *logical;
198 	int stripe_len;
199 	int i, nr, ret;
200 
201 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
202 		bytenr = btrfs_sb_offset(i);
203 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
204 				       cache->key.objectid, bytenr, 0,
205 				       &logical, &nr, &stripe_len);
206 		BUG_ON(ret);
207 		while (nr--) {
208 			btrfs_remove_free_space(cache, logical[nr],
209 						stripe_len);
210 		}
211 		kfree(logical);
212 	}
213 	return 0;
214 }
215 
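/*
 * fill in a block group's free space cache by walking the extent tree.
 * Any gap between allocated extents inside the group is free space;
 * add_new_free_space records it, skipping ranges that are still pinned
 * by the running transaction.
 */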
216 static int cache_block_group(struct btrfs_root *root,
217 			     struct btrfs_block_group_cache *block_group)
218 {
219 	struct btrfs_path *path;
220 	int ret = 0;
221 	struct btrfs_key key;
222 	struct extent_buffer *leaf;
223 	int slot;
224 	u64 last;
225 
226 	if (!block_group)
227 		return 0;
228 
229 	root = root->fs_info->extent_root;
230 
231 	if (block_group->cached)
232 		return 0;
233 
234 	path = btrfs_alloc_path();
235 	if (!path)
236 		return -ENOMEM;
237 
238 	path->reada = 2;
239 	/*
240 	 * we get into deadlocks with paths held by callers of this function.
241 	 * since the alloc_mutex is protecting things right now, just
242 	 * skip the locking here
243 	 */
244 	path->skip_locking = 1;
245 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
246 	key.objectid = last;
247 	key.offset = 0;
248 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
249 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
250 	if (ret < 0)
251 		goto err;
252 
253 	while (1) {
254 		leaf = path->nodes[0];
255 		slot = path->slots[0];
256 		if (slot >= btrfs_header_nritems(leaf)) {
257 			ret = btrfs_next_leaf(root, path);
258 			if (ret < 0)
259 				goto err;
260 			if (ret == 0)
261 				continue;
262 			else
263 				break;
264 		}
265 		btrfs_item_key_to_cpu(leaf, &key, slot);
266 		if (key.objectid < block_group->key.objectid)
267 			goto next;
268 
269 		if (key.objectid >= block_group->key.objectid +
270 		    block_group->key.offset)
271 			break;
272 
273 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
274 			add_new_free_space(block_group, root->fs_info, last,
275 					   key.objectid);
276 
277 			last = key.objectid + key.offset;
278 		}
279 next:
280 		path->slots[0]++;
281 	}
282 
283 	add_new_free_space(block_group, root->fs_info, last,
284 			   block_group->key.objectid +
285 			   block_group->key.offset);
286 
287 	remove_sb_from_cache(root, block_group);
288 	block_group->cached = 1;
289 	ret = 0;
290 err:
291 	btrfs_free_path(path);
292 	return ret;
293 }
294 
295 /*
296  * return the block group that starts at or after bytenr
297  */
298 static struct btrfs_block_group_cache *
299 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
300 {
301 	struct btrfs_block_group_cache *cache;
302 
303 	cache = block_group_cache_tree_search(info, bytenr, 0);
304 
305 	return cache;
306 }
307 
308 /*
309  * return the block group that contains the given bytenr
310  */
311 struct btrfs_block_group_cache *btrfs_lookup_block_group(
312 						 struct btrfs_fs_info *info,
313 						 u64 bytenr)
314 {
315 	struct btrfs_block_group_cache *cache;
316 
317 	cache = block_group_cache_tree_search(info, bytenr, 1);
318 
319 	return cache;
320 }
321 
322 static inline void put_block_group(struct btrfs_block_group_cache *cache)
323 {
324 	if (atomic_dec_and_test(&cache->count))
325 		kfree(cache);
326 }
327 
328 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
329 						  u64 flags)
330 {
331 	struct list_head *head = &info->space_info;
332 	struct btrfs_space_info *found;
333 	list_for_each_entry(found, head, list) {
334 		if (found->flags == flags)
335 			return found;
336 	}
337 	return NULL;
338 }
339 
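/*
 * scale num by factor tenths, e.g. div_factor(n, 9) returns 90% of n.
 * Used below to decide how full a block group is allowed to be.
 */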
340 static u64 div_factor(u64 num, int factor)
341 {
342 	if (factor == 10)
343 		return num;
344 	num *= factor;
345 	do_div(num, 10);
346 	return num;
347 }
348 
349 u64 btrfs_find_block_group(struct btrfs_root *root,
350 			   u64 search_start, u64 search_hint, int owner)
351 {
352 	struct btrfs_block_group_cache *cache;
353 	u64 used;
354 	u64 last = max(search_hint, search_start);
355 	u64 group_start = 0;
356 	int full_search = 0;
357 	int factor = 9;
358 	int wrapped = 0;
359 again:
360 	while (1) {
361 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
362 		if (!cache)
363 			break;
364 
365 		spin_lock(&cache->lock);
366 		last = cache->key.objectid + cache->key.offset;
367 		used = btrfs_block_group_used(&cache->item);
368 
369 		if ((full_search || !cache->ro) &&
370 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
371 			if (used + cache->pinned + cache->reserved <
372 			    div_factor(cache->key.offset, factor)) {
373 				group_start = cache->key.objectid;
374 				spin_unlock(&cache->lock);
375 				put_block_group(cache);
376 				goto found;
377 			}
378 		}
379 		spin_unlock(&cache->lock);
380 		put_block_group(cache);
381 		cond_resched();
382 	}
383 	if (!wrapped) {
384 		last = search_start;
385 		wrapped = 1;
386 		goto again;
387 	}
388 	if (!full_search && factor < 10) {
389 		last = search_start;
390 		full_search = 1;
391 		factor = 10;
392 		goto again;
393 	}
394 found:
395 	return group_start;
396 }
397 
398 /* simple helper to search for an existing extent at a given offset */
399 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
400 {
401 	int ret;
402 	struct btrfs_key key;
403 	struct btrfs_path *path;
404 
405 	path = btrfs_alloc_path();
406 	BUG_ON(!path);
407 	key.objectid = start;
408 	key.offset = len;
409 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
410 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
411 				0, 0);
412 	btrfs_free_path(path);
413 	return ret;
414 }
415 
416 /*
417  * Back reference rules.  Back refs have three main goals:
418  *
419  * 1) differentiate between all holders of references to an extent so that
420  *    when a reference is dropped we can make sure it was a valid reference
421  *    before freeing the extent.
422  *
423  * 2) Provide enough information to quickly find the holders of an extent
424  *    if we notice a given block is corrupted or bad.
425  *
426  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
427  *    maintenance.  This is actually the same as #2, but with a slightly
428  *    different use case.
429  *
430  * File extents can be referenced by:
431  *
432  * - multiple snapshots, subvolumes, or different generations in one subvol
433  * - different files inside a single subvolume
434  * - different offsets inside a file (bookend extents in file.c)
435  *
436  * The extent ref structure has fields for:
437  *
438  * - Objectid of the subvolume root
439  * - Generation number of the tree holding the reference
440  * - objectid of the file holding the reference
441  * - number of references held by the parent node (always 1 for tree blocks)
442  *
443  * A btree leaf may hold multiple references to a file extent. In most
444  * cases, these references are from the same file and the corresponding
445  * offsets inside the file are close together.
446  *
447  * When a file extent is allocated the fields are filled in:
448  *     (root_key.objectid, trans->transid, inode objectid, 1)
449  *
450  * When a leaf is cow'd new references are added for every file extent found
451  * in the leaf.  It looks similar to the create case, but trans->transid will
452  * be different when the block is cow'd.
453  *
454  *     (root_key.objectid, trans->transid, inode objectid,
455  *      number of references in the leaf)
456  *
457  * When a file extent is removed either during snapshot deletion or
458  * file truncation, we find the corresponding back reference and check
459  * the following fields:
460  *
461  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
462  *      inode objectid)
463  *
464  * Btree extents can be referenced by:
465  *
466  * - Different subvolumes
467  * - Different generations of the same subvolume
468  *
469  * When a tree block is created, back references are inserted:
470  *
471  * (root->root_key.objectid, trans->transid, level, 1)
472  *
473  * When a tree block is cow'd, new back references are added for all the
474  * blocks it points to. If the tree block isn't in a reference counted root,
475  * the old back references are removed. These new back references are of
476  * the form (trans->transid will have increased since creation):
477  *
478  * (root->root_key.objectid, trans->transid, level, 1)
479  *
480  * When a backref is being deleted, the following fields are checked:
481  *
482  * if backref was for a tree root:
483  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
484  * else
485  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
486  *
487  * Back Reference Key composing:
488  *
489  * The key objectid corresponds to the first byte in the extent, the key
490  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
491  * byte of the parent extent. If an extent is a tree root, the key offset
492  * is set to the key objectid.
493  */
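/*
 * Illustrative example (values made up): a file extent allocated by the
 * fs tree (root objectid 5) in transaction 100 for inode 257 starts out
 * with the back reference fields (5, 100, 257, 1).  If the leaf holding
 * it is cow'd in transaction 120, a reference of the form
 * (5, 120, 257, number of refs in the new leaf) is added.
 */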
494 
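/*
 * find the backref item for the extent at bytenr whose parent block
 * starts at parent.  The caller passes in the root, generation and
 * owner it expects; if the item we find doesn't match, we warn and
 * return -EIO.  del tells btrfs_search_slot to set up for a following
 * deletion.
 */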
495 static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
496 					  struct btrfs_root *root,
497 					  struct btrfs_path *path,
498 					  u64 bytenr, u64 parent,
499 					  u64 ref_root, u64 ref_generation,
500 					  u64 owner_objectid, int del)
501 {
502 	struct btrfs_key key;
503 	struct btrfs_extent_ref *ref;
504 	struct extent_buffer *leaf;
505 	u64 ref_objectid;
506 	int ret;
507 
508 	key.objectid = bytenr;
509 	key.type = BTRFS_EXTENT_REF_KEY;
510 	key.offset = parent;
511 
512 	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
513 	if (ret < 0)
514 		goto out;
515 	if (ret > 0) {
516 		ret = -ENOENT;
517 		goto out;
518 	}
519 
520 	leaf = path->nodes[0];
521 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
522 	ref_objectid = btrfs_ref_objectid(leaf, ref);
523 	if (btrfs_ref_root(leaf, ref) != ref_root ||
524 	    btrfs_ref_generation(leaf, ref) != ref_generation ||
525 	    (ref_objectid != owner_objectid &&
526 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
527 		ret = -EIO;
528 		WARN_ON(1);
529 		goto out;
530 	}
531 	ret = 0;
532 out:
533 	return ret;
534 }
535 
536 /*
537  * updates all the backrefs that are pending on update_list for the
538  * extent_root
539  */
540 static noinline int update_backrefs(struct btrfs_trans_handle *trans,
541 				    struct btrfs_root *extent_root,
542 				    struct btrfs_path *path,
543 				    struct list_head *update_list)
544 {
545 	struct btrfs_key key;
546 	struct btrfs_extent_ref *ref;
547 	struct btrfs_fs_info *info = extent_root->fs_info;
548 	struct pending_extent_op *op;
549 	struct extent_buffer *leaf;
550 	int ret = 0;
551 	struct list_head *cur = update_list->next;
552 	u64 ref_objectid;
553 	u64 ref_root = extent_root->root_key.objectid;
554 
555 	op = list_entry(cur, struct pending_extent_op, list);
556 
557 search:
558 	key.objectid = op->bytenr;
559 	key.type = BTRFS_EXTENT_REF_KEY;
560 	key.offset = op->orig_parent;
561 
562 	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
563 	BUG_ON(ret);
564 
565 	leaf = path->nodes[0];
566 
567 loop:
568 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
569 
570 	ref_objectid = btrfs_ref_objectid(leaf, ref);
571 
572 	if (btrfs_ref_root(leaf, ref) != ref_root ||
573 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
574 	    (ref_objectid != op->level &&
575 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
576 		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
577 		       "root %llu, owner %u\n",
578 		       (unsigned long long)op->bytenr,
579 		       (unsigned long long)op->orig_parent,
580 		       (unsigned long long)ref_root, op->level);
581 		btrfs_print_leaf(extent_root, leaf);
582 		BUG();
583 	}
584 
585 	key.objectid = op->bytenr;
586 	key.offset = op->parent;
587 	key.type = BTRFS_EXTENT_REF_KEY;
588 	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
589 	BUG_ON(ret);
590 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
591 	btrfs_set_ref_generation(leaf, ref, op->generation);
592 
593 	cur = cur->next;
594 
595 	list_del_init(&op->list);
596 	unlock_extent(&info->extent_ins, op->bytenr,
597 		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
598 	kfree(op);
599 
600 	if (cur == update_list) {
601 		btrfs_mark_buffer_dirty(path->nodes[0]);
602 		btrfs_release_path(extent_root, path);
603 		goto out;
604 	}
605 
606 	op = list_entry(cur, struct pending_extent_op, list);
607 
608 	path->slots[0]++;
609 	while (path->slots[0] < btrfs_header_nritems(leaf)) {
610 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
611 		if (key.objectid == op->bytenr &&
612 		    key.type == BTRFS_EXTENT_REF_KEY)
613 			goto loop;
614 		path->slots[0]++;
615 	}
616 
617 	btrfs_mark_buffer_dirty(path->nodes[0]);
618 	btrfs_release_path(extent_root, path);
619 	goto search;
620 
621 out:
622 	return 0;
623 }
624 
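/*
 * batch insert the extent items and backrefs for everything on
 * insert_list.  Each pending op turns into two keys, an EXTENT_ITEM and
 * an EXTENT_REF, and btrfs_insert_some_items is used to push as many of
 * them into each leaf as possible.
 */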
625 static noinline int insert_extents(struct btrfs_trans_handle *trans,
626 				   struct btrfs_root *extent_root,
627 				   struct btrfs_path *path,
628 				   struct list_head *insert_list, int nr)
629 {
630 	struct btrfs_key *keys;
631 	u32 *data_size;
632 	struct pending_extent_op *op;
633 	struct extent_buffer *leaf;
634 	struct list_head *cur = insert_list->next;
635 	struct btrfs_fs_info *info = extent_root->fs_info;
636 	u64 ref_root = extent_root->root_key.objectid;
637 	int i = 0, last = 0, ret;
638 	int total = nr * 2;
639 
640 	if (!nr)
641 		return 0;
642 
643 	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
644 	if (!keys)
645 		return -ENOMEM;
646 
647 	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
648 	if (!data_size) {
649 		kfree(keys);
650 		return -ENOMEM;
651 	}
652 
653 	list_for_each_entry(op, insert_list, list) {
654 		keys[i].objectid = op->bytenr;
655 		keys[i].offset = op->num_bytes;
656 		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
657 		data_size[i] = sizeof(struct btrfs_extent_item);
658 		i++;
659 
660 		keys[i].objectid = op->bytenr;
661 		keys[i].offset = op->parent;
662 		keys[i].type = BTRFS_EXTENT_REF_KEY;
663 		data_size[i] = sizeof(struct btrfs_extent_ref);
664 		i++;
665 	}
666 
667 	op = list_entry(cur, struct pending_extent_op, list);
668 	i = 0;
669 	while (i < total) {
670 		int c;
671 		ret = btrfs_insert_some_items(trans, extent_root, path,
672 					      keys+i, data_size+i, total-i);
673 		BUG_ON(ret < 0);
674 
675 		if (last && ret > 1)
676 			BUG();
677 
678 		leaf = path->nodes[0];
679 		for (c = 0; c < ret; c++) {
680 			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
681 
682 			/*
683 			 * if the first item we inserted was a backref, then
684 			 * the EXTENT_ITEM will be the odd c's, else it will
685 			 * be the even c's
686 			 */
687 			if ((ref_first && (c % 2)) ||
688 			    (!ref_first && !(c % 2))) {
689 				struct btrfs_extent_item *itm;
690 
691 				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
692 						     struct btrfs_extent_item);
693 				btrfs_set_extent_refs(path->nodes[0], itm, 1);
694 				op->del++;
695 			} else {
696 				struct btrfs_extent_ref *ref;
697 
698 				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
699 						     struct btrfs_extent_ref);
700 				btrfs_set_ref_root(leaf, ref, ref_root);
701 				btrfs_set_ref_generation(leaf, ref,
702 							 op->generation);
703 				btrfs_set_ref_objectid(leaf, ref, op->level);
704 				btrfs_set_ref_num_refs(leaf, ref, 1);
705 				op->del++;
706 			}
707 
708 			/*
709 			 * we use del to see when it's ok to free up the
710 			 * pending_extent_op.  In the case where we insert the
711 			 * last item on the list in order to help do batching,
712 			 * we must not free the extent op until we actually
713 			 * insert the extent_item
714 			 */
715 			if (op->del == 2) {
716 				unlock_extent(&info->extent_ins, op->bytenr,
717 					      op->bytenr + op->num_bytes - 1,
718 					      GFP_NOFS);
719 				cur = cur->next;
720 				list_del_init(&op->list);
721 				kfree(op);
722 				if (cur != insert_list)
723 					op = list_entry(cur,
724 						struct pending_extent_op,
725 						list);
726 			}
727 		}
728 		btrfs_mark_buffer_dirty(leaf);
729 		btrfs_release_path(extent_root, path);
730 
731 		/*
732 		 * Ok, backrefs and items usually go right next to each other,
733 		 * but if we could only insert 1 item that means that we
734 		 * inserted on the end of a leaf, and we have no idea what may
735 		 * be on the next leaf so we just play it safe.  In order to
736 		 * try and help this case we insert the last thing on our
737 		 * insert list so hopefully it will end up being the last
738 		 * thing on the leaf and everything else will be before it,
739 		 * which will let us insert a whole bunch of items at the same
740 		 * time.
741 		 */
742 		if (ret == 1 && !last && (i + ret < total)) {
743 			/*
744 			 * last: where we will pick up the next time around
745 			 * i: our current key to insert, will be total - 1
746 			 * cur: the list entry for the op we are working on
747 			 * op: the pending_extent_op behind cur
748 			 */
749 			last = i + ret;
750 			i = total - 1;
751 			cur = insert_list->prev;
752 			op = list_entry(cur, struct pending_extent_op, list);
753 		} else if (last) {
754 			/*
755 			 * ok we successfully inserted the last item on the
756 			 * list, lets reset everything
757 			 *
758 			 * i: our current key to insert, so where we left off
759 			 *    last time
760 			 * last: done with this
761 			 * cur: the op we are messing with
762 			 * op: the pending_extent_op behind cur
763 			 * total: since we inserted the last key, we need to
764 			 *        decrement total so we dont overflow
765 			 *        decrement total so we don't overflow
766 			i = last;
767 			last = 0;
768 			total--;
769 			if (i < total) {
770 				cur = insert_list->next;
771 				op = list_entry(cur, struct pending_extent_op,
772 						list);
773 			}
774 		} else {
775 			i += ret;
776 		}
777 
778 		cond_resched();
779 	}
780 	ret = 0;
781 	kfree(keys);
782 	kfree(data_size);
783 	return ret;
784 }
785 
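/*
 * insert a backref for the extent at bytenr, keyed on the parent block.
 * If a backref from the same root and generation already exists, just
 * bump its reference count; if the owners differ, the ref objectid
 * collapses to BTRFS_MULTIPLE_OBJECTIDS.
 */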
786 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
787 					  struct btrfs_root *root,
788 					  struct btrfs_path *path,
789 					  u64 bytenr, u64 parent,
790 					  u64 ref_root, u64 ref_generation,
791 					  u64 owner_objectid)
792 {
793 	struct btrfs_key key;
794 	struct extent_buffer *leaf;
795 	struct btrfs_extent_ref *ref;
796 	u32 num_refs;
797 	int ret;
798 
799 	key.objectid = bytenr;
800 	key.type = BTRFS_EXTENT_REF_KEY;
801 	key.offset = parent;
802 
803 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
804 	if (ret == 0) {
805 		leaf = path->nodes[0];
806 		ref = btrfs_item_ptr(leaf, path->slots[0],
807 				     struct btrfs_extent_ref);
808 		btrfs_set_ref_root(leaf, ref, ref_root);
809 		btrfs_set_ref_generation(leaf, ref, ref_generation);
810 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
811 		btrfs_set_ref_num_refs(leaf, ref, 1);
812 	} else if (ret == -EEXIST) {
813 		u64 existing_owner;
814 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
815 		leaf = path->nodes[0];
816 		ref = btrfs_item_ptr(leaf, path->slots[0],
817 				     struct btrfs_extent_ref);
818 		if (btrfs_ref_root(leaf, ref) != ref_root ||
819 		    btrfs_ref_generation(leaf, ref) != ref_generation) {
820 			ret = -EIO;
821 			WARN_ON(1);
822 			goto out;
823 		}
824 
825 		num_refs = btrfs_ref_num_refs(leaf, ref);
826 		BUG_ON(num_refs == 0);
827 		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
828 
829 		existing_owner = btrfs_ref_objectid(leaf, ref);
830 		if (existing_owner != owner_objectid &&
831 		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
832 			btrfs_set_ref_objectid(leaf, ref,
833 					BTRFS_MULTIPLE_OBJECTIDS);
834 		}
835 		ret = 0;
836 	} else {
837 		goto out;
838 	}
839 	btrfs_mark_buffer_dirty(path->nodes[0]);
840 out:
841 	btrfs_release_path(root, path);
842 	return ret;
843 }
844 
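/*
 * drop one reference from the backref the path currently points to,
 * deleting the item entirely once its ref count hits zero.
 */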
845 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
846 					  struct btrfs_root *root,
847 					  struct btrfs_path *path)
848 {
849 	struct extent_buffer *leaf;
850 	struct btrfs_extent_ref *ref;
851 	u32 num_refs;
852 	int ret = 0;
853 
854 	leaf = path->nodes[0];
855 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
856 	num_refs = btrfs_ref_num_refs(leaf, ref);
857 	BUG_ON(num_refs == 0);
858 	num_refs -= 1;
859 	if (num_refs == 0) {
860 		ret = btrfs_del_item(trans, root, path);
861 	} else {
862 		btrfs_set_ref_num_refs(leaf, ref, num_refs);
863 		btrfs_mark_buffer_dirty(leaf);
864 	}
865 	btrfs_release_path(root, path);
866 	return ret;
867 }
868 
869 #ifdef BIO_RW_DISCARD
870 static void btrfs_issue_discard(struct block_device *bdev,
871 				u64 start, u64 len)
872 {
873 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
874 }
875 #endif
876 
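/*
 * map the logical range [bytenr, bytenr + num_bytes) to its physical
 * stripes and issue a discard for each one.  This is a no-op on kernels
 * built without BIO_RW_DISCARD.
 */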
877 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
878 				u64 num_bytes)
879 {
880 #ifdef BIO_RW_DISCARD
881 	int ret;
882 	u64 map_length = num_bytes;
883 	struct btrfs_multi_bio *multi = NULL;
884 
885 	/* Tell the block device(s) that the sectors can be discarded */
886 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
887 			      bytenr, &map_length, &multi, 0);
888 	if (!ret) {
889 		struct btrfs_bio_stripe *stripe = multi->stripes;
890 		int i;
891 
892 		if (map_length > num_bytes)
893 			map_length = num_bytes;
894 
895 		for (i = 0; i < multi->num_stripes; i++, stripe++) {
896 			btrfs_issue_discard(stripe->dev->bdev,
897 					    stripe->physical,
898 					    map_length);
899 		}
900 		kfree(multi);
901 	}
902 
903 	return ret;
904 #else
905 	return 0;
906 #endif
907 }
908 
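/*
 * process a list of pending extent deletions.  For each op we look up
 * the backref, drop the extent's reference count, and once it hits zero
 * pin the bytes and delete the extent item.  Runs of adjacent
 * extent/backref pairs are batched into a single btrfs_del_items call.
 */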
909 static noinline int free_extents(struct btrfs_trans_handle *trans,
910 				 struct btrfs_root *extent_root,
911 				 struct list_head *del_list)
912 {
913 	struct btrfs_fs_info *info = extent_root->fs_info;
914 	struct btrfs_path *path;
915 	struct btrfs_key key, found_key;
916 	struct extent_buffer *leaf;
917 	struct list_head *cur;
918 	struct pending_extent_op *op;
919 	struct btrfs_extent_item *ei;
920 	int ret, num_to_del, extent_slot = 0, found_extent = 0;
921 	u32 refs;
922 	u64 bytes_freed = 0;
923 
924 	path = btrfs_alloc_path();
925 	if (!path)
926 		return -ENOMEM;
927 	path->reada = 1;
928 
929 search:
930 	/* search for the backref for the current ref we want to delete */
931 	cur = del_list->next;
932 	op = list_entry(cur, struct pending_extent_op, list);
933 	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
934 				    op->orig_parent,
935 				    extent_root->root_key.objectid,
936 				    op->orig_generation, op->level, 1);
937 	if (ret) {
938 		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
939 		       "root %llu gen %llu owner %u\n",
940 		       (unsigned long long)op->bytenr,
941 		       (unsigned long long)extent_root->root_key.objectid,
942 		       (unsigned long long)op->orig_generation, op->level);
943 		btrfs_print_leaf(extent_root, path->nodes[0]);
944 		WARN_ON(1);
945 		goto out;
946 	}
947 
948 	extent_slot = path->slots[0];
949 	num_to_del = 1;
950 	found_extent = 0;
951 
952 	/*
953 	 * if we aren't the first item on the leaf we can move back one and see
954 	 * if our ref is right next to our extent item
955 	 */
956 	if (likely(extent_slot)) {
957 		extent_slot--;
958 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
959 				      extent_slot);
960 		if (found_key.objectid == op->bytenr &&
961 		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
962 		    found_key.offset == op->num_bytes) {
963 			num_to_del++;
964 			found_extent = 1;
965 		}
966 	}
967 
968 	/*
969 	 * if we didn't find the extent we need to delete the backref and then
970 	 * search for the extent item key so we can update its ref count
971 	 */
972 	if (!found_extent) {
973 		key.objectid = op->bytenr;
974 		key.type = BTRFS_EXTENT_ITEM_KEY;
975 		key.offset = op->num_bytes;
976 
977 		ret = remove_extent_backref(trans, extent_root, path);
978 		BUG_ON(ret);
979 		btrfs_release_path(extent_root, path);
980 		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
981 		BUG_ON(ret);
982 		extent_slot = path->slots[0];
983 	}
984 
985 	/* this is where we update the ref count for the extent */
986 	leaf = path->nodes[0];
987 	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
988 	refs = btrfs_extent_refs(leaf, ei);
989 	BUG_ON(refs == 0);
990 	refs--;
991 	btrfs_set_extent_refs(leaf, ei, refs);
992 
993 	btrfs_mark_buffer_dirty(leaf);
994 
995 	/*
996 	 * This extent needs deleting.  The reason cur_slot is extent_slot +
997 	 * num_to_del is because extent_slot points to the slot where the extent
998 	 * is, and if the backref was not right next to the extent we will be
999 	 * deleting at least 1 item, and will want to start searching at the
1000 	 * slot directly next to extent_slot.  However if we did find the
1001 	 * backref next to the extent item then we will be deleting at least 2
1002 	 * items and will want to start searching directly after the ref slot
1003 	 */
1004 	if (!refs) {
1005 		struct list_head *pos, *n, *end;
1006 		int cur_slot = extent_slot+num_to_del;
1007 		u64 super_used;
1008 		u64 root_used;
1009 
1010 		path->slots[0] = extent_slot;
1011 		bytes_freed = op->num_bytes;
1012 
1013 		mutex_lock(&info->pinned_mutex);
1014 		ret = pin_down_bytes(trans, extent_root, op->bytenr,
1015 				     op->num_bytes, op->level >=
1016 				     BTRFS_FIRST_FREE_OBJECTID);
1017 		mutex_unlock(&info->pinned_mutex);
1018 		BUG_ON(ret < 0);
1019 		op->del = ret;
1020 
1021 		/*
1022 		 * we need to see if we can delete multiple things at once, so
1023 		 * start looping through the list of extents we are wanting to
1024 		 * delete and see if their extent/backrefs are right next to
1025 		 * each other and the extents only have 1 ref
1026 		 */
1027 		for (pos = cur->next; pos != del_list; pos = pos->next) {
1028 			struct pending_extent_op *tmp;
1029 
1030 			tmp = list_entry(pos, struct pending_extent_op, list);
1031 
1032 			/* we only want to delete extent+ref at this stage */
1033 			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1034 				break;
1035 
1036 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1037 			if (found_key.objectid != tmp->bytenr ||
1038 			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1039 			    found_key.offset != tmp->num_bytes)
1040 				break;
1041 
1042 			/* check to make sure this extent only has one ref */
1043 			ei = btrfs_item_ptr(leaf, cur_slot,
1044 					    struct btrfs_extent_item);
1045 			if (btrfs_extent_refs(leaf, ei) != 1)
1046 				break;
1047 
1048 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1049 			if (found_key.objectid != tmp->bytenr ||
1050 			    found_key.type != BTRFS_EXTENT_REF_KEY ||
1051 			    found_key.offset != tmp->orig_parent)
1052 				break;
1053 
1054 			/*
1055 			 * the ref is right next to the extent, we can set the
1056 			 * ref count to 0 since we will delete them both now
1057 			 */
1058 			btrfs_set_extent_refs(leaf, ei, 0);
1059 
1060 			/* pin down the bytes for this extent */
1061 			mutex_lock(&info->pinned_mutex);
1062 			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1063 					     tmp->num_bytes, tmp->level >=
1064 					     BTRFS_FIRST_FREE_OBJECTID);
1065 			mutex_unlock(&info->pinned_mutex);
1066 			BUG_ON(ret < 0);
1067 
1068 			/*
1069 			 * use the del field to tell if we need to go ahead and
1070 			 * free up the extent when we delete the item or not.
1071 			 */
1072 			tmp->del = ret;
1073 			bytes_freed += tmp->num_bytes;
1074 
1075 			num_to_del += 2;
1076 			cur_slot += 2;
1077 		}
1078 		end = pos;
1079 
1080 		/* update the free space counters */
1081 		spin_lock(&info->delalloc_lock);
1082 		super_used = btrfs_super_bytes_used(&info->super_copy);
1083 		btrfs_set_super_bytes_used(&info->super_copy,
1084 					   super_used - bytes_freed);
1085 
1086 		root_used = btrfs_root_used(&extent_root->root_item);
1087 		btrfs_set_root_used(&extent_root->root_item,
1088 				    root_used - bytes_freed);
1089 		spin_unlock(&info->delalloc_lock);
1090 
1091 		/* delete the items */
1092 		ret = btrfs_del_items(trans, extent_root, path,
1093 				      path->slots[0], num_to_del);
1094 		BUG_ON(ret);
1095 
1096 		/*
1097 		 * loop through the extents we deleted and do the cleanup work
1098 		 * on them
1099 		 */
1100 		for (pos = cur, n = pos->next; pos != end;
1101 		     pos = n, n = pos->next) {
1102 			struct pending_extent_op *tmp;
1103 			tmp = list_entry(pos, struct pending_extent_op, list);
1104 
1105 			/*
1106 			 * remember tmp->del tells us whether or not we pinned
1107 			 * down the extent
1108 			 */
1109 			ret = update_block_group(trans, extent_root,
1110 						 tmp->bytenr, tmp->num_bytes, 0,
1111 						 tmp->del);
1112 			BUG_ON(ret);
1113 
1114 			list_del_init(&tmp->list);
1115 			unlock_extent(&info->extent_ins, tmp->bytenr,
1116 				      tmp->bytenr + tmp->num_bytes - 1,
1117 				      GFP_NOFS);
1118 			kfree(tmp);
1119 		}
1120 	} else if (refs && found_extent) {
1121 		/*
1122 		 * the ref and extent were right next to eachother, but the
1123 		 * the ref and extent were right next to each other, but the
1124 		 * going
1125 		 */
1126 		ret = remove_extent_backref(trans, extent_root, path);
1127 		BUG_ON(ret);
1128 
1129 		list_del_init(&op->list);
1130 		unlock_extent(&info->extent_ins, op->bytenr,
1131 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1132 		kfree(op);
1133 	} else {
1134 		/*
1135 		 * the extent has multiple refs and the backref we were looking
1136 		 * for was not right next to it, so just unlock and go next,
1137 		 * we're good to go
1138 		 */
1139 		list_del_init(&op->list);
1140 		unlock_extent(&info->extent_ins, op->bytenr,
1141 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1142 		kfree(op);
1143 	}
1144 
1145 	btrfs_release_path(extent_root, path);
1146 	if (!list_empty(del_list))
1147 		goto search;
1148 
1149 out:
1150 	btrfs_free_path(path);
1151 	return ret;
1152 }
1153 
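/*
 * move a backref from orig_parent/orig_generation to a new parent and
 * generation.  For the extent root itself the change is deferred as a
 * PENDING_BACKREF_UPDATE op to avoid recursing into the extent tree;
 * for all other roots the old backref is removed and a new one is
 * inserted immediately.
 */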
1154 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1155 				     struct btrfs_root *root, u64 bytenr,
1156 				     u64 orig_parent, u64 parent,
1157 				     u64 orig_root, u64 ref_root,
1158 				     u64 orig_generation, u64 ref_generation,
1159 				     u64 owner_objectid)
1160 {
1161 	int ret;
1162 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1163 	struct btrfs_path *path;
1164 
1165 	if (root == root->fs_info->extent_root) {
1166 		struct pending_extent_op *extent_op;
1167 		u64 num_bytes;
1168 
1169 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1170 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
1171 		mutex_lock(&root->fs_info->extent_ins_mutex);
1172 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1173 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1174 			u64 priv;
1175 			ret = get_state_private(&root->fs_info->extent_ins,
1176 						bytenr, &priv);
1177 			BUG_ON(ret);
1178 			extent_op = (struct pending_extent_op *)
1179 							(unsigned long)priv;
1180 			BUG_ON(extent_op->parent != orig_parent);
1181 			BUG_ON(extent_op->generation != orig_generation);
1182 
1183 			extent_op->parent = parent;
1184 			extent_op->generation = ref_generation;
1185 		} else {
1186 			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1187 			BUG_ON(!extent_op);
1188 
1189 			extent_op->type = PENDING_BACKREF_UPDATE;
1190 			extent_op->bytenr = bytenr;
1191 			extent_op->num_bytes = num_bytes;
1192 			extent_op->parent = parent;
1193 			extent_op->orig_parent = orig_parent;
1194 			extent_op->generation = ref_generation;
1195 			extent_op->orig_generation = orig_generation;
1196 			extent_op->level = (int)owner_objectid;
1197 			INIT_LIST_HEAD(&extent_op->list);
1198 			extent_op->del = 0;
1199 
1200 			set_extent_bits(&root->fs_info->extent_ins,
1201 					bytenr, bytenr + num_bytes - 1,
1202 					EXTENT_WRITEBACK, GFP_NOFS);
1203 			set_state_private(&root->fs_info->extent_ins,
1204 					  bytenr, (unsigned long)extent_op);
1205 		}
1206 		mutex_unlock(&root->fs_info->extent_ins_mutex);
1207 		return 0;
1208 	}
1209 
1210 	path = btrfs_alloc_path();
1211 	if (!path)
1212 		return -ENOMEM;
1213 	ret = lookup_extent_backref(trans, extent_root, path,
1214 				    bytenr, orig_parent, orig_root,
1215 				    orig_generation, owner_objectid, 1);
1216 	if (ret)
1217 		goto out;
1218 	ret = remove_extent_backref(trans, extent_root, path);
1219 	if (ret)
1220 		goto out;
1221 	ret = insert_extent_backref(trans, extent_root, path, bytenr,
1222 				    parent, ref_root, ref_generation,
1223 				    owner_objectid);
1224 	BUG_ON(ret);
1225 	finish_current_insert(trans, extent_root, 0);
1226 	del_pending_extents(trans, extent_root, 0);
1227 out:
1228 	btrfs_free_path(path);
1229 	return ret;
1230 }
1231 
1232 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1233 			    struct btrfs_root *root, u64 bytenr,
1234 			    u64 orig_parent, u64 parent,
1235 			    u64 ref_root, u64 ref_generation,
1236 			    u64 owner_objectid)
1237 {
1238 	int ret;
1239 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1240 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1241 		return 0;
1242 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1243 					parent, ref_root, ref_root,
1244 					ref_generation, ref_generation,
1245 					owner_objectid);
1246 	return ret;
1247 }
1248 
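/*
 * bump the reference count in the extent item for bytenr and insert a
 * backref for the given parent/root/generation/owner.
 */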
1249 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1250 				  struct btrfs_root *root, u64 bytenr,
1251 				  u64 orig_parent, u64 parent,
1252 				  u64 orig_root, u64 ref_root,
1253 				  u64 orig_generation, u64 ref_generation,
1254 				  u64 owner_objectid)
1255 {
1256 	struct btrfs_path *path;
1257 	int ret;
1258 	struct btrfs_key key;
1259 	struct extent_buffer *l;
1260 	struct btrfs_extent_item *item;
1261 	u32 refs;
1262 
1263 	path = btrfs_alloc_path();
1264 	if (!path)
1265 		return -ENOMEM;
1266 
1267 	path->reada = 1;
1268 	key.objectid = bytenr;
1269 	key.type = BTRFS_EXTENT_ITEM_KEY;
1270 	key.offset = (u64)-1;
1271 
1272 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1273 				0, 1);
1274 	if (ret < 0) {
		btrfs_free_path(path);	/* don't leak the path on error */
		return ret;
	}
1276 	BUG_ON(ret == 0 || path->slots[0] == 0);
1277 
1278 	path->slots[0]--;
1279 	l = path->nodes[0];
1280 
1281 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1282 	if (key.objectid != bytenr) {
1283 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1284 		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1285 		       (unsigned long long)bytenr,
1286 		       (unsigned long long)key.objectid);
1287 		BUG();
1288 	}
1289 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1290 
1291 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1292 	refs = btrfs_extent_refs(l, item);
1293 	btrfs_set_extent_refs(l, item, refs + 1);
1294 	btrfs_mark_buffer_dirty(path->nodes[0]);
1295 
1296 	btrfs_release_path(root->fs_info->extent_root, path);
1297 
1298 	path->reada = 1;
1299 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1300 				    path, bytenr, parent,
1301 				    ref_root, ref_generation,
1302 				    owner_objectid);
1303 	BUG_ON(ret);
1304 	finish_current_insert(trans, root->fs_info->extent_root, 0);
1305 	del_pending_extents(trans, root->fs_info->extent_root, 0);
1306 
1307 	btrfs_free_path(path);
1308 	return 0;
1309 }
1310 
1311 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1312 			 struct btrfs_root *root,
1313 			 u64 bytenr, u64 num_bytes, u64 parent,
1314 			 u64 ref_root, u64 ref_generation,
1315 			 u64 owner_objectid)
1316 {
1317 	int ret;
1318 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1319 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1320 		return 0;
1321 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1322 				     0, ref_root, 0, ref_generation,
1323 				     owner_objectid);
1324 	return ret;
1325 }
1326 
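/*
 * run the deferred extent tree work until both the pending_del and
 * extent_ins trees are empty.  Processing one batch may queue more
 * work, so keep looping until nothing is left marked EXTENT_WRITEBACK.
 */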
1327 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1328 			 struct btrfs_root *root)
1329 {
1330 	u64 start;
1331 	u64 end;
1332 	int ret;
1333 
1334 	while (1) {
1335 		finish_current_insert(trans, root->fs_info->extent_root, 1);
1336 		del_pending_extents(trans, root->fs_info->extent_root, 1);
1337 
1338 		/* is there more work to do? */
1339 		ret = find_first_extent_bit(&root->fs_info->pending_del,
1340 					    0, &start, &end, EXTENT_WRITEBACK);
1341 		if (!ret)
1342 			continue;
1343 		ret = find_first_extent_bit(&root->fs_info->extent_ins,
1344 					    0, &start, &end, EXTENT_WRITEBACK);
1345 		if (!ret)
1346 			continue;
1347 		break;
1348 	}
1349 	return 0;
1350 }
1351 
1352 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1353 			    struct btrfs_root *root, u64 bytenr,
1354 			    u64 num_bytes, u32 *refs)
1355 {
1356 	struct btrfs_path *path;
1357 	int ret;
1358 	struct btrfs_key key;
1359 	struct extent_buffer *l;
1360 	struct btrfs_extent_item *item;
1361 
1362 	WARN_ON(num_bytes < root->sectorsize);
1363 	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1364 	path->reada = 1;
1365 	key.objectid = bytenr;
1366 	key.offset = num_bytes;
1367 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1368 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1369 				0, 0);
1370 	if (ret < 0)
1371 		goto out;
1372 	if (ret != 0) {
1373 		btrfs_print_leaf(root, path->nodes[0]);
1374 		printk(KERN_INFO "btrfs failed to find block number %llu\n",
1375 		       (unsigned long long)bytenr);
1376 		BUG();
1377 	}
1378 	l = path->nodes[0];
1379 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1380 	*refs = btrfs_extent_refs(l, item);
1381 out:
1382 	btrfs_free_path(path);
1383 	return ret;
1384 }
1385 
1386 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1387 			  struct btrfs_root *root, u64 objectid, u64 bytenr)
1388 {
1389 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1390 	struct btrfs_path *path;
1391 	struct extent_buffer *leaf;
1392 	struct btrfs_extent_ref *ref_item;
1393 	struct btrfs_key key;
1394 	struct btrfs_key found_key;
1395 	u64 ref_root;
1396 	u64 last_snapshot;
1397 	u32 nritems;
1398 	int ret;
1399 
1400 	key.objectid = bytenr;
1401 	key.offset = (u64)-1;
1402 	key.type = BTRFS_EXTENT_ITEM_KEY;
1403 
1404 	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1405 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1406 	if (ret < 0)
1407 		goto out;
1408 	BUG_ON(ret == 0);
1409 
1410 	ret = -ENOENT;
1411 	if (path->slots[0] == 0)
1412 		goto out;
1413 
1414 	path->slots[0]--;
1415 	leaf = path->nodes[0];
1416 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 
1418 	if (found_key.objectid != bytenr ||
1419 	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
1420 		goto out;
1421 
1422 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423 	while (1) {
1424 		leaf = path->nodes[0];
1425 		nritems = btrfs_header_nritems(leaf);
1426 		if (path->slots[0] >= nritems) {
1427 			ret = btrfs_next_leaf(extent_root, path);
1428 			if (ret < 0)
1429 				goto out;
1430 			if (ret == 0)
1431 				continue;
1432 			break;
1433 		}
1434 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1435 		if (found_key.objectid != bytenr)
1436 			break;
1437 
1438 		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1439 			path->slots[0]++;
1440 			continue;
1441 		}
1442 
1443 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
1444 					  struct btrfs_extent_ref);
1445 		ref_root = btrfs_ref_root(leaf, ref_item);
1446 		if ((ref_root != root->root_key.objectid &&
1447 		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1448 		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
1449 			ret = 1;
1450 			goto out;
1451 		}
1452 		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1453 			ret = 1;
1454 			goto out;
1455 		}
1456 
1457 		path->slots[0]++;
1458 	}
1459 	ret = 0;
1460 out:
1461 	btrfs_free_path(path);
1462 	return ret;
1463 }
1464 
1465 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1466 		    struct extent_buffer *buf, u32 nr_extents)
1467 {
1468 	struct btrfs_key key;
1469 	struct btrfs_file_extent_item *fi;
1470 	u64 root_gen;
1471 	u32 nritems;
1472 	int i;
1473 	int level;
1474 	int ret = 0;
1475 	int shared = 0;
1476 
1477 	if (!root->ref_cows)
1478 		return 0;
1479 
1480 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1481 		shared = 0;
1482 		root_gen = root->root_key.offset;
1483 	} else {
1484 		shared = 1;
1485 		root_gen = trans->transid - 1;
1486 	}
1487 
1488 	level = btrfs_header_level(buf);
1489 	nritems = btrfs_header_nritems(buf);
1490 
1491 	if (level == 0) {
1492 		struct btrfs_leaf_ref *ref;
1493 		struct btrfs_extent_info *info;
1494 
1495 		ref = btrfs_alloc_leaf_ref(root, nr_extents);
1496 		if (!ref) {
1497 			ret = -ENOMEM;
1498 			goto out;
1499 		}
1500 
1501 		ref->root_gen = root_gen;
1502 		ref->bytenr = buf->start;
1503 		ref->owner = btrfs_header_owner(buf);
1504 		ref->generation = btrfs_header_generation(buf);
1505 		ref->nritems = nr_extents;
1506 		info = ref->extents;
1507 
1508 		for (i = 0; nr_extents > 0 && i < nritems; i++) {
1509 			u64 disk_bytenr;
1510 			btrfs_item_key_to_cpu(buf, &key, i);
1511 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1512 				continue;
1513 			fi = btrfs_item_ptr(buf, i,
1514 					    struct btrfs_file_extent_item);
1515 			if (btrfs_file_extent_type(buf, fi) ==
1516 			    BTRFS_FILE_EXTENT_INLINE)
1517 				continue;
1518 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1519 			if (disk_bytenr == 0)
1520 				continue;
1521 
1522 			info->bytenr = disk_bytenr;
1523 			info->num_bytes =
1524 				btrfs_file_extent_disk_num_bytes(buf, fi);
1525 			info->objectid = key.objectid;
1526 			info->offset = key.offset;
1527 			info++;
1528 		}
1529 
1530 		ret = btrfs_add_leaf_ref(root, ref, shared);
1531 		if (ret == -EEXIST && shared) {
1532 			struct btrfs_leaf_ref *old;
1533 			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1534 			BUG_ON(!old);
1535 			btrfs_remove_leaf_ref(root, old);
1536 			btrfs_free_leaf_ref(root, old);
1537 			ret = btrfs_add_leaf_ref(root, ref, shared);
1538 		}
1539 		WARN_ON(ret);
1540 		btrfs_free_leaf_ref(root, ref);
1541 	}
1542 out:
1543 	return ret;
1544 }
1545 
1546 /* when a block goes through cow, we update the reference counts of
1547  * everything that block points to.  The internal pointers of the block
1548  * can be in just about any order, and it is likely to have clusters of
1549  * things that are close together and clusters of things that are not.
1550  *
1551  * To help reduce the seeks that come with updating all of these reference
1552  * counts, sort them by byte number before actual updates are done.
1553  *
1554  * struct refsort is used to match byte number to slot in the btree block.
1555  * we sort based on the byte number and then use the slot to actually
1556  * find the item.
1557  *
1558  * struct refsort is smaller than struct btrfs_item and smaller than
1559  * struct btrfs_key_ptr.  Since we're currently limited to the page size
1560  * for a btree block, there's no way for a kmalloc of refsorts for a
1561  * single node to be bigger than a page.
1562  */
1563 struct refsort {
1564 	u64 bytenr;
1565 	u32 slot;
1566 };
1567 
1568 /*
1569  * for passing into sort()
1570  */
1571 static int refsort_cmp(const void *a_void, const void *b_void)
1572 {
1573 	const struct refsort *a = a_void;
1574 	const struct refsort *b = b_void;
1575 
1576 	if (a->bytenr < b->bytenr)
1577 		return -1;
1578 	if (a->bytenr > b->bytenr)
1579 		return 1;
1580 	return 0;
1581 }
1582 
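/*
 * Illustrative example: given entries { {30, 0}, {10, 1}, {20, 2} },
 * sort(sorted, 3, sizeof(struct refsort), refsort_cmp, NULL) orders
 * them 10, 20, 30 while each entry remembers its original slot.
 */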
1583 
1584 noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1585 			   struct btrfs_root *root,
1586 			   struct extent_buffer *orig_buf,
1587 			   struct extent_buffer *buf, u32 *nr_extents)
1588 {
1589 	u64 bytenr;
1590 	u64 ref_root;
1591 	u64 orig_root;
1592 	u64 ref_generation;
1593 	u64 orig_generation;
1594 	struct refsort *sorted;
1595 	u32 nritems;
1596 	u32 nr_file_extents = 0;
1597 	struct btrfs_key key;
1598 	struct btrfs_file_extent_item *fi;
1599 	int i;
1600 	int level;
1601 	int ret = 0;
1602 	int faili = 0;
1603 	int refi = 0;
1604 	int slot;
1605 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1606 			    u64, u64, u64, u64, u64, u64, u64, u64);
1607 
1608 	ref_root = btrfs_header_owner(buf);
1609 	ref_generation = btrfs_header_generation(buf);
1610 	orig_root = btrfs_header_owner(orig_buf);
1611 	orig_generation = btrfs_header_generation(orig_buf);
1612 
1613 	nritems = btrfs_header_nritems(buf);
1614 	level = btrfs_header_level(buf);
1615 
1616 	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1617 	BUG_ON(!sorted);
1618 
1619 	if (root->ref_cows) {
1620 		process_func = __btrfs_inc_extent_ref;
1621 	} else {
1622 		if (level == 0 &&
1623 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1624 			goto out;
1625 		if (level != 0 &&
1626 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1627 			goto out;
1628 		process_func = __btrfs_update_extent_ref;
1629 	}
1630 
1631 	/*
1632 	 * we make two passes through the items.  In the first pass we
1633 	 * only record the byte number and slot.  Then we sort based on
1634 	 * byte number and do the actual work based on the sorted results
1635 	 */
1636 	for (i = 0; i < nritems; i++) {
1637 		cond_resched();
1638 		if (level == 0) {
1639 			btrfs_item_key_to_cpu(buf, &key, i);
1640 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1641 				continue;
1642 			fi = btrfs_item_ptr(buf, i,
1643 					    struct btrfs_file_extent_item);
1644 			if (btrfs_file_extent_type(buf, fi) ==
1645 			    BTRFS_FILE_EXTENT_INLINE)
1646 				continue;
1647 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1648 			if (bytenr == 0)
1649 				continue;
1650 
1651 			nr_file_extents++;
1652 			sorted[refi].bytenr = bytenr;
1653 			sorted[refi].slot = i;
1654 			refi++;
1655 		} else {
1656 			bytenr = btrfs_node_blockptr(buf, i);
1657 			sorted[refi].bytenr = bytenr;
1658 			sorted[refi].slot = i;
1659 			refi++;
1660 		}
1661 	}
1662 	/*
1663 	 * if refi == 0, we didn't actually put anything into the sorted
1664 	 * array and we're done
1665 	 */
1666 	if (refi == 0)
1667 		goto out;
1668 
1669 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1670 
1671 	for (i = 0; i < refi; i++) {
1672 		cond_resched();
1673 		slot = sorted[i].slot;
1674 		bytenr = sorted[i].bytenr;
1675 
1676 		if (level == 0) {
1677 			btrfs_item_key_to_cpu(buf, &key, slot);
1678 
1679 			ret = process_func(trans, root, bytenr,
1680 					   orig_buf->start, buf->start,
1681 					   orig_root, ref_root,
1682 					   orig_generation, ref_generation,
1683 					   key.objectid);
1684 
1685 			if (ret) {
1686 				faili = slot;
1687 				WARN_ON(1);
1688 				goto fail;
1689 			}
1690 		} else {
1691 			ret = process_func(trans, root, bytenr,
1692 					   orig_buf->start, buf->start,
1693 					   orig_root, ref_root,
1694 					   orig_generation, ref_generation,
1695 					   level - 1);
1696 			if (ret) {
1697 				faili = slot;
1698 				WARN_ON(1);
1699 				goto fail;
1700 			}
1701 		}
1702 	}
1703 out:
1704 	kfree(sorted);
1705 	if (nr_extents) {
1706 		if (level == 0)
1707 			*nr_extents = nr_file_extents;
1708 		else
1709 			*nr_extents = nritems;
1710 	}
1711 	return 0;
1712 fail:
1713 	kfree(sorted);
1714 	WARN_ON(1);
1715 	return ret;
1716 }
1717 
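/*
 * after items have been copied from orig_buf into buf, repoint the
 * backref for everything in slots [start_slot, start_slot + nr) at buf
 * instead of orig_buf.
 */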
1718 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1719 		     struct btrfs_root *root, struct extent_buffer *orig_buf,
1720 		     struct extent_buffer *buf, int start_slot, int nr)
1721 
1722 {
1723 	u64 bytenr;
1724 	u64 ref_root;
1725 	u64 orig_root;
1726 	u64 ref_generation;
1727 	u64 orig_generation;
1728 	struct btrfs_key key;
1729 	struct btrfs_file_extent_item *fi;
1730 	int i;
1731 	int ret;
1732 	int slot;
1733 	int level;
1734 
1735 	BUG_ON(start_slot < 0);
1736 	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1737 
1738 	ref_root = btrfs_header_owner(buf);
1739 	ref_generation = btrfs_header_generation(buf);
1740 	orig_root = btrfs_header_owner(orig_buf);
1741 	orig_generation = btrfs_header_generation(orig_buf);
1742 	level = btrfs_header_level(buf);
1743 
1744 	if (!root->ref_cows) {
1745 		if (level == 0 &&
1746 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1747 			return 0;
1748 		if (level != 0 &&
1749 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1750 			return 0;
1751 	}
1752 
1753 	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1754 		cond_resched();
1755 		if (level == 0) {
1756 			btrfs_item_key_to_cpu(buf, &key, slot);
1757 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1758 				continue;
1759 			fi = btrfs_item_ptr(buf, slot,
1760 					    struct btrfs_file_extent_item);
1761 			if (btrfs_file_extent_type(buf, fi) ==
1762 			    BTRFS_FILE_EXTENT_INLINE)
1763 				continue;
1764 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1765 			if (bytenr == 0)
1766 				continue;
1767 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1768 					    orig_buf->start, buf->start,
1769 					    orig_root, ref_root,
1770 					    orig_generation, ref_generation,
1771 					    key.objectid);
1772 			if (ret)
1773 				goto fail;
1774 		} else {
1775 			bytenr = btrfs_node_blockptr(buf, slot);
1776 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1777 					    orig_buf->start, buf->start,
1778 					    orig_root, ref_root,
1779 					    orig_generation, ref_generation,
1780 					    level - 1);
1781 			if (ret)
1782 				goto fail;
1783 		}
1784 	}
1785 	return 0;
1786 fail:
1787 	WARN_ON(1);
1788 	return -1;
1789 }
1790 
1791 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1792 				 struct btrfs_root *root,
1793 				 struct btrfs_path *path,
1794 				 struct btrfs_block_group_cache *cache)
1795 {
1796 	int ret;
1797 	int pending_ret;
1798 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1799 	unsigned long bi;
1800 	struct extent_buffer *leaf;
1801 
1802 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1803 	if (ret < 0)
1804 		goto fail;
1805 	BUG_ON(ret);
1806 
1807 	leaf = path->nodes[0];
1808 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1809 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1810 	btrfs_mark_buffer_dirty(leaf);
1811 	btrfs_release_path(extent_root, path);
1812 fail:
1813 	finish_current_insert(trans, extent_root, 0);
1814 	pending_ret = del_pending_extents(trans, extent_root, 0);
1815 	if (ret)
1816 		return ret;
1817 	if (pending_ret)
1818 		return pending_ret;
1819 	return 0;
1820 
1821 }
1822 
1823 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1824 				   struct btrfs_root *root)
1825 {
1826 	struct btrfs_block_group_cache *cache, *entry;
1827 	struct rb_node *n;
1828 	int err = 0;
1829 	int werr = 0;
1830 	struct btrfs_path *path;
1831 	u64 last = 0;
1832 
1833 	path = btrfs_alloc_path();
1834 	if (!path)
1835 		return -ENOMEM;
1836 
1837 	while (1) {
1838 		cache = NULL;
1839 		spin_lock(&root->fs_info->block_group_cache_lock);
1840 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
1841 		     n; n = rb_next(n)) {
1842 			entry = rb_entry(n, struct btrfs_block_group_cache,
1843 					 cache_node);
1844 			if (entry->dirty) {
1845 				cache = entry;
1846 				break;
1847 			}
1848 		}
1849 		spin_unlock(&root->fs_info->block_group_cache_lock);
1850 
1851 		if (!cache)
1852 			break;
1853 
1854 		cache->dirty = 0;
1855 		last += cache->key.offset;
1856 
1857 		err = write_one_cache_group(trans, root,
1858 					    path, cache);
1859 		/*
1860 		 * if we fail to write the cache group, we want
1861 		 * to keep it marked dirty in hopes that a later
1862 		 * write will work
1863 		 */
1864 		if (err) {
1865 			werr = err;
1866 			continue;
1867 		}
1868 	}
1869 	btrfs_free_path(path);
1870 	return werr;
1871 }
1872 
1873 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1874 {
1875 	struct btrfs_block_group_cache *block_group;
1876 	int readonly = 0;
1877 
1878 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1879 	if (!block_group || block_group->ro)
1880 		readonly = 1;
1881 	if (block_group)
1882 		put_block_group(block_group);
1883 	return readonly;
1884 }
1885 
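/*
 * add total_bytes/bytes_used to the space_info matching flags,
 * allocating and initializing a new space_info if none exists yet.
 */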
1886 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1887 			     u64 total_bytes, u64 bytes_used,
1888 			     struct btrfs_space_info **space_info)
1889 {
1890 	struct btrfs_space_info *found;
1891 
1892 	found = __find_space_info(info, flags);
1893 	if (found) {
1894 		spin_lock(&found->lock);
1895 		found->total_bytes += total_bytes;
1896 		found->bytes_used += bytes_used;
1897 		found->full = 0;
1898 		spin_unlock(&found->lock);
1899 		*space_info = found;
1900 		return 0;
1901 	}
1902 	found = kzalloc(sizeof(*found), GFP_NOFS);
1903 	if (!found)
1904 		return -ENOMEM;
1905 
1906 	list_add(&found->list, &info->space_info);
1907 	INIT_LIST_HEAD(&found->block_groups);
1908 	init_rwsem(&found->groups_sem);
1909 	spin_lock_init(&found->lock);
1910 	found->flags = flags;
1911 	found->total_bytes = total_bytes;
1912 	found->bytes_used = bytes_used;
1913 	found->bytes_pinned = 0;
1914 	found->bytes_reserved = 0;
1915 	found->bytes_readonly = 0;
1916 	found->bytes_delalloc = 0;
1917 	found->full = 0;
1918 	found->force_alloc = 0;
1919 	*space_info = found;
1920 	return 0;
1921 }
1922 
1923 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1924 {
1925 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1926 				   BTRFS_BLOCK_GROUP_RAID1 |
1927 				   BTRFS_BLOCK_GROUP_RAID10 |
1928 				   BTRFS_BLOCK_GROUP_DUP);
1929 	if (extra_flags) {
1930 		if (flags & BTRFS_BLOCK_GROUP_DATA)
1931 			fs_info->avail_data_alloc_bits |= extra_flags;
1932 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
1933 			fs_info->avail_metadata_alloc_bits |= extra_flags;
1934 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1935 			fs_info->avail_system_alloc_bits |= extra_flags;
1936 	}
1937 }
1938 
1939 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1940 {
1941 	spin_lock(&cache->space_info->lock);
1942 	spin_lock(&cache->lock);
1943 	if (!cache->ro) {
1944 		cache->space_info->bytes_readonly += cache->key.offset -
1945 					btrfs_block_group_used(&cache->item);
1946 		cache->ro = 1;
1947 	}
1948 	spin_unlock(&cache->lock);
1949 	spin_unlock(&cache->space_info->lock);
1950 }
1951 
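/*
 * strip allocation profile bits that can't be satisfied by the number
 * of writeable devices, then reduce redundant combinations (DUP vs
 * RAID1/RAID10, RAID1 vs RAID10, RAID0 vs any mirrored profile) down
 * to a single profile.
 */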
1952 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1953 {
1954 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
1955 
1956 	if (num_devices == 1)
1957 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1958 	if (num_devices < 4)
1959 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1960 
1961 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1962 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1963 		      BTRFS_BLOCK_GROUP_RAID10))) {
1964 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
1965 	}
1966 
1967 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1968 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1969 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1970 	}
1971 
1972 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1973 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1974 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
1975 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
1976 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1977 	return flags;
1978 }
1979 
1980 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1981 {
1982 	struct btrfs_fs_info *info = root->fs_info;
1983 	u64 alloc_profile;
1984 
1985 	if (data) {
1986 		alloc_profile = info->avail_data_alloc_bits &
1987 			info->data_alloc_profile;
1988 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1989 	} else if (root == root->fs_info->chunk_root) {
1990 		alloc_profile = info->avail_system_alloc_bits &
1991 			info->system_alloc_profile;
1992 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1993 	} else {
1994 		alloc_profile = info->avail_metadata_alloc_bits &
1995 			info->metadata_alloc_profile;
1996 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1997 	}
1998 
1999 	return btrfs_reduce_alloc_profile(root, data);
2000 }
2001 
2002 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2003 {
2004 	u64 alloc_target;
2005 
2006 	alloc_target = btrfs_get_alloc_profile(root, 1);
2007 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2008 						       alloc_target);
2009 }
2010 
2011 /*
2012  * for now this just makes sure we have at least 20% of our metadata space
2013  * free for use (5% once the space info is marked full).

2014  */
2015 int btrfs_check_metadata_free_space(struct btrfs_root *root)
2016 {
2017 	struct btrfs_fs_info *info = root->fs_info;
2018 	struct btrfs_space_info *meta_sinfo;
2019 	u64 alloc_target, thresh;
2020 	int committed = 0, ret;
2021 
2022 	/* get the space info for where the metadata will live */
2023 	alloc_target = btrfs_get_alloc_profile(root, 0);
2024 	meta_sinfo = __find_space_info(info, alloc_target);
2025 
2026 again:
2027 	spin_lock(&meta_sinfo->lock);
2028 	if (!meta_sinfo->full)
2029 		thresh = meta_sinfo->total_bytes * 80;
2030 	else
2031 		thresh = meta_sinfo->total_bytes * 95;
2032 
2033 	do_div(thresh, 100);
2034 
2035 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2036 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2037 		struct btrfs_trans_handle *trans;
2038 		if (!meta_sinfo->full) {
2039 			meta_sinfo->force_alloc = 1;
2040 			spin_unlock(&meta_sinfo->lock);
2041 
2042 			trans = btrfs_start_transaction(root, 1);
2043 			if (!trans)
2044 				return -ENOMEM;
2045 
2046 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2047 					     2 * 1024 * 1024, alloc_target, 0);
2048 			btrfs_end_transaction(trans, root);
2049 			goto again;
2050 		}
2051 		spin_unlock(&meta_sinfo->lock);
2052 
2053 		if (!committed) {
2054 			committed = 1;
2055 			trans = btrfs_join_transaction(root, 1);
2056 			if (!trans)
2057 				return -ENOMEM;
2058 			ret = btrfs_commit_transaction(trans, root);
2059 			if (ret)
2060 				return ret;
2061 			goto again;
2062 		}
2063 		return -ENOSPC;
2064 	}
2065 	spin_unlock(&meta_sinfo->lock);
2066 
2067 	return 0;
2068 }
2069 
2070 /*
2071  * This will check the space that the inode allocates from to make sure we
2072  * have enough room for the requested number of bytes.
2073  */
2074 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2075 				u64 bytes)
2076 {
2077 	struct btrfs_space_info *data_sinfo;
2078 	int ret = 0, committed = 0;
2079 
2080 	/* make sure bytes are sectorsize aligned */
2081 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2082 
2083 	data_sinfo = BTRFS_I(inode)->space_info;
2084 again:
2085 	/* make sure we have enough space to handle the data first */
2086 	spin_lock(&data_sinfo->lock);
2087 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2088 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2089 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2090 	    data_sinfo->bytes_may_use < bytes) {
2091 		struct btrfs_trans_handle *trans;
2092 
2093 		/*
2094 		 * if we don't have enough free bytes in this space then we need
2095 		 * to alloc a new chunk.
2096 		 */
2097 		if (!data_sinfo->full) {
2098 			u64 alloc_target;
2099 
2100 			data_sinfo->force_alloc = 1;
2101 			spin_unlock(&data_sinfo->lock);
2102 
2103 			alloc_target = btrfs_get_alloc_profile(root, 1);
2104 			trans = btrfs_start_transaction(root, 1);
2105 			if (!trans)
2106 				return -ENOMEM;
2107 
2108 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2109 					     bytes + 2 * 1024 * 1024,
2110 					     alloc_target, 0);
2111 			btrfs_end_transaction(trans, root);
2112 			if (ret)
2113 				return ret;
2114 			goto again;
2115 		}
2116 		spin_unlock(&data_sinfo->lock);
2117 
2118 		/* commit the current transaction and try again */
2119 		if (!committed) {
2120 			committed = 1;
2121 			trans = btrfs_join_transaction(root, 1);
2122 			if (!trans)
2123 				return -ENOMEM;
2124 			ret = btrfs_commit_transaction(trans, root);
2125 			if (ret)
2126 				return ret;
2127 			goto again;
2128 		}
2129 
2130 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2131 		       ", %llu bytes_used, %llu bytes_reserved, "
2132		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
2133 		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2134 		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2135 		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2136 		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2137 		return -ENOSPC;
2138 	}
2139 	data_sinfo->bytes_may_use += bytes;
2140 	BTRFS_I(inode)->reserved_bytes += bytes;
2141 	spin_unlock(&data_sinfo->lock);
2142 
2143 	return btrfs_check_metadata_free_space(root);
2144 }
2145 
2146 /*
2147  * if there was an error for whatever reason after calling
2148  * btrfs_check_data_free_space, call this so we can cleanup the counters.
2149  */
2150 void btrfs_free_reserved_data_space(struct btrfs_root *root,
2151 				    struct inode *inode, u64 bytes)
2152 {
2153 	struct btrfs_space_info *data_sinfo;
2154 
2155 	/* make sure bytes are sectorsize aligned */
2156 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2157 
2158 	data_sinfo = BTRFS_I(inode)->space_info;
2159 	spin_lock(&data_sinfo->lock);
2160 	data_sinfo->bytes_may_use -= bytes;
2161 	BTRFS_I(inode)->reserved_bytes -= bytes;
2162 	spin_unlock(&data_sinfo->lock);
2163 }
2164 
2165 /* called when we are adding a delalloc extent to the inode's io_tree */
2166 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2167 				  u64 bytes)
2168 {
2169 	struct btrfs_space_info *data_sinfo;
2170 
2171 	/* get the space info for where this inode will be storing its data */
2172 	data_sinfo = BTRFS_I(inode)->space_info;
2173 
2174 	/* make sure we have enough space to handle the data first */
2175 	spin_lock(&data_sinfo->lock);
2176 	data_sinfo->bytes_delalloc += bytes;
2177 
2178 	/*
2179 	 * we are adding a delalloc extent without calling
2180 	 * btrfs_check_data_free_space first.  This happens on a weird
2181 	 * writepage condition, but shouldn't hurt our accounting
2182 	 */
2183 	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2184 		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2185 		BTRFS_I(inode)->reserved_bytes = 0;
2186 	} else {
2187 		data_sinfo->bytes_may_use -= bytes;
2188 		BTRFS_I(inode)->reserved_bytes -= bytes;
2189 	}
2190 
2191 	spin_unlock(&data_sinfo->lock);
2192 }
2193 
2194 /* called when we are clearing a delalloc extent from the inode's io_tree */
2195 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2196 			      u64 bytes)
2197 {
2198 	struct btrfs_space_info *info;
2199 
2200 	info = BTRFS_I(inode)->space_info;
2201 
2202 	spin_lock(&info->lock);
2203 	info->bytes_delalloc -= bytes;
2204 	spin_unlock(&info->lock);
2205 }
2206 
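/*
 * allocate a new chunk with the given profile.  Unless the caller
 * forces it (or force_alloc was set on the space_info), nothing
 * happens until usage plus the requested bytes crosses
 * div_factor(total - readonly, 6), roughly 60% of the writeable space.
 */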
2207 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2208 			  struct btrfs_root *extent_root, u64 alloc_bytes,
2209 			  u64 flags, int force)
2210 {
2211 	struct btrfs_space_info *space_info;
2212 	u64 thresh;
2213 	int ret = 0;
2214 
2215 	mutex_lock(&extent_root->fs_info->chunk_mutex);
2216 
2217 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
2218 
2219 	space_info = __find_space_info(extent_root->fs_info, flags);
2220 	if (!space_info) {
2221 		ret = update_space_info(extent_root->fs_info, flags,
2222 					0, 0, &space_info);
2223 		BUG_ON(ret);
2224 	}
2225 	BUG_ON(!space_info);
2226 
2227 	spin_lock(&space_info->lock);
2228 	if (space_info->force_alloc) {
2229 		force = 1;
2230 		space_info->force_alloc = 0;
2231 	}
2232 	if (space_info->full) {
2233 		spin_unlock(&space_info->lock);
2234 		goto out;
2235 	}
2236 
2237 	thresh = space_info->total_bytes - space_info->bytes_readonly;
2238 	thresh = div_factor(thresh, 6);
2239 	if (!force &&
2240 	   (space_info->bytes_used + space_info->bytes_pinned +
2241 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
2242 		spin_unlock(&space_info->lock);
2243 		goto out;
2244 	}
2245 	spin_unlock(&space_info->lock);
2246 
2247 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
2248 	if (ret)
2249 		space_info->full = 1;
2250 out:
2251 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
2252 	return ret;
2253 }
2254 
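/*
 * walk every block group covering [bytenr, bytenr + num_bytes) and
 * adjust its used counter (and the space_info's) up or down.  When
 * freeing with mark_free set, the range is also discarded and handed
 * back to the block group's free space cache.
 */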
2255 static int update_block_group(struct btrfs_trans_handle *trans,
2256 			      struct btrfs_root *root,
2257 			      u64 bytenr, u64 num_bytes, int alloc,
2258 			      int mark_free)
2259 {
2260 	struct btrfs_block_group_cache *cache;
2261 	struct btrfs_fs_info *info = root->fs_info;
2262 	u64 total = num_bytes;
2263 	u64 old_val;
2264 	u64 byte_in_group;
2265 
2266 	while (total) {
2267 		cache = btrfs_lookup_block_group(info, bytenr);
2268 		if (!cache)
2269 			return -1;
2270 		byte_in_group = bytenr - cache->key.objectid;
2271 		WARN_ON(byte_in_group > cache->key.offset);
2272 
2273 		spin_lock(&cache->space_info->lock);
2274 		spin_lock(&cache->lock);
2275 		cache->dirty = 1;
2276 		old_val = btrfs_block_group_used(&cache->item);
2277 		num_bytes = min(total, cache->key.offset - byte_in_group);
2278 		if (alloc) {
2279 			old_val += num_bytes;
2280 			cache->space_info->bytes_used += num_bytes;
2281 			if (cache->ro)
2282 				cache->space_info->bytes_readonly -= num_bytes;
2283 			btrfs_set_block_group_used(&cache->item, old_val);
2284 			spin_unlock(&cache->lock);
2285 			spin_unlock(&cache->space_info->lock);
2286 		} else {
2287 			old_val -= num_bytes;
2288 			cache->space_info->bytes_used -= num_bytes;
2289 			if (cache->ro)
2290 				cache->space_info->bytes_readonly += num_bytes;
2291 			btrfs_set_block_group_used(&cache->item, old_val);
2292 			spin_unlock(&cache->lock);
2293 			spin_unlock(&cache->space_info->lock);
2294 			if (mark_free) {
2295 				int ret;
2296 
2297 				ret = btrfs_discard_extent(root, bytenr,
2298 							   num_bytes);
2299 				WARN_ON(ret);
2300 
2301 				ret = btrfs_add_free_space(cache, bytenr,
2302 							   num_bytes);
2303 				WARN_ON(ret);
2304 			}
2305 		}
2306 		put_block_group(cache);
2307 		total -= num_bytes;
2308 		bytenr += num_bytes;
2309 	}
2310 	return 0;
2311 }
2312 
2313 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2314 {
2315 	struct btrfs_block_group_cache *cache;
2316 	u64 bytenr;
2317 
2318 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2319 	if (!cache)
2320 		return 0;
2321 
2322 	bytenr = cache->key.objectid;
2323 	put_block_group(cache);
2324 
2325 	return bytenr;
2326 }
2327 
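/*
 * pin or unpin a range of bytes: update the pinned_extents io tree
 * first, then the counters of every block group the range touches.
 * Space unpinned from a cached block group goes back into its free
 * space cache.  The caller must hold pinned_mutex.
 */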
2328 int btrfs_update_pinned_extents(struct btrfs_root *root,
2329 				u64 bytenr, u64 num, int pin)
2330 {
2331 	u64 len;
2332 	struct btrfs_block_group_cache *cache;
2333 	struct btrfs_fs_info *fs_info = root->fs_info;
2334 
2335 	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2336 	if (pin) {
2337 		set_extent_dirty(&fs_info->pinned_extents,
2338 				bytenr, bytenr + num - 1, GFP_NOFS);
2339 	} else {
2340 		clear_extent_dirty(&fs_info->pinned_extents,
2341 				bytenr, bytenr + num - 1, GFP_NOFS);
2342 	}
2343 	while (num > 0) {
2344 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2345 		BUG_ON(!cache);
2346 		len = min(num, cache->key.offset -
2347 			  (bytenr - cache->key.objectid));
2348 		if (pin) {
2349 			spin_lock(&cache->space_info->lock);
2350 			spin_lock(&cache->lock);
2351 			cache->pinned += len;
2352 			cache->space_info->bytes_pinned += len;
2353 			spin_unlock(&cache->lock);
2354 			spin_unlock(&cache->space_info->lock);
2355 			fs_info->total_pinned += len;
2356 		} else {
2357 			spin_lock(&cache->space_info->lock);
2358 			spin_lock(&cache->lock);
2359 			cache->pinned -= len;
2360 			cache->space_info->bytes_pinned -= len;
2361 			spin_unlock(&cache->lock);
2362 			spin_unlock(&cache->space_info->lock);
2363 			fs_info->total_pinned -= len;
2364 			if (cache->cached)
2365 				btrfs_add_free_space(cache, bytenr, len);
2366 		}
2367 		put_block_group(cache);
2368 		bytenr += len;
2369 		num -= len;
2370 	}
2371 	return 0;
2372 }
2373 
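/*
 * adjust the reserved byte counts of every block group (and its
 * space_info) covering [bytenr, bytenr + num).
 */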
2374 static int update_reserved_extents(struct btrfs_root *root,
2375 				   u64 bytenr, u64 num, int reserve)
2376 {
2377 	u64 len;
2378 	struct btrfs_block_group_cache *cache;
2379 	struct btrfs_fs_info *fs_info = root->fs_info;
2380 
2381 	while (num > 0) {
2382 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2383 		BUG_ON(!cache);
2384 		len = min(num, cache->key.offset -
2385 			  (bytenr - cache->key.objectid));
2386 
2387 		spin_lock(&cache->space_info->lock);
2388 		spin_lock(&cache->lock);
2389 		if (reserve) {
2390 			cache->reserved += len;
2391 			cache->space_info->bytes_reserved += len;
2392 		} else {
2393 			cache->reserved -= len;
2394 			cache->space_info->bytes_reserved -= len;
2395 		}
2396 		spin_unlock(&cache->lock);
2397 		spin_unlock(&cache->space_info->lock);
2398 		put_block_group(cache);
2399 		bytenr += len;
2400 		num -= len;
2401 	}
2402 	return 0;
2403 }
2404 
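/*
 * take a snapshot of everything currently pinned into the caller's io
 * tree, so the transaction commit code can later unpin exactly the
 * ranges that were pinned when the commit started.
 */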
2405 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2406 {
2407 	u64 last = 0;
2408 	u64 start;
2409 	u64 end;
2410 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2411 	int ret;
2412 
2413 	mutex_lock(&root->fs_info->pinned_mutex);
2414 	while (1) {
2415 		ret = find_first_extent_bit(pinned_extents, last,
2416 					    &start, &end, EXTENT_DIRTY);
2417 		if (ret)
2418 			break;
2419 		set_extent_dirty(copy, start, end, GFP_NOFS);
2420 		last = end + 1;
2421 	}
2422 	mutex_unlock(&root->fs_info->pinned_mutex);
2423 	return 0;
2424 }
2425 
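/*
 * discard and unpin every range marked dirty in the unpin tree,
 * returning the space to the free space caches; runs as part of
 * finishing a transaction commit.
 */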
2426 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2427 			       struct btrfs_root *root,
2428 			       struct extent_io_tree *unpin)
2429 {
2430 	u64 start;
2431 	u64 end;
2432 	int ret;
2433 
2434 	mutex_lock(&root->fs_info->pinned_mutex);
2435 	while (1) {
2436 		ret = find_first_extent_bit(unpin, 0, &start, &end,
2437 					    EXTENT_DIRTY);
2438 		if (ret)
2439 			break;
2440 
2441 		ret = btrfs_discard_extent(root, start, end + 1 - start);
2442 
2443 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2444 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
2445 
2446 		if (need_resched()) {
2447 			mutex_unlock(&root->fs_info->pinned_mutex);
2448 			cond_resched();
2449 			mutex_lock(&root->fs_info->pinned_mutex);
2450 		}
2451 	}
2452 	mutex_unlock(&root->fs_info->pinned_mutex);
2453 	return ret;
2454 }
2455 
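/*
 * process everything in the extent_ins io tree that is marked
 * EXTENT_WRITEBACK: backref updates are collected and applied,
 * pending inserts are batched (at most a leaf's worth at a time)
 * and written out, and any op flagged ->del is pinned and has its
 * accounting rolled back instead of being inserted.  When all is set
 * we keep looping until nothing was skipped.
 */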
2456 static int finish_current_insert(struct btrfs_trans_handle *trans,
2457 				 struct btrfs_root *extent_root, int all)
2458 {
2459 	u64 start;
2460 	u64 end;
2461 	u64 priv;
2462 	u64 search = 0;
2463 	struct btrfs_fs_info *info = extent_root->fs_info;
2464 	struct btrfs_path *path;
2465 	struct pending_extent_op *extent_op, *tmp;
2466 	struct list_head insert_list, update_list;
2467 	int ret;
2468 	int num_inserts = 0, max_inserts, restart = 0;
2469 
2470 	path = btrfs_alloc_path();
2471 	INIT_LIST_HEAD(&insert_list);
2472 	INIT_LIST_HEAD(&update_list);
2473 
2474 	max_inserts = extent_root->leafsize /
2475 		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2476 		 sizeof(struct btrfs_extent_ref) +
2477 		 sizeof(struct btrfs_extent_item));
2478 again:
2479 	mutex_lock(&info->extent_ins_mutex);
2480 	while (1) {
2481 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
2482 					    &end, EXTENT_WRITEBACK);
2483 		if (ret) {
2484 			if (restart && !num_inserts &&
2485 			    list_empty(&update_list)) {
2486 				restart = 0;
2487 				search = 0;
2488 				continue;
2489 			}
2490 			break;
2491 		}
2492 
2493 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2494 		if (!ret) {
2495 			if (all)
2496 				restart = 1;
2497 			search = end + 1;
2498 			if (need_resched()) {
2499 				mutex_unlock(&info->extent_ins_mutex);
2500 				cond_resched();
2501 				mutex_lock(&info->extent_ins_mutex);
2502 			}
2503 			continue;
2504 		}
2505 
2506 		ret = get_state_private(&info->extent_ins, start, &priv);
2507 		BUG_ON(ret);
2508 		extent_op = (struct pending_extent_op *)(unsigned long) priv;
2509 
2510 		if (extent_op->type == PENDING_EXTENT_INSERT) {
2511 			num_inserts++;
2512 			list_add_tail(&extent_op->list, &insert_list);
2513 			search = end + 1;
2514 			if (num_inserts == max_inserts) {
2515 				restart = 1;
2516 				break;
2517 			}
2518 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2519 			list_add_tail(&extent_op->list, &update_list);
2520 			search = end + 1;
2521 		} else {
2522 			BUG();
2523 		}
2524 	}
2525 
2526 	/*
2527 	 * process the update list, clear the writeback bit for each entry,
2528 	 * and if somebody marked this thing for deletion then just unlock it
2529 	 * and be done; free_extents will handle it
2530 	 */
2531 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2532 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2533 				  extent_op->bytenr + extent_op->num_bytes - 1,
2534 				  EXTENT_WRITEBACK, GFP_NOFS);
2535 		if (extent_op->del) {
2536 			list_del_init(&extent_op->list);
2537 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2538 				      extent_op->bytenr + extent_op->num_bytes
2539 				      - 1, GFP_NOFS);
2540 			kfree(extent_op);
2541 		}
2542 	}
2543 	mutex_unlock(&info->extent_ins_mutex);
2544 
2545 	/*
2546 	 * still have things left on the update list, go ahead and update
2547 	 * everything
2548 	 */
2549 	if (!list_empty(&update_list)) {
2550 		ret = update_backrefs(trans, extent_root, path, &update_list);
2551 		BUG_ON(ret);
2552 
2553 		/* we may have COW'ed new blocks, so let's start over */
2554 		if (all)
2555 			restart = 1;
2556 	}
2557 
2558 	/*
2559 	 * if no inserts need to be done, but we skipped some extents and we
2560 	 * need to make sure everything is cleaned up, then reset everything and
2561 	 * go back to the beginning
2562 	 */
2563 	if (!num_inserts && restart) {
2564 		search = 0;
2565 		restart = 0;
2566 		INIT_LIST_HEAD(&update_list);
2567 		INIT_LIST_HEAD(&insert_list);
2568 		goto again;
2569 	} else if (!num_inserts) {
2570 		goto out;
2571 	}
2572 
2573 	/*
2574 	 * process the insert extents list.  Again if we are deleting this
2575 	 * extent, then just unlock it, pin down the bytes if need be, and be
2576 	 * done with it.  Saves us from having to actually insert the extent
2577 	 * into the tree and then subsequently come along and delete it
2578 	 */
2579 	mutex_lock(&info->extent_ins_mutex);
2580 	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2581 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2582 				  extent_op->bytenr + extent_op->num_bytes - 1,
2583 				  EXTENT_WRITEBACK, GFP_NOFS);
2584 		if (extent_op->del) {
2585 			u64 used;
2586 			list_del_init(&extent_op->list);
2587 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2588 				      extent_op->bytenr + extent_op->num_bytes
2589 				      - 1, GFP_NOFS);
2590 
2591 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2592 			ret = pin_down_bytes(trans, extent_root,
2593 					     extent_op->bytenr,
2594 					     extent_op->num_bytes, 0);
2595 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2596 
2597 			spin_lock(&info->delalloc_lock);
2598 			used = btrfs_super_bytes_used(&info->super_copy);
2599 			btrfs_set_super_bytes_used(&info->super_copy,
2600 					used - extent_op->num_bytes);
2601 			used = btrfs_root_used(&extent_root->root_item);
2602 			btrfs_set_root_used(&extent_root->root_item,
2603 					used - extent_op->num_bytes);
2604 			spin_unlock(&info->delalloc_lock);
2605 
2606 			ret = update_block_group(trans, extent_root,
2607 						 extent_op->bytenr,
2608 						 extent_op->num_bytes,
2609 						 0, ret > 0);
2610 			BUG_ON(ret);
2611 			kfree(extent_op);
2612 			num_inserts--;
2613 		}
2614 	}
2615 	mutex_unlock(&info->extent_ins_mutex);
2616 
2617 	ret = insert_extents(trans, extent_root, path, &insert_list,
2618 			     num_inserts);
2619 	BUG_ON(ret);
2620 
2621 	/*
2622 	 * if restart is set for whatever reason we need to go back and start
2623 	 * searching through the pending list again.
2624 	 *
2625 	 * We just inserted some extents, which could have resulted in new
2626 	 * blocks being allocated, which would result in new blocks needing
2627 	 * updates, so if all is set we _must_ restart to get the updated
2628 	 * blocks.
2629 	 */
2630 	if (restart || all) {
2631 		INIT_LIST_HEAD(&insert_list);
2632 		INIT_LIST_HEAD(&update_list);
2633 		search = 0;
2634 		restart = 0;
2635 		num_inserts = 0;
2636 		goto again;
2637 	}
2638 out:
2639 	btrfs_free_path(path);
2640 	return 0;
2641 }
2642 
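/*
 * pin a range so it isn't reused before the transaction commits.  As
 * an optimization, a metadata block that is from this transaction,
 * still unwritten, and not from the log or reloc trees is just cleaned
 * and unlocked; we return 1 in that case so the caller can put the
 * space straight back into the free space cache.
 */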
2643 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2644 			  struct btrfs_root *root,
2645 			  u64 bytenr, u64 num_bytes, int is_data)
2646 {
2647 	int err = 0;
2648 	struct extent_buffer *buf;
2649 
2650 	if (is_data)
2651 		goto pinit;
2652 
2653 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2654 	if (!buf)
2655 		goto pinit;
2656 
2657 	/* we can reuse a block if it hasn't been written
2658 	 * and it is from this transaction.  We can't
2659 	 * reuse anything from the tree log root because
2660 	 * it has tiny sub-transactions.
2661 	 */
2662 	if (btrfs_buffer_uptodate(buf, 0) &&
2663 	    btrfs_try_tree_lock(buf)) {
2664 		u64 header_owner = btrfs_header_owner(buf);
2665 		u64 header_transid = btrfs_header_generation(buf);
2666 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2667 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2668 		    header_transid == trans->transid &&
2669 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2670 			clean_tree_block(NULL, root, buf);
2671 			btrfs_tree_unlock(buf);
2672 			free_extent_buffer(buf);
2673 			return 1;
2674 		}
2675 		btrfs_tree_unlock(buf);
2676 	}
2677 	free_extent_buffer(buf);
2678 pinit:
2679 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2680 
2681 	BUG_ON(err < 0);
2682 	return 0;
2683 }
2684 
2685 /*
2686  * remove an extent from the root, returns 0 on success
2687  */
2688 static int __free_extent(struct btrfs_trans_handle *trans,
2689 			 struct btrfs_root *root,
2690 			 u64 bytenr, u64 num_bytes, u64 parent,
2691 			 u64 root_objectid, u64 ref_generation,
2692 			 u64 owner_objectid, int pin, int mark_free)
2693 {
2694 	struct btrfs_path *path;
2695 	struct btrfs_key key;
2696 	struct btrfs_fs_info *info = root->fs_info;
2697 	struct btrfs_root *extent_root = info->extent_root;
2698 	struct extent_buffer *leaf;
2699 	int ret;
2700 	int extent_slot = 0;
2701 	int found_extent = 0;
2702 	int num_to_del = 1;
2703 	struct btrfs_extent_item *ei;
2704 	u32 refs;
2705 
2706 	key.objectid = bytenr;
2707 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2708 	key.offset = num_bytes;
2709 	path = btrfs_alloc_path();
2710 	if (!path)
2711 		return -ENOMEM;
2712 
2713 	path->reada = 1;
2714 	ret = lookup_extent_backref(trans, extent_root, path,
2715 				    bytenr, parent, root_objectid,
2716 				    ref_generation, owner_objectid, 1);
2717 	if (ret == 0) {
2718 		struct btrfs_key found_key;
2719 		extent_slot = path->slots[0];
2720 		while (extent_slot > 0) {
2721 			extent_slot--;
2722 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2723 					      extent_slot);
2724 			if (found_key.objectid != bytenr)
2725 				break;
2726 			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2727 			    found_key.offset == num_bytes) {
2728 				found_extent = 1;
2729 				break;
2730 			}
2731 			if (path->slots[0] - extent_slot > 5)
2732 				break;
2733 		}
2734 		if (!found_extent) {
2735 			ret = remove_extent_backref(trans, extent_root, path);
2736 			BUG_ON(ret);
2737 			btrfs_release_path(extent_root, path);
2738 			ret = btrfs_search_slot(trans, extent_root,
2739 						&key, path, -1, 1);
2740 			if (ret) {
2741 				printk(KERN_ERR "umm, got %d back from search"
2742 				       ", was looking for %llu\n", ret,
2743 				       (unsigned long long)bytenr);
2744 				btrfs_print_leaf(extent_root, path->nodes[0]);
2745 			}
2746 			BUG_ON(ret);
2747 			extent_slot = path->slots[0];
2748 		}
2749 	} else {
2750 		btrfs_print_leaf(extent_root, path->nodes[0]);
2751 		WARN_ON(1);
2752 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2753 		       "root %llu gen %llu owner %llu\n",
2754 		       (unsigned long long)bytenr,
2755 		       (unsigned long long)root_objectid,
2756 		       (unsigned long long)ref_generation,
2757 		       (unsigned long long)owner_objectid);
2758 	}
2759 
2760 	leaf = path->nodes[0];
2761 	ei = btrfs_item_ptr(leaf, extent_slot,
2762 			    struct btrfs_extent_item);
2763 	refs = btrfs_extent_refs(leaf, ei);
2764 	BUG_ON(refs == 0);
2765 	refs -= 1;
2766 	btrfs_set_extent_refs(leaf, ei, refs);
2767 
2768 	btrfs_mark_buffer_dirty(leaf);
2769 
2770 	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2771 		struct btrfs_extent_ref *ref;
2772 		ref = btrfs_item_ptr(leaf, path->slots[0],
2773 				     struct btrfs_extent_ref);
2774 		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2775 		/* if the back ref and the extent are next to each other
2776 		 * they get deleted below in one shot
2777 		 */
2778 		path->slots[0] = extent_slot;
2779 		num_to_del = 2;
2780 	} else if (found_extent) {
2781 		/* otherwise delete the extent back ref */
2782 		ret = remove_extent_backref(trans, extent_root, path);
2783 		BUG_ON(ret);
2784 		/* if refs are 0, we need to setup the path for deletion */
2785 		if (refs == 0) {
2786 			btrfs_release_path(extent_root, path);
2787 			ret = btrfs_search_slot(trans, extent_root, &key, path,
2788 						-1, 1);
2789 			BUG_ON(ret);
2790 		}
2791 	}
2792 
2793 	if (refs == 0) {
2794 		u64 super_used;
2795 		u64 root_used;
2796 
2797 		if (pin) {
2798 			mutex_lock(&root->fs_info->pinned_mutex);
2799 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2800 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2801 			mutex_unlock(&root->fs_info->pinned_mutex);
2802 			if (ret > 0)
2803 				mark_free = 1;
2804 			BUG_ON(ret < 0);
2805 		}
2806 		/* block accounting for super block */
2807 		spin_lock(&info->delalloc_lock);
2808 		super_used = btrfs_super_bytes_used(&info->super_copy);
2809 		btrfs_set_super_bytes_used(&info->super_copy,
2810 					   super_used - num_bytes);
2811 
2812 		/* block accounting for root item */
2813 		root_used = btrfs_root_used(&root->root_item);
2814 		btrfs_set_root_used(&root->root_item,
2815 					   root_used - num_bytes);
2816 		spin_unlock(&info->delalloc_lock);
2817 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2818 				      num_to_del);
2819 		BUG_ON(ret);
2820 		btrfs_release_path(extent_root, path);
2821 
2822 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2823 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2824 			BUG_ON(ret);
2825 		}
2826 
2827 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2828 					 mark_free);
2829 		BUG_ON(ret);
2830 	}
2831 	btrfs_free_path(path);
2832 	finish_current_insert(trans, extent_root, 0);
2833 	return ret;
2834 }
2835 
2836 /*
2837  * find all the blocks marked as pending deletion and remove
2838  * them from the extent tree
2839  */
2840 static int del_pending_extents(struct btrfs_trans_handle *trans,
2841 			       struct btrfs_root *extent_root, int all)
2842 {
2843 	int ret;
2844 	int err = 0;
2845 	u64 start;
2846 	u64 end;
2847 	u64 priv;
2848 	u64 search = 0;
2849 	int nr = 0, skipped = 0;
2850 	struct extent_io_tree *pending_del;
2851 	struct extent_io_tree *extent_ins;
2852 	struct pending_extent_op *extent_op;
2853 	struct btrfs_fs_info *info = extent_root->fs_info;
2854 	struct list_head delete_list;
2855 
2856 	INIT_LIST_HEAD(&delete_list);
2857 	extent_ins = &extent_root->fs_info->extent_ins;
2858 	pending_del = &extent_root->fs_info->pending_del;
2859 
2860 again:
2861 	mutex_lock(&info->extent_ins_mutex);
2862 	while (1) {
2863 		ret = find_first_extent_bit(pending_del, search, &start, &end,
2864 					    EXTENT_WRITEBACK);
2865 		if (ret) {
2866 			if (all && skipped && !nr) {
2867 				search = 0;
2868 				skipped = 0;
2869 				continue;
2870 			}
2871 			mutex_unlock(&info->extent_ins_mutex);
2872 			break;
2873 		}
2874 
2875 		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2876 		if (!ret) {
2877 			search = end + 1;
2878 			skipped = 1;
2879 
2880 			if (need_resched()) {
2881 				mutex_unlock(&info->extent_ins_mutex);
2882 				cond_resched();
2883 				mutex_lock(&info->extent_ins_mutex);
2884 			}
2885 
2886 			continue;
2887 		}
2888 		BUG_ON(ret < 0);
2889 
2890 		ret = get_state_private(pending_del, start, &priv);
2891 		BUG_ON(ret);
2892 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
2893 
2894 		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2895 				  GFP_NOFS);
2896 		if (!test_range_bit(extent_ins, start, end,
2897 				    EXTENT_WRITEBACK, 0)) {
2898 			list_add_tail(&extent_op->list, &delete_list);
2899 			nr++;
2900 		} else {
2901 			kfree(extent_op);
2902 
2903 			ret = get_state_private(&info->extent_ins, start,
2904 						&priv);
2905 			BUG_ON(ret);
2906 			extent_op = (struct pending_extent_op *)
2907 						(unsigned long)priv;
2908 
2909 			clear_extent_bits(&info->extent_ins, start, end,
2910 					  EXTENT_WRITEBACK, GFP_NOFS);
2911 
2912 			if (extent_op->type == PENDING_BACKREF_UPDATE) {
2913 				list_add_tail(&extent_op->list, &delete_list);
2914 				search = end + 1;
2915 				nr++;
2916 				continue;
2917 			}
2918 
2919 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2920 			ret = pin_down_bytes(trans, extent_root, start,
2921 					     end + 1 - start, 0);
2922 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2923 
2924 			ret = update_block_group(trans, extent_root, start,
2925 						end + 1 - start, 0, ret > 0);
2926 
2927 			unlock_extent(extent_ins, start, end, GFP_NOFS);
2928 			BUG_ON(ret);
2929 			kfree(extent_op);
2930 		}
2931 		if (ret)
2932 			err = ret;
2933 
2934 		search = end + 1;
2935 
2936 		if (need_resched()) {
2937 			mutex_unlock(&info->extent_ins_mutex);
2938 			cond_resched();
2939 			mutex_lock(&info->extent_ins_mutex);
2940 		}
2941 	}
2942 
2943 	if (nr) {
2944 		ret = free_extents(trans, extent_root, &delete_list);
2945 		BUG_ON(ret);
2946 	}
2947 
2948 	if (all && skipped) {
2949 		INIT_LIST_HEAD(&delete_list);
2950 		search = 0;
2951 		nr = 0;
2952 		goto again;
2953 	}
2954 
2955 	if (!err)
2956 		finish_current_insert(trans, extent_root, 0);
2957 	return err;
2958 }
2959 
2960 /*
2961  * remove an extent from the root, returns 0 on success
2962  */
2963 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2964 			       struct btrfs_root *root,
2965 			       u64 bytenr, u64 num_bytes, u64 parent,
2966 			       u64 root_objectid, u64 ref_generation,
2967 			       u64 owner_objectid, int pin)
2968 {
2969 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2970 	int pending_ret;
2971 	int ret;
2972 
2973 	WARN_ON(num_bytes < root->sectorsize);
2974 	if (root == extent_root) {
2975 		struct pending_extent_op *extent_op = NULL;
2976 
2977 		mutex_lock(&root->fs_info->extent_ins_mutex);
2978 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2979 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2980 			u64 priv;
2981 			ret = get_state_private(&root->fs_info->extent_ins,
2982 						bytenr, &priv);
2983 			BUG_ON(ret);
2984 			extent_op = (struct pending_extent_op *)
2985 						(unsigned long)priv;
2986 
2987 			extent_op->del = 1;
2988 			if (extent_op->type == PENDING_EXTENT_INSERT) {
2989 				mutex_unlock(&root->fs_info->extent_ins_mutex);
2990 				return 0;
2991 			}
2992 		}
2993 
2994 		if (extent_op) {
2995 			ref_generation = extent_op->orig_generation;
2996 			parent = extent_op->orig_parent;
2997 		}
2998 
2999 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3000 		BUG_ON(!extent_op);
3001 
3002 		extent_op->type = PENDING_EXTENT_DELETE;
3003 		extent_op->bytenr = bytenr;
3004 		extent_op->num_bytes = num_bytes;
3005 		extent_op->parent = parent;
3006 		extent_op->orig_parent = parent;
3007 		extent_op->generation = ref_generation;
3008 		extent_op->orig_generation = ref_generation;
3009 		extent_op->level = (int)owner_objectid;
3010 		INIT_LIST_HEAD(&extent_op->list);
3011 		extent_op->del = 0;
3012 
3013 		set_extent_bits(&root->fs_info->pending_del,
3014 				bytenr, bytenr + num_bytes - 1,
3015 				EXTENT_WRITEBACK, GFP_NOFS);
3016 		set_state_private(&root->fs_info->pending_del,
3017 				  bytenr, (unsigned long)extent_op);
3018 		mutex_unlock(&root->fs_info->extent_ins_mutex);
3019 		return 0;
3020 	}
3021 	/* if metadata always pin */
3022 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3023 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3024 			mutex_lock(&root->fs_info->pinned_mutex);
3025 			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3026 			mutex_unlock(&root->fs_info->pinned_mutex);
3027 			update_reserved_extents(root, bytenr, num_bytes, 0);
3028 			return 0;
3029 		}
3030 		pin = 1;
3031 	}
3032 
3033 	/* for data, pin if an earlier transaction may have committed this */
3034 	if (ref_generation != trans->transid)
3035 		pin = 1;
3036 
3037 	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
3038 			    root_objectid, ref_generation,
3039 			    owner_objectid, pin, pin == 0);
3040 
3041 	finish_current_insert(trans, root->fs_info->extent_root, 0);
3042 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
3043 	return ret ? ret : pending_ret;
3044 }
3045 
3046 int btrfs_free_extent(struct btrfs_trans_handle *trans,
3047 		      struct btrfs_root *root,
3048 		      u64 bytenr, u64 num_bytes, u64 parent,
3049 		      u64 root_objectid, u64 ref_generation,
3050 		      u64 owner_objectid, int pin)
3051 {
3052 	int ret;
3053 
3054 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
3055 				  root_objectid, ref_generation,
3056 				  owner_objectid, pin);
3057 	return ret;
3058 }
3059 
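/* round val up to the next stripesize boundary */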
3060 static u64 stripe_align(struct btrfs_root *root, u64 val)
3061 {
3062 	u64 mask = ((u64)root->stripesize - 1);
3063 	u64 ret = (val + mask) & ~mask;
3064 	return ret;
3065 }
3066 
3067 /*
3068  * walks the btree of allocated extents and finds a hole of a given size.
3069  * The key ins is changed to record the hole:
3070  * ins->objectid == block start
3071  * ins->flags = BTRFS_EXTENT_ITEM_KEY
3072  * ins->offset == number of blocks
3073  * Any available blocks before search_start are skipped.
3074  */
3075 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3076 				     struct btrfs_root *orig_root,
3077 				     u64 num_bytes, u64 empty_size,
3078 				     u64 search_start, u64 search_end,
3079 				     u64 hint_byte, struct btrfs_key *ins,
3080 				     u64 exclude_start, u64 exclude_nr,
3081 				     int data)
3082 {
3083 	int ret = 0;
3084 	struct btrfs_root *root = orig_root->fs_info->extent_root;
3085 	u64 total_needed = num_bytes;
3086 	u64 *last_ptr = NULL;
3087 	u64 last_wanted = 0;
3088 	struct btrfs_block_group_cache *block_group = NULL;
3089 	int chunk_alloc_done = 0;
3090 	int empty_cluster = 2 * 1024 * 1024;
3091 	int allowed_chunk_alloc = 0;
3092 	struct list_head *head = NULL, *cur = NULL;
3093 	int loop = 0;
3094 	int extra_loop = 0;
3095 	struct btrfs_space_info *space_info;
3096 
3097 	WARN_ON(num_bytes < root->sectorsize);
3098 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3099 	ins->objectid = 0;
3100 	ins->offset = 0;
3101 
3102 	if (orig_root->ref_cows || empty_size)
3103 		allowed_chunk_alloc = 1;
3104 
3105 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
3106 		last_ptr = &root->fs_info->last_alloc;
3107 		if (!btrfs_test_opt(root, SSD))
3108 			empty_cluster = 64 * 1024;
3109 	}
3110 
3111 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
3112 		last_ptr = &root->fs_info->last_data_alloc;
3113 
3114 	if (last_ptr) {
3115 		if (*last_ptr) {
3116 			hint_byte = *last_ptr;
3117 			last_wanted = *last_ptr;
3118 		} else
3119 			empty_size += empty_cluster;
3120 	} else {
3121 		empty_cluster = 0;
3122 	}
3123 	search_start = max(search_start, first_logical_byte(root, 0));
3124 	search_start = max(search_start, hint_byte);
3125 
3126 	if (last_wanted && search_start != last_wanted) {
3127 		last_wanted = 0;
3128 		empty_size += empty_cluster;
3129 	}
3130 
3131 	total_needed += empty_size;
3132 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
3133 	if (!block_group)
3134 		block_group = btrfs_lookup_first_block_group(root->fs_info,
3135 							     search_start);
3136 	space_info = __find_space_info(root->fs_info, data);
3137 
3138 	down_read(&space_info->groups_sem);
3139 	while (1) {
3140 		struct btrfs_free_space *free_space;
3141 		/*
3142 		 * the only way this happens is if our hint points to a block
3143 		 * group that's not of the proper type; while looping this
3144 		 * should never happen
3145 		 */
3146 		if (empty_size)
3147 			extra_loop = 1;
3148 
3149 		if (!block_group)
3150 			goto new_group_no_lock;
3151 
3152 		if (unlikely(!block_group->cached)) {
3153 			mutex_lock(&block_group->cache_mutex);
3154 			ret = cache_block_group(root, block_group);
3155 			mutex_unlock(&block_group->cache_mutex);
3156 			if (ret)
3157 				break;
3158 		}
3159 
3160 		mutex_lock(&block_group->alloc_mutex);
3161 		if (unlikely(!block_group_bits(block_group, data)))
3162 			goto new_group;
3163 
3164 		if (unlikely(block_group->ro))
3165 			goto new_group;
3166 
3167 		free_space = btrfs_find_free_space(block_group, search_start,
3168 						   total_needed);
3169 		if (free_space) {
3170 			u64 start = block_group->key.objectid;
3171 			u64 end = block_group->key.objectid +
3172 				block_group->key.offset;
3173 
3174 			search_start = stripe_align(root, free_space->offset);
3175 
3176 			/* move on to the next group */
3177 			if (search_start + num_bytes >= search_end)
3178 				goto new_group;
3179 
3180 			/* move on to the next group */
3181 			if (search_start + num_bytes > end)
3182 				goto new_group;
3183 
3184 			if (last_wanted && search_start != last_wanted) {
3185 				total_needed += empty_cluster;
3186 				empty_size += empty_cluster;
3187 				last_wanted = 0;
3188 				/*
3189 				 * if search_start is still in this block group
3190 				 * then we just re-search this block group
3191 				 */
3192 				if (search_start >= start &&
3193 				    search_start < end) {
3194 					mutex_unlock(&block_group->alloc_mutex);
3195 					continue;
3196 				}
3197 
3198 				/* else we go to the next block group */
3199 				goto new_group;
3200 			}
3201 
3202 			if (exclude_nr > 0 &&
3203 			    (search_start + num_bytes > exclude_start &&
3204 			     search_start < exclude_start + exclude_nr)) {
3205 				search_start = exclude_start + exclude_nr;
3206 				/*
3207 				 * if search_start is still in this block group
3208 				 * then we just re-search this block group
3209 				 */
3210 				if (search_start >= start &&
3211 				    search_start < end) {
3212 					mutex_unlock(&block_group->alloc_mutex);
3213 					last_wanted = 0;
3214 					continue;
3215 				}
3216 
3217 				/* else we go to the next block group */
3218 				goto new_group;
3219 			}
3220 
3221 			ins->objectid = search_start;
3222 			ins->offset = num_bytes;
3223 
3224 			btrfs_remove_free_space_lock(block_group, search_start,
3225 						     num_bytes);
3226 			/* we are all good, lets return */
3227 			mutex_unlock(&block_group->alloc_mutex);
3228 			break;
3229 		}
3230 new_group:
3231 		mutex_unlock(&block_group->alloc_mutex);
3232 		put_block_group(block_group);
3233 		block_group = NULL;
3234 new_group_no_lock:
3235 		/* don't try to compare new allocations against the
3236 		 * last allocation any more
3237 		 */
3238 		last_wanted = 0;
3239 
3240 		/*
3241 		 * Here's how this works.
3242 		 * loop == 0: we were searching a block group via a hint
3243 		 *		and didn't find anything, so we start at
3244 		 *		the head of the block groups and keep searching
3245 		 * loop == 1: we're searching through all of the block groups
3246 		 *		if we hit the head again we have searched
3247 		 *		all of the block groups for this space and we
3248 		 *		need to try and allocate; if we can't, error out.
3249 		 * loop == 2: we allocated more space and are looping through
3250 		 *		all of the block groups again.
3251 		 */
3252 		if (loop == 0) {
3253 			head = &space_info->block_groups;
3254 			cur = head->next;
3255 			loop++;
3256 		} else if (loop == 1 && cur == head) {
3257 			int keep_going;
3258 
3259 			/* at this point we give up on the empty_size
3260 			 * allocations and just try to allocate the min
3261 			 * space.
3262 			 *
3263 			 * The extra_loop field was set if an empty_size
3264 			 * allocation was attempted above, and if this
3265 			 * is true we need to try the loop again without
3266 			 * the additional empty_size.
3267 			 */
3268 			total_needed -= empty_size;
3269 			empty_size = 0;
3270 			keep_going = extra_loop;
3271 			loop++;
3272 
3273 			if (allowed_chunk_alloc && !chunk_alloc_done) {
3274 				up_read(&space_info->groups_sem);
3275 				ret = do_chunk_alloc(trans, root, num_bytes +
3276 						     2 * 1024 * 1024, data, 1);
3277 				down_read(&space_info->groups_sem);
3278 				if (ret < 0)
3279 					goto loop_check;
3280 				head = &space_info->block_groups;
3281 				/*
3282 				 * we've allocated a new chunk, keep
3283 				 * trying
3284 				 */
3285 				keep_going = 1;
3286 				chunk_alloc_done = 1;
3287 			} else if (!allowed_chunk_alloc) {
3288 				space_info->force_alloc = 1;
3289 			}
3290 loop_check:
3291 			if (keep_going) {
3292 				cur = head->next;
3293 				extra_loop = 0;
3294 			} else {
3295 				break;
3296 			}
3297 		} else if (cur == head) {
3298 			break;
3299 		}
3300 
3301 		block_group = list_entry(cur, struct btrfs_block_group_cache,
3302 					 list);
3303 		atomic_inc(&block_group->count);
3304 
3305 		search_start = block_group->key.objectid;
3306 		cur = cur->next;
3307 	}
3308 
3309 	/* we found what we needed */
3310 	if (ins->objectid) {
3311 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
3312 			trans->block_group = block_group->key.objectid;
3313 
3314 		if (last_ptr)
3315 			*last_ptr = ins->objectid + ins->offset;
3316 		ret = 0;
3317 	} else if (!ret) {
3318 		printk(KERN_ERR "btrfs searching for %llu bytes, "
3319 		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
3320 		       (unsigned long long)total_needed,
3321 		       (unsigned long long)num_bytes,
3322 		       loop, allowed_chunk_alloc);
3323 		ret = -ENOSPC;
3324 	}
3325 	if (block_group)
3326 		put_block_group(block_group);
3327 
3328 	up_read(&space_info->groups_sem);
3329 	return ret;
3330 }
3331 
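/*
 * debugging helper for failed allocations: print the free space left
 * in a space_info and in each of its block groups.
 */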
3332 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3333 {
3334 	struct btrfs_block_group_cache *cache;
3335 
3336 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3337 	       (unsigned long long)(info->total_bytes - info->bytes_used -
3338 				    info->bytes_pinned - info->bytes_reserved),
3339 	       (info->full) ? "" : "not ");
3340 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3341 	       " may_use=%llu, used=%llu\n", info->total_bytes,
3342 	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3343 	       info->bytes_used);
3344 
3345 	down_read(&info->groups_sem);
3346 	list_for_each_entry(cache, &info->block_groups, list) {
3347 		spin_lock(&cache->lock);
3348 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3349 		       "%llu pinned %llu reserved\n",
3350 		       (unsigned long long)cache->key.objectid,
3351 		       (unsigned long long)cache->key.offset,
3352 		       (unsigned long long)btrfs_block_group_used(&cache->item),
3353 		       (unsigned long long)cache->pinned,
3354 		       (unsigned long long)cache->reserved);
3355 		btrfs_dump_free_space(cache, bytes);
3356 		spin_unlock(&cache->lock);
3357 	}
3358 	up_read(&info->groups_sem);
3359 }
3360 
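/*
 * try to reserve an extent of num_bytes; on -ENOSPC, allocate a chunk
 * and retry with progressively smaller sizes, never going below
 * min_alloc_size.  A final failure dumps the space info and BUGs.
 */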
3361 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3362 				  struct btrfs_root *root,
3363 				  u64 num_bytes, u64 min_alloc_size,
3364 				  u64 empty_size, u64 hint_byte,
3365 				  u64 search_end, struct btrfs_key *ins,
3366 				  u64 data)
3367 {
3368 	int ret;
3369 	u64 search_start = 0;
3370 	struct btrfs_fs_info *info = root->fs_info;
3371 
3372 	data = btrfs_get_alloc_profile(root, data);
3373 again:
3374 	/*
3375 	 * the only place that sets empty_size is btrfs_realloc_node, which
3376 	 * is not called recursively on allocations
3377 	 */
3378 	if (empty_size || root->ref_cows) {
3379 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3380 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3381 				     2 * 1024 * 1024,
3382 				     BTRFS_BLOCK_GROUP_METADATA |
3383 				     (info->metadata_alloc_profile &
3384 				      info->avail_metadata_alloc_bits), 0);
3385 		}
3386 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3387 				     num_bytes + 2 * 1024 * 1024, data, 0);
3388 	}
3389 
3390 	WARN_ON(num_bytes < root->sectorsize);
3391 	ret = find_free_extent(trans, root, num_bytes, empty_size,
3392 			       search_start, search_end, hint_byte, ins,
3393 			       trans->alloc_exclude_start,
3394 			       trans->alloc_exclude_nr, data);
3395 
3396 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3397 		num_bytes = num_bytes >> 1;
3398 		num_bytes = num_bytes & ~(root->sectorsize - 1);
3399 		num_bytes = max(num_bytes, min_alloc_size);
3400 		do_chunk_alloc(trans, root->fs_info->extent_root,
3401 			       num_bytes, data, 1);
3402 		goto again;
3403 	}
3404 	if (ret) {
3405 		struct btrfs_space_info *sinfo;
3406 
3407 		sinfo = __find_space_info(root->fs_info, data);
3408 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
3409 		       "wanted %llu\n", (unsigned long long)data,
3410 		       (unsigned long long)num_bytes);
3411 		dump_space_info(sinfo, num_bytes);
3412 		BUG();
3413 	}
3414 
3415 	return ret;
3416 }
3417 
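/*
 * give a reserved-but-unused extent back to its block group's free
 * space cache and drop the reservation counts.
 */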
3418 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3419 {
3420 	struct btrfs_block_group_cache *cache;
3421 	int ret = 0;
3422 
3423 	cache = btrfs_lookup_block_group(root->fs_info, start);
3424 	if (!cache) {
3425 		printk(KERN_ERR "Unable to find block group for %llu\n",
3426 		       (unsigned long long)start);
3427 		return -ENOSPC;
3428 	}
3429 
3430 	ret = btrfs_discard_extent(root, start, len);
3431 
3432 	btrfs_add_free_space(cache, start, len);
3433 	put_block_group(cache);
3434 	update_reserved_extents(root, start, len, 0);
3435 
3436 	return ret;
3437 }
3438 
3439 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3440 				  struct btrfs_root *root,
3441 				  u64 num_bytes, u64 min_alloc_size,
3442 				  u64 empty_size, u64 hint_byte,
3443 				  u64 search_end, struct btrfs_key *ins,
3444 				  u64 data)
3445 {
3446 	int ret;
3447 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3448 				     empty_size, hint_byte, search_end, ins,
3449 				     data);
3450 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3451 	return ret;
3452 }
3453 
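/*
 * insert the extent item and its initial backref for an extent that
 * has already been reserved.  Allocations on behalf of the extent
 * tree itself can't recurse into it, so those are queued as a
 * pending_extent_op and performed later by finish_current_insert.
 */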
3454 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3455 					 struct btrfs_root *root, u64 parent,
3456 					 u64 root_objectid, u64 ref_generation,
3457 					 u64 owner, struct btrfs_key *ins)
3458 {
3459 	int ret;
3460 	int pending_ret;
3461 	u64 super_used;
3462 	u64 root_used;
3463 	u64 num_bytes = ins->offset;
3464 	u32 sizes[2];
3465 	struct btrfs_fs_info *info = root->fs_info;
3466 	struct btrfs_root *extent_root = info->extent_root;
3467 	struct btrfs_extent_item *extent_item;
3468 	struct btrfs_extent_ref *ref;
3469 	struct btrfs_path *path;
3470 	struct btrfs_key keys[2];
3471 
3472 	if (parent == 0)
3473 		parent = ins->objectid;
3474 
3475 	/* block accounting for super block */
3476 	spin_lock(&info->delalloc_lock);
3477 	super_used = btrfs_super_bytes_used(&info->super_copy);
3478 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3479 
3480 	/* block accounting for root item */
3481 	root_used = btrfs_root_used(&root->root_item);
3482 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3483 	spin_unlock(&info->delalloc_lock);
3484 
3485 	if (root == extent_root) {
3486 		struct pending_extent_op *extent_op;
3487 
3488 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3489 		BUG_ON(!extent_op);
3490 
3491 		extent_op->type = PENDING_EXTENT_INSERT;
3492 		extent_op->bytenr = ins->objectid;
3493 		extent_op->num_bytes = ins->offset;
3494 		extent_op->parent = parent;
3495 		extent_op->orig_parent = 0;
3496 		extent_op->generation = ref_generation;
3497 		extent_op->orig_generation = 0;
3498 		extent_op->level = (int)owner;
3499 		INIT_LIST_HEAD(&extent_op->list);
3500 		extent_op->del = 0;
3501 
3502 		mutex_lock(&root->fs_info->extent_ins_mutex);
3503 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3504 				ins->objectid + ins->offset - 1,
3505 				EXTENT_WRITEBACK, GFP_NOFS);
3506 		set_state_private(&root->fs_info->extent_ins,
3507 				  ins->objectid, (unsigned long)extent_op);
3508 		mutex_unlock(&root->fs_info->extent_ins_mutex);
3509 		goto update_block;
3510 	}
3511 
3512 	memcpy(&keys[0], ins, sizeof(*ins));
3513 	keys[1].objectid = ins->objectid;
3514 	keys[1].type = BTRFS_EXTENT_REF_KEY;
3515 	keys[1].offset = parent;
3516 	sizes[0] = sizeof(*extent_item);
3517 	sizes[1] = sizeof(*ref);
3518 
3519 	path = btrfs_alloc_path();
3520 	BUG_ON(!path);
3521 
3522 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3523 				       sizes, 2);
3524 	BUG_ON(ret);
3525 
3526 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3527 				     struct btrfs_extent_item);
3528 	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3529 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3530 			     struct btrfs_extent_ref);
3531 
3532 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3533 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3534 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3535 	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3536 
3537 	btrfs_mark_buffer_dirty(path->nodes[0]);
3538 
3539 	trans->alloc_exclude_start = 0;
3540 	trans->alloc_exclude_nr = 0;
3541 	btrfs_free_path(path);
3542 	finish_current_insert(trans, extent_root, 0);
3543 	pending_ret = del_pending_extents(trans, extent_root, 0);
3544 
3545 	if (ret)
3546 		goto out;
3547 	if (pending_ret) {
3548 		ret = pending_ret;
3549 		goto out;
3550 	}
3551 
3552 update_block:
3553 	ret = update_block_group(trans, root, ins->objectid,
3554 				 ins->offset, 1, 0);
3555 	if (ret) {
3556 		printk(KERN_ERR "btrfs update block group failed for %llu "
3557 		       "%llu\n", (unsigned long long)ins->objectid,
3558 		       (unsigned long long)ins->offset);
3559 		BUG();
3560 	}
3561 out:
3562 	return ret;
3563 }
3564 
3565 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3566 				struct btrfs_root *root, u64 parent,
3567 				u64 root_objectid, u64 ref_generation,
3568 				u64 owner, struct btrfs_key *ins)
3569 {
3570 	int ret;
3571 
3572 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3573 		return 0;
3574 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3575 					    ref_generation, owner, ins);
3576 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
3577 	return ret;
3578 }
3579 
3580 /*
3581  * this is used by the tree logging recovery code.  It records that
3582  * an extent has been allocated and makes sure to clear the free
3583  * space cache bits as well
3584  */
3585 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3586 				struct btrfs_root *root, u64 parent,
3587 				u64 root_objectid, u64 ref_generation,
3588 				u64 owner, struct btrfs_key *ins)
3589 {
3590 	int ret;
3591 	struct btrfs_block_group_cache *block_group;
3592 
3593 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
	BUG_ON(!block_group);
3594 	mutex_lock(&block_group->cache_mutex);
3595 	cache_block_group(root, block_group);
3596 	mutex_unlock(&block_group->cache_mutex);
3597 
3598 	ret = btrfs_remove_free_space(block_group, ins->objectid,
3599 				      ins->offset);
3600 	BUG_ON(ret);
3601 	put_block_group(block_group);
3602 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3603 					    ref_generation, owner, ins);
3604 	return ret;
3605 }
3606 
3607 /*
3608  * finds a free extent and does all the dirty work required for allocation
3609  * returns the key for the extent through ins.  Callers that need a
3610  * tree buffer for the new extent use btrfs_alloc_free_block() below.
3611  *
3612  * returns 0 if everything worked, non-zero otherwise.
3613  */
3614 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3615 		       struct btrfs_root *root,
3616 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
3617 		       u64 root_objectid, u64 ref_generation,
3618 		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
3619 		       u64 search_end, struct btrfs_key *ins, u64 data)
3620 {
3621 	int ret;
3622 
3623 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
3624 				     min_alloc_size, empty_size, hint_byte,
3625 				     search_end, ins, data);
3626 	BUG_ON(ret);
3627 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3628 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3629 					root_objectid, ref_generation,
3630 					owner_objectid, ins);
3631 		BUG_ON(ret);
3632 
3633 	} else {
3634 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
3635 	}
3636 	return ret;
3637 }
3638 
3639 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3640 					    struct btrfs_root *root,
3641 					    u64 bytenr, u32 blocksize,
3642 					    int level)
3643 {
3644 	struct extent_buffer *buf;
3645 
3646 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3647 	if (!buf)
3648 		return ERR_PTR(-ENOMEM);
3649 	btrfs_set_header_generation(buf, trans->transid);
3650 	btrfs_set_buffer_lockdep_class(buf, level);
3651 	btrfs_tree_lock(buf);
3652 	clean_tree_block(trans, root, buf);
3653 
3654 	btrfs_set_lock_blocking(buf);
3655 	btrfs_set_buffer_uptodate(buf);
3656 
3657 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3658 		set_extent_dirty(&root->dirty_log_pages, buf->start,
3659 			 buf->start + buf->len - 1, GFP_NOFS);
3660 	} else {
3661 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3662 			 buf->start + buf->len - 1, GFP_NOFS);
3663 	}
3664 	trans->blocks_used++;
3665 	/* this returns a buffer locked for blocking */
3666 	return buf;
3667 }
3668 
3669 /*
3670  * helper function to allocate a block for a given tree
3671  * returns the tree buffer or an ERR_PTR on failure.
3672  */
3673 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3674 					     struct btrfs_root *root,
3675 					     u32 blocksize, u64 parent,
3676 					     u64 root_objectid,
3677 					     u64 ref_generation,
3678 					     int level,
3679 					     u64 hint,
3680 					     u64 empty_size)
3681 {
3682 	struct btrfs_key ins;
3683 	int ret;
3684 	struct extent_buffer *buf;
3685 
3686 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3687 				 root_objectid, ref_generation, level,
3688 				 empty_size, hint, (u64)-1, &ins, 0);
3689 	if (ret) {
3690 		BUG_ON(ret > 0);
3691 		return ERR_PTR(ret);
3692 	}
3693 
3694 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3695 				    blocksize, level);
3696 	return buf;
3697 }
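
/*
 * Illustrative sketch, not built: roughly how the COW path in ctree.c
 * asks this helper for a replacement block.  buf, parent_start, level,
 * search_start and empty_size are that caller's names, assumed here for
 * illustration only.
 */
#if 0
	struct extent_buffer *cow;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, trans->transid,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);
#endif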
3698 
3699 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3700 			struct btrfs_root *root, struct extent_buffer *leaf)
3701 {
3702 	u64 leaf_owner;
3703 	u64 leaf_generation;
3704 	struct refsort *sorted;
3705 	struct btrfs_key key;
3706 	struct btrfs_file_extent_item *fi;
3707 	int i;
3708 	int nritems;
3709 	int ret;
3710 	int refi = 0;
3711 	int slot;
3712 
3713 	BUG_ON(!btrfs_is_leaf(leaf));
3714 	nritems = btrfs_header_nritems(leaf);
3715 	leaf_owner = btrfs_header_owner(leaf);
3716 	leaf_generation = btrfs_header_generation(leaf);
3717 
3718 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
	BUG_ON(!sorted);
3719 	/* we do this loop twice.  The first time we build a list
3720 	 * of the extents we have a reference on, then we sort the list
3721 	 * by bytenr.  The second time around we actually do the
3722 	 * extent freeing.
3723 	 */
3724 	for (i = 0; i < nritems; i++) {
3725 		u64 disk_bytenr;
3726 		cond_resched();
3727 
3728 		btrfs_item_key_to_cpu(leaf, &key, i);
3729 
3730 		/* only extents have references, skip everything else */
3731 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3732 			continue;
3733 
3734 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3735 
3736 		/* inline extents live in the btree, they don't have refs */
3737 		if (btrfs_file_extent_type(leaf, fi) ==
3738 		    BTRFS_FILE_EXTENT_INLINE)
3739 			continue;
3740 
3741 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3742 
3743 		/* holes don't have refs */
3744 		if (disk_bytenr == 0)
3745 			continue;
3746 
3747 		sorted[refi].bytenr = disk_bytenr;
3748 		sorted[refi].slot = i;
3749 		refi++;
3750 	}
3751 
3752 	if (refi == 0)
3753 		goto out;
3754 
3755 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3756 
3757 	for (i = 0; i < refi; i++) {
3758 		u64 disk_bytenr;
3759 
3760 		disk_bytenr = sorted[i].bytenr;
3761 		slot = sorted[i].slot;
3762 
3763 		cond_resched();
3764 
3765 		btrfs_item_key_to_cpu(leaf, &key, slot);
3766 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3767 			continue;
3768 
3769 		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3770 
3771 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
3772 				btrfs_file_extent_disk_num_bytes(leaf, fi),
3773 				leaf->start, leaf_owner, leaf_generation,
3774 				key.objectid, 0);
3775 		BUG_ON(ret);
3776 
3777 		atomic_inc(&root->fs_info->throttle_gen);
3778 		wake_up(&root->fs_info->transaction_throttle);
3779 		cond_resched();
3780 	}
3781 out:
3782 	kfree(sorted);
3783 	return 0;
3784 }
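
/*
 * For reference, a minimal sketch of the comparator used above.  struct
 * refsort and refsort_cmp() are defined earlier in this file; the real
 * definition lives there, this copy is illustrative only.  It orders
 * entries by bytenr, ascending:
 */
#if 0
static int refsort_cmp(const void *a_void, const void *b_void)
{
	const struct refsort *a = a_void;
	const struct refsort *b = b_void;

	if (a->bytenr < b->bytenr)
		return -1;
	if (a->bytenr > b->bytenr)
		return 1;
	return 0;
}
#endif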
3785 
3786 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3787 					struct btrfs_root *root,
3788 					struct btrfs_leaf_ref *ref)
3789 {
3790 	int i;
3791 	int ret;
3792 	struct btrfs_extent_info *info;
3793 	struct refsort *sorted;
3794 
3795 	if (ref->nritems == 0)
3796 		return 0;
3797 
3798 	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
	BUG_ON(!sorted);
3799 	for (i = 0; i < ref->nritems; i++) {
3800 		sorted[i].bytenr = ref->extents[i].bytenr;
3801 		sorted[i].slot = i;
3802 	}
3803 	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3804 
3805 	/*
3806 	 * the items in the ref were stored in slot order in the ref
3807 	 * cache; the sort above reorders them by bytenr for the frees
3808 	 */
3809 	for (i = 0; i < ref->nritems; i++) {
3810 		info = ref->extents + sorted[i].slot;
3811 		ret = __btrfs_free_extent(trans, root, info->bytenr,
3812 					  info->num_bytes, ref->bytenr,
3813 					  ref->owner, ref->generation,
3814 					  info->objectid, 0);
3815 
3816 		atomic_inc(&root->fs_info->throttle_gen);
3817 		wake_up(&root->fs_info->transaction_throttle);
3818 		cond_resched();
3819 
3820 		BUG_ON(ret);
3822 	}
3823 
3824 	kfree(sorted);
3825 	return 0;
3826 }
3827 
3828 static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3829 				     u64 len, u32 *refs)
3830 {
3831 	int ret;
3832 
3833 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3834 	BUG_ON(ret);
3835 
3836 #if 0 /* some debugging code in case we see problems here */
3837 	/* if the refs count is one, it won't get increased again.  But
3838 	 * if the ref count is > 1, someone may be decreasing it at
3839 	 * the same time we are.
3840 	 */
3841 	if (*refs != 1) {
3842 		struct extent_buffer *eb = NULL;
3843 		eb = btrfs_find_create_tree_block(root, start, len);
3844 		if (eb)
3845 			btrfs_tree_lock(eb);
3846 
3847 		mutex_lock(&root->fs_info->alloc_mutex);
3848 		ret = lookup_extent_ref(NULL, root, start, len, refs);
3849 		BUG_ON(ret);
3850 		mutex_unlock(&root->fs_info->alloc_mutex);
3851 
3852 		if (eb) {
3853 			btrfs_tree_unlock(eb);
3854 			free_extent_buffer(eb);
3855 		}
3856 		if (*refs == 1) {
3857 			printk(KERN_ERR "btrfs block %llu went down to one "
3858 			       "during drop_snap\n", (unsigned long long)start);
3859 		}
3860 
3861 	}
3862 #endif
3863 
3864 	cond_resched();
3865 	return ret;
3866 }
3867 
3868 /*
3869  * this is used while deleting old snapshots, and it drops the refs
3870  * on a whole subtree starting from a level 1 node.
3871  *
3872  * The idea is to sort all the leaf pointers, and then drop the
3873  * ref on all the leaves in order.  Most of the time the leaves
3874  * will have ref cache entries, so no leaf IOs will be required to
3875  * find the extents they have references on.
3876  *
3877  * For each leaf, any references it has are also dropped in order.
3878  *
3879  * This ends up dropping the references in something close to optimal
3880  * order for reading and modifying the extent allocation tree.
3881  */
3882 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3883 					struct btrfs_root *root,
3884 					struct btrfs_path *path)
3885 {
3886 	u64 bytenr;
3887 	u64 root_owner;
3888 	u64 root_gen;
3889 	struct extent_buffer *eb = path->nodes[1];
3890 	struct extent_buffer *leaf;
3891 	struct btrfs_leaf_ref *ref;
3892 	struct refsort *sorted = NULL;
3893 	int nritems = btrfs_header_nritems(eb);
3894 	int ret;
3895 	int i;
3896 	int refi = 0;
3897 	int slot = path->slots[1];
3898 	u32 blocksize = btrfs_level_size(root, 0);
3899 	u32 refs;
3900 
3901 	if (nritems == 0)
3902 		goto out;
3903 
3904 	root_owner = btrfs_header_owner(eb);
3905 	root_gen = btrfs_header_generation(eb);
3906 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
	BUG_ON(!sorted);
3907 
3908 	/*
3909 	 * step one, sort all the leaf pointers so we don't scribble
3910 	 * randomly into the extent allocation tree
3911 	 */
3912 	for (i = slot; i < nritems; i++) {
3913 		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3914 		sorted[refi].slot = i;
3915 		refi++;
3916 	}
3917 
3918 	/*
3919 	 * nritems won't be zero, but if we're picking up drop_snapshot
3920 	 * after a crash, slot might be > 0, so double check things
3921 	 * just in case.
3922 	 */
3923 	if (refi == 0)
3924 		goto out;
3925 
3926 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3927 
3928 	/*
3929 	 * the first loop frees everything the leaves point to
3930 	 */
3931 	for (i = 0; i < refi; i++) {
3932 		u64 ptr_gen;
3933 
3934 		bytenr = sorted[i].bytenr;
3935 
3936 		/*
3937 		 * check the reference count on this leaf.  If it is > 1
3938 		 * we just decrement it below and don't update any
3939 		 * of the refs the leaf points to.
3940 		 */
3941 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3942 		BUG_ON(ret);
3943 		if (refs != 1)
3944 			continue;
3945 
3946 		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3947 
3948 		/*
3949 		 * the leaf only had one reference, which means the
3950 		 * only thing pointing to this leaf is the snapshot
3951 		 * we're deleting.  It isn't possible for the reference
3952 		 * count to increase again later
3953 		 *
3954 		 * The reference cache is checked for the leaf,
3955 		 * and if found we'll be able to drop any refs held by
3956 		 * the leaf without needing to read it in.
3957 		 */
3958 		ref = btrfs_lookup_leaf_ref(root, bytenr);
3959 		if (ref && ref->generation != ptr_gen) {
3960 			btrfs_free_leaf_ref(root, ref);
3961 			ref = NULL;
3962 		}
3963 		if (ref) {
3964 			ret = cache_drop_leaf_ref(trans, root, ref);
3965 			BUG_ON(ret);
3966 			btrfs_remove_leaf_ref(root, ref);
3967 			btrfs_free_leaf_ref(root, ref);
3968 		} else {
3969 			/*
3970 			 * the leaf wasn't in the reference cache, so
3971 			 * we have to read it.
3972 			 */
3973 			leaf = read_tree_block(root, bytenr, blocksize,
3974 					       ptr_gen);
3975 			ret = btrfs_drop_leaf_ref(trans, root, leaf);
3976 			BUG_ON(ret);
3977 			free_extent_buffer(leaf);
3978 		}
3979 		atomic_inc(&root->fs_info->throttle_gen);
3980 		wake_up(&root->fs_info->transaction_throttle);
3981 		cond_resched();
3982 	}
3983 
3984 	/*
3985 	 * run through the loop again to free the refs on the leaves.
3986 	 * This is faster than doing it in the loop above because
3987 	 * the leaves are likely to be clustered together.  We end up
3988 	 * working in nice chunks on the extent allocation tree.
3989 	 */
3990 	for (i = 0; i < refi; i++) {
3991 		bytenr = sorted[i].bytenr;
3992 		ret = __btrfs_free_extent(trans, root, bytenr,
3993 					blocksize, eb->start,
3994 					root_owner, root_gen, 0, 1);
3995 		BUG_ON(ret);
3996 
3997 		atomic_inc(&root->fs_info->throttle_gen);
3998 		wake_up(&root->fs_info->transaction_throttle);
3999 		cond_resched();
4000 	}
4001 out:
4002 	kfree(sorted);
4003 
4004 	/*
4005 	 * update the path to show we've processed the entire level 1
4006 	 * node.  This will get saved into the root's drop_snapshot_progress
4007 	 * field so these drops are not repeated again if this transaction
4008 	 * commits.
4009 	 */
4010 	path->slots[1] = nritems;
4011 	return 0;
4012 }
4013 
4014 /*
4015  * helper function for drop_snapshot, this walks down the tree dropping ref
4016  * counts as it goes.
4017  */
4018 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4019 				   struct btrfs_root *root,
4020 				   struct btrfs_path *path, int *level)
4021 {
4022 	u64 root_owner;
4023 	u64 root_gen;
4024 	u64 bytenr;
4025 	u64 ptr_gen;
4026 	struct extent_buffer *next;
4027 	struct extent_buffer *cur;
4028 	struct extent_buffer *parent;
4029 	u32 blocksize;
4030 	int ret;
4031 	u32 refs;
4032 
4033 	WARN_ON(*level < 0);
4034 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4035 	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
4036 				path->nodes[*level]->len, &refs);
4037 	BUG_ON(ret);
4038 	if (refs > 1)
4039 		goto out;
4040 
4041 	/*
4042 	 * walk down to the last node level and free all the leaves
4043 	 */
4044 	while (*level >= 0) {
4045 		WARN_ON(*level < 0);
4046 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
4047 		cur = path->nodes[*level];
4048 
4049 		if (btrfs_header_level(cur) != *level)
4050 			WARN_ON(1);
4051 
4052 		if (path->slots[*level] >=
4053 		    btrfs_header_nritems(cur))
4054 			break;
4055 
4056 		/* the new code goes down to level 1 and does all the
4057 		 * leaves pointed to by that node in bulk.  So, this check
4058 		 * for level 0 will always be false.
4059 		 *
4060 		 * But, the disk format allows the drop_snapshot_progress
4061 		 * field in the root to leave things in a state where
4062 		 * a leaf will need cleaning up here.  If someone crashes
4063 		 * with the old code and then boots with the new code,
4064 		 * we might find a leaf here.
4065 		 */
4066 		if (*level == 0) {
4067 			ret = btrfs_drop_leaf_ref(trans, root, cur);
4068 			BUG_ON(ret);
4069 			break;
4070 		}
4071 
4072 		/*
4073 		 * once we get to level one, process the whole node
4074 		 * at once, including everything below it.
4075 		 */
4076 		if (*level == 1) {
4077 			ret = drop_level_one_refs(trans, root, path);
4078 			BUG_ON(ret);
4079 			break;
4080 		}
4081 
4082 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4083 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4084 		blocksize = btrfs_level_size(root, *level - 1);
4085 
4086 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
4087 		BUG_ON(ret);
4088 
4089 		/*
4090 		 * if there is more than one reference, we don't need
4091 		 * to read that node to drop any references it has.  We
4092 		 * just drop the ref we hold on that node and move on to the
4093 		 * next slot in this level.
4094 		 */
4095 		if (refs != 1) {
4096 			parent = path->nodes[*level];
4097 			root_owner = btrfs_header_owner(parent);
4098 			root_gen = btrfs_header_generation(parent);
4099 			path->slots[*level]++;
4100 
4101 			ret = __btrfs_free_extent(trans, root, bytenr,
4102 						blocksize, parent->start,
4103 						root_owner, root_gen,
4104 						*level - 1, 1);
4105 			BUG_ON(ret);
4106 
4107 			atomic_inc(&root->fs_info->throttle_gen);
4108 			wake_up(&root->fs_info->transaction_throttle);
4109 			cond_resched();
4110 
4111 			continue;
4112 		}
4113 
4114 		/*
4115 		 * we need to keep freeing things in the next level down.
4116 		 * read the block and loop around to process it
4117 		 */
4118 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4119 		WARN_ON(*level <= 0);
4120 		if (path->nodes[*level-1])
4121 			free_extent_buffer(path->nodes[*level-1]);
4122 		path->nodes[*level-1] = next;
4123 		*level = btrfs_header_level(next);
4124 		path->slots[*level] = 0;
4125 		cond_resched();
4126 	}
4127 out:
4128 	WARN_ON(*level < 0);
4129 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4130 
4131 	if (path->nodes[*level] == root->node) {
4132 		parent = path->nodes[*level];
4133 		bytenr = path->nodes[*level]->start;
4134 	} else {
4135 		parent = path->nodes[*level + 1];
4136 		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4137 	}
4138 
4139 	blocksize = btrfs_level_size(root, *level);
4140 	root_owner = btrfs_header_owner(parent);
4141 	root_gen = btrfs_header_generation(parent);
4142 
4143 	/*
4144 	 * cleanup and free the reference on the last node
4145 	 * we processed
4146 	 */
4147 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
4148 				  parent->start, root_owner, root_gen,
4149 				  *level, 1);
4150 	free_extent_buffer(path->nodes[*level]);
4151 	path->nodes[*level] = NULL;
4152 
4153 	*level += 1;
4154 	BUG_ON(ret);
4155 
4156 	cond_resched();
4157 	return 0;
4158 }
4159 
4160 /*
4161  * helper function for drop_subtree, this function is similar to
4162  * walk_down_tree. The main difference is that it checks reference
4163  * counts while tree blocks are locked.
4164  */
4165 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
4166 				      struct btrfs_root *root,
4167 				      struct btrfs_path *path, int *level)
4168 {
4169 	struct extent_buffer *next;
4170 	struct extent_buffer *cur;
4171 	struct extent_buffer *parent;
4172 	u64 bytenr;
4173 	u64 ptr_gen;
4174 	u32 blocksize;
4175 	u32 refs;
4176 	int ret;
4177 
4178 	cur = path->nodes[*level];
4179 	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
4180 				      &refs);
4181 	BUG_ON(ret);
4182 	if (refs > 1)
4183 		goto out;
4184 
4185 	while (*level >= 0) {
4186 		cur = path->nodes[*level];
4187 		if (*level == 0) {
4188 			ret = btrfs_drop_leaf_ref(trans, root, cur);
4189 			BUG_ON(ret);
4190 			clean_tree_block(trans, root, cur);
4191 			break;
4192 		}
4193 		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
4194 			clean_tree_block(trans, root, cur);
4195 			break;
4196 		}
4197 
4198 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4199 		blocksize = btrfs_level_size(root, *level - 1);
4200 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4201 
4202 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4203 		btrfs_tree_lock(next);
4204 		btrfs_set_lock_blocking(next);
4205 
4206 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
4207 					      &refs);
4208 		BUG_ON(ret);
4209 		if (refs > 1) {
4210 			parent = path->nodes[*level];
4211 			ret = btrfs_free_extent(trans, root, bytenr,
4212 					blocksize, parent->start,
4213 					btrfs_header_owner(parent),
4214 					btrfs_header_generation(parent),
4215 					*level - 1, 1);
4216 			BUG_ON(ret);
4217 			path->slots[*level]++;
4218 			btrfs_tree_unlock(next);
4219 			free_extent_buffer(next);
4220 			continue;
4221 		}
4222 
4223 		*level = btrfs_header_level(next);
4224 		path->nodes[*level] = next;
4225 		path->slots[*level] = 0;
4226 		path->locks[*level] = 1;
4227 		cond_resched();
4228 	}
4229 out:
4230 	parent = path->nodes[*level + 1];
4231 	bytenr = path->nodes[*level]->start;
4232 	blocksize = path->nodes[*level]->len;
4233 
4234 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4235 			parent->start, btrfs_header_owner(parent),
4236 			btrfs_header_generation(parent), *level, 1);
4237 	BUG_ON(ret);
4238 
4239 	if (path->locks[*level]) {
4240 		btrfs_tree_unlock(path->nodes[*level]);
4241 		path->locks[*level] = 0;
4242 	}
4243 	free_extent_buffer(path->nodes[*level]);
4244 	path->nodes[*level] = NULL;
4245 	*level += 1;
4246 	cond_resched();
4247 	return 0;
4248 }
4249 
4250 /*
4251  * helper for dropping snapshots.  This walks back up the tree in the path
4252  * to find the first node higher up where we haven't yet gone through
4253  * all the slots
4254  */
4255 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
4256 				 struct btrfs_root *root,
4257 				 struct btrfs_path *path,
4258 				 int *level, int max_level)
4259 {
4260 	u64 root_owner;
4261 	u64 root_gen;
4262 	struct btrfs_root_item *root_item = &root->root_item;
4263 	int i;
4264 	int slot;
4265 	int ret;
4266 
4267 	for (i = *level; i < max_level && path->nodes[i]; i++) {
4268 		slot = path->slots[i];
4269 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
4270 			struct extent_buffer *node;
4271 			struct btrfs_disk_key disk_key;
4272 
4273 			/*
4274 			 * there is more work to do in this level.
4275 			 * Update the drop_progress marker to reflect
4276 			 * the work we've done so far, and then bump
4277 			 * the slot number
4278 			 */
4279 			node = path->nodes[i];
4280 			path->slots[i]++;
4281 			*level = i;
4282 			WARN_ON(*level == 0);
4283 			btrfs_node_key(node, &disk_key, path->slots[i]);
4284 			memcpy(&root_item->drop_progress,
4285 			       &disk_key, sizeof(disk_key));
4286 			root_item->drop_level = i;
4287 			return 0;
4288 		} else {
4289 			struct extent_buffer *parent;
4290 
4291 			/*
4292 			 * this whole node is done, free our reference
4293 			 * on it and go up one level
4294 			 */
4295 			if (path->nodes[*level] == root->node)
4296 				parent = path->nodes[*level];
4297 			else
4298 				parent = path->nodes[*level + 1];
4299 
4300 			root_owner = btrfs_header_owner(parent);
4301 			root_gen = btrfs_header_generation(parent);
4302 
4303 			clean_tree_block(trans, root, path->nodes[*level]);
4304 			ret = btrfs_free_extent(trans, root,
4305 						path->nodes[*level]->start,
4306 						path->nodes[*level]->len,
4307 						parent->start, root_owner,
4308 						root_gen, *level, 1);
4309 			BUG_ON(ret);
4310 			if (path->locks[*level]) {
4311 				btrfs_tree_unlock(path->nodes[*level]);
4312 				path->locks[*level] = 0;
4313 			}
4314 			free_extent_buffer(path->nodes[*level]);
4315 			path->nodes[*level] = NULL;
4316 			*level = i + 1;
4317 		}
4318 	}
4319 	return 1;
4320 }
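
/*
 * Note on the return convention: walk_up_tree() returns 0 when it stopped
 * at a node that still has unprocessed slots (after saving drop_progress),
 * and 1 once every level up to max_level has been finished.  The loop in
 * btrfs_drop_snapshot() below keys off that to decide when to stop.
 */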
4321 
4322 /*
4323  * drop the reference count on the tree rooted at 'snap'.  This traverses
4324  * the tree freeing any blocks that have a ref count of zero after being
4325  * decremented.
4326  */
4327 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4328 			*root)
4329 {
4330 	int ret = 0;
4331 	int wret;
4332 	int level;
4333 	struct btrfs_path *path;
4334 	int i;
4335 	int orig_level;
4336 	struct btrfs_root_item *root_item = &root->root_item;
4337 
4338 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
4339 	path = btrfs_alloc_path();
4340 	BUG_ON(!path);
4341 
4342 	level = btrfs_header_level(root->node);
4343 	orig_level = level;
4344 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
4345 		path->nodes[level] = root->node;
4346 		extent_buffer_get(root->node);
4347 		path->slots[level] = 0;
4348 	} else {
4349 		struct btrfs_key key;
4350 		struct btrfs_disk_key found_key;
4351 		struct extent_buffer *node;
4352 
4353 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
4354 		level = root_item->drop_level;
4355 		path->lowest_level = level;
4356 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4357 		if (wret < 0) {
4358 			ret = wret;
4359 			goto out;
4360 		}
4361 		node = path->nodes[level];
4362 		btrfs_node_key(node, &found_key, path->slots[level]);
4363 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
4364 			       sizeof(found_key)));
4365 		/*
4366 		 * unlock our path, this is safe because only this
4367 		 * function is allowed to delete this snapshot
4368 		 */
4369 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4370 			if (path->nodes[i] && path->locks[i]) {
4371 				path->locks[i] = 0;
4372 				btrfs_tree_unlock(path->nodes[i]);
4373 			}
4374 		}
4375 	}
4376 	while (1) {
4377 		wret = walk_down_tree(trans, root, path, &level);
4378 		if (wret > 0)
4379 			break;
4380 		if (wret < 0)
4381 			ret = wret;
4382 
4383 		wret = walk_up_tree(trans, root, path, &level,
4384 				    BTRFS_MAX_LEVEL);
4385 		if (wret > 0)
4386 			break;
4387 		if (wret < 0)
4388 			ret = wret;
4389 		if (trans->transaction->in_commit) {
4390 			ret = -EAGAIN;
4391 			break;
4392 		}
4393 		atomic_inc(&root->fs_info->throttle_gen);
4394 		wake_up(&root->fs_info->transaction_throttle);
4395 	}
4396 	for (i = 0; i <= orig_level; i++) {
4397 		if (path->nodes[i]) {
4398 			free_extent_buffer(path->nodes[i]);
4399 			path->nodes[i] = NULL;
4400 		}
4401 	}
4402 out:
4403 	btrfs_free_path(path);
4404 	return ret;
4405 }
4406 
4407 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4408 			struct btrfs_root *root,
4409 			struct extent_buffer *node,
4410 			struct extent_buffer *parent)
4411 {
4412 	struct btrfs_path *path;
4413 	int level;
4414 	int parent_level;
4415 	int ret = 0;
4416 	int wret;
4417 
4418 	path = btrfs_alloc_path();
4419 	BUG_ON(!path);
4420 
4421 	BUG_ON(!btrfs_tree_locked(parent));
4422 	parent_level = btrfs_header_level(parent);
4423 	extent_buffer_get(parent);
4424 	path->nodes[parent_level] = parent;
4425 	path->slots[parent_level] = btrfs_header_nritems(parent);
4426 
4427 	BUG_ON(!btrfs_tree_locked(node));
4428 	level = btrfs_header_level(node);
4429 	extent_buffer_get(node);
4430 	path->nodes[level] = node;
4431 	path->slots[level] = 0;
4432 
4433 	while (1) {
4434 		wret = walk_down_subtree(trans, root, path, &level);
4435 		if (wret < 0)
4436 			ret = wret;
4437 		if (wret != 0)
4438 			break;
4439 
4440 		wret = walk_up_tree(trans, root, path, &level, parent_level);
4441 		if (wret < 0)
4442 			ret = wret;
4443 		if (wret != 0)
4444 			break;
4445 	}
4446 
4447 	btrfs_free_path(path);
4448 	return ret;
4449 }
4450 
4451 static unsigned long calc_ra(unsigned long start, unsigned long last,
4452 			     unsigned long nr)
4453 {
4454 	return min(last, start + nr - 1);
4455 }
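
/*
 * e.g. with ra->ra_pages == 32, calc_ra(0, 99, 32) == 31, so the window
 * covers pages 0-31; near the end calc_ra(96, 99, 32) == 99 clamps the
 * readahead window to the last page of the range.
 */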
4456 
4457 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
4458 					 u64 len)
4459 {
4460 	u64 page_start;
4461 	u64 page_end;
4462 	unsigned long first_index;
4463 	unsigned long last_index;
4464 	unsigned long i;
4465 	struct page *page;
4466 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4467 	struct file_ra_state *ra;
4468 	struct btrfs_ordered_extent *ordered;
4469 	unsigned int total_read = 0;
4470 	unsigned int total_dirty = 0;
4471 	int ret = 0;
4472 
4473 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;
4474 
4475 	mutex_lock(&inode->i_mutex);
4476 	first_index = start >> PAGE_CACHE_SHIFT;
4477 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
4478 
4479 	/* make sure the dirty trick played by the caller works */
4480 	ret = invalidate_inode_pages2_range(inode->i_mapping,
4481 					    first_index, last_index);
4482 	if (ret)
4483 		goto out_unlock;
4484 
4485 	file_ra_state_init(ra, inode->i_mapping);
4486 
4487 	for (i = first_index; i <= last_index; i++) {
4488 		if (total_read % ra->ra_pages == 0) {
4489 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
4490 				       calc_ra(i, last_index, ra->ra_pages));
4491 		}
4492 		total_read++;
4493 again:
4494 		BUG_ON(((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode));
4496 		page = grab_cache_page(inode->i_mapping, i);
4497 		if (!page) {
4498 			ret = -ENOMEM;
4499 			goto out_unlock;
4500 		}
4501 		if (!PageUptodate(page)) {
4502 			btrfs_readpage(NULL, page);
4503 			lock_page(page);
4504 			if (!PageUptodate(page)) {
4505 				unlock_page(page);
4506 				page_cache_release(page);
4507 				ret = -EIO;
4508 				goto out_unlock;
4509 			}
4510 		}
4511 		wait_on_page_writeback(page);
4512 
4513 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
4514 		page_end = page_start + PAGE_CACHE_SIZE - 1;
4515 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4516 
4517 		ordered = btrfs_lookup_ordered_extent(inode, page_start);
4518 		if (ordered) {
4519 			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4520 			unlock_page(page);
4521 			page_cache_release(page);
4522 			btrfs_start_ordered_extent(inode, ordered, 1);
4523 			btrfs_put_ordered_extent(ordered);
4524 			goto again;
4525 		}
4526 		set_page_extent_mapped(page);
4527 
4528 		if (i == first_index)
4529 			set_extent_bits(io_tree, page_start, page_end,
4530 					EXTENT_BOUNDARY, GFP_NOFS);
4531 		btrfs_set_extent_delalloc(inode, page_start, page_end);
4532 
4533 		set_page_dirty(page);
4534 		total_dirty++;
4535 
4536 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4537 		unlock_page(page);
4538 		page_cache_release(page);
4539 	}
4540 
4541 out_unlock:
4542 	kfree(ra);
4543 	mutex_unlock(&inode->i_mutex);
4544 	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4545 	return ret;
4546 }
4547 
4548 static noinline int relocate_data_extent(struct inode *reloc_inode,
4549 					 struct btrfs_key *extent_key,
4550 					 u64 offset)
4551 {
4552 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4553 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4554 	struct extent_map *em;
4555 	u64 start = extent_key->objectid - offset;
4556 	u64 end = start + extent_key->offset - 1;
4557 
4558 	em = alloc_extent_map(GFP_NOFS);
4559 	BUG_ON(!em || IS_ERR(em));
4560 
4561 	em->start = start;
4562 	em->len = extent_key->offset;
4563 	em->block_len = extent_key->offset;
4564 	em->block_start = extent_key->objectid;
4565 	em->bdev = root->fs_info->fs_devices->latest_bdev;
4566 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
4567 
4568 	/* setup extent map to cheat btrfs_readpage */
4569 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4570 	while (1) {
4571 		int ret;
4572 		spin_lock(&em_tree->lock);
4573 		ret = add_extent_mapping(em_tree, em);
4574 		spin_unlock(&em_tree->lock);
4575 		if (ret != -EEXIST) {
4576 			free_extent_map(em);
4577 			break;
4578 		}
4579 		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4580 	}
4581 	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4582 
4583 	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4584 }
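
/*
 * Worked example of the offset math above (values illustrative): callers
 * pass the block group's start bytenr as 'offset', so an extent of length
 * 128K at disk bytenr offset + 256K maps to the file range
 * [256K, 256K + 128K - 1] of the reloc inode.
 */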
4585 
4586 struct btrfs_ref_path {
4587 	u64 extent_start;
4588 	u64 nodes[BTRFS_MAX_LEVEL];
4589 	u64 root_objectid;
4590 	u64 root_generation;
4591 	u64 owner_objectid;
4592 	u32 num_refs;
4593 	int lowest_level;
4594 	int current_level;
4595 	int shared_level;
4596 
4597 	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4598 	u64 new_nodes[BTRFS_MAX_LEVEL];
4599 };
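
/*
 * Example (sketch) of how btrfs_ref_path is filled in for a data extent
 * once __next_ref_path() below finds one referencing tree:
 *
 *	extent_start  = bytenr of the data extent
 *	lowest_level  = -1 (a data extent, not a tree block)
 *	nodes[0]      = bytenr of the leaf holding the file extent item
 *	nodes[1..n]   = bytenrs of the nodes above it
 *	current_level = n
 *
 * root_objectid/root_generation come from the topmost backref found.
 */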
4600 
4601 struct disk_extent {
4602 	u64 ram_bytes;
4603 	u64 disk_bytenr;
4604 	u64 disk_num_bytes;
4605 	u64 offset;
4606 	u64 num_bytes;
4607 	u8 compression;
4608 	u8 encryption;
4609 	u16 other_encoding;
4610 };
4611 
4612 static int is_cowonly_root(u64 root_objectid)
4613 {
4614 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4615 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4616 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4617 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4618 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4619 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4620 		return 1;
4621 	return 0;
4622 }
4623 
4624 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4625 				    struct btrfs_root *extent_root,
4626 				    struct btrfs_ref_path *ref_path,
4627 				    int first_time)
4628 {
4629 	struct extent_buffer *leaf;
4630 	struct btrfs_path *path;
4631 	struct btrfs_extent_ref *ref;
4632 	struct btrfs_key key;
4633 	struct btrfs_key found_key;
4634 	u64 bytenr;
4635 	u32 nritems;
4636 	int level;
4637 	int ret = 1;
4638 
4639 	path = btrfs_alloc_path();
4640 	if (!path)
4641 		return -ENOMEM;
4642 
4643 	if (first_time) {
4644 		ref_path->lowest_level = -1;
4645 		ref_path->current_level = -1;
4646 		ref_path->shared_level = -1;
4647 		goto walk_up;
4648 	}
4649 walk_down:
4650 	level = ref_path->current_level - 1;
4651 	while (level >= -1) {
4652 		u64 parent;
4653 		if (level < ref_path->lowest_level)
4654 			break;
4655 
4656 		if (level >= 0)
4657 			bytenr = ref_path->nodes[level];
4658 		else
4659 			bytenr = ref_path->extent_start;
4660 		BUG_ON(bytenr == 0);
4661 
4662 		parent = ref_path->nodes[level + 1];
4663 		ref_path->nodes[level + 1] = 0;
4664 		ref_path->current_level = level;
4665 		BUG_ON(parent == 0);
4666 
4667 		key.objectid = bytenr;
4668 		key.offset = parent + 1;
4669 		key.type = BTRFS_EXTENT_REF_KEY;
4670 
4671 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4672 		if (ret < 0)
4673 			goto out;
4674 		BUG_ON(ret == 0);
4675 
4676 		leaf = path->nodes[0];
4677 		nritems = btrfs_header_nritems(leaf);
4678 		if (path->slots[0] >= nritems) {
4679 			ret = btrfs_next_leaf(extent_root, path);
4680 			if (ret < 0)
4681 				goto out;
4682 			if (ret > 0)
4683 				goto next;
4684 			leaf = path->nodes[0];
4685 		}
4686 
4687 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4688 		if (found_key.objectid == bytenr &&
4689 		    found_key.type == BTRFS_EXTENT_REF_KEY) {
4690 			if (level < ref_path->shared_level)
4691 				ref_path->shared_level = level;
4692 			goto found;
4693 		}
4694 next:
4695 		level--;
4696 		btrfs_release_path(extent_root, path);
4697 		cond_resched();
4698 	}
4699 	/* reached lowest level */
4700 	ret = 1;
4701 	goto out;
4702 walk_up:
4703 	level = ref_path->current_level;
4704 	while (level < BTRFS_MAX_LEVEL - 1) {
4705 		u64 ref_objectid;
4706 
4707 		if (level >= 0)
4708 			bytenr = ref_path->nodes[level];
4709 		else
4710 			bytenr = ref_path->extent_start;
4711 
4712 		BUG_ON(bytenr == 0);
4713 
4714 		key.objectid = bytenr;
4715 		key.offset = 0;
4716 		key.type = BTRFS_EXTENT_REF_KEY;
4717 
4718 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4719 		if (ret < 0)
4720 			goto out;
4721 
4722 		leaf = path->nodes[0];
4723 		nritems = btrfs_header_nritems(leaf);
4724 		if (path->slots[0] >= nritems) {
4725 			ret = btrfs_next_leaf(extent_root, path);
4726 			if (ret < 0)
4727 				goto out;
4728 			if (ret > 0) {
4729 				/* the extent was freed by someone */
4730 				if (ref_path->lowest_level == level)
4731 					goto out;
4732 				btrfs_release_path(extent_root, path);
4733 				goto walk_down;
4734 			}
4735 			leaf = path->nodes[0];
4736 		}
4737 
4738 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4739 		if (found_key.objectid != bytenr ||
4740 				found_key.type != BTRFS_EXTENT_REF_KEY) {
4741 			/* the extent was freed by someone */
4742 			if (ref_path->lowest_level == level) {
4743 				ret = 1;
4744 				goto out;
4745 			}
4746 			btrfs_release_path(extent_root, path);
4747 			goto walk_down;
4748 		}
4749 found:
4750 		ref = btrfs_item_ptr(leaf, path->slots[0],
4751 				struct btrfs_extent_ref);
4752 		ref_objectid = btrfs_ref_objectid(leaf, ref);
4753 		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4754 			if (first_time) {
4755 				level = (int)ref_objectid;
4756 				BUG_ON(level >= BTRFS_MAX_LEVEL);
4757 				ref_path->lowest_level = level;
4758 				ref_path->current_level = level;
4759 				ref_path->nodes[level] = bytenr;
4760 			} else {
4761 				WARN_ON(ref_objectid != level);
4762 			}
4763 		} else {
4764 			WARN_ON(level != -1);
4765 		}
4766 		first_time = 0;
4767 
4768 		if (ref_path->lowest_level == level) {
4769 			ref_path->owner_objectid = ref_objectid;
4770 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4771 		}
4772 
4773 		/*
4774 		 * the block is a tree root or the block isn't in a
4775 		 * reference counted tree.
4776 		 */
4777 		if (found_key.objectid == found_key.offset ||
4778 		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4779 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4780 			ref_path->root_generation =
4781 				btrfs_ref_generation(leaf, ref);
4782 			if (level < 0) {
4783 				/* special reference from the tree log */
4784 				ref_path->nodes[0] = found_key.offset;
4785 				ref_path->current_level = 0;
4786 			}
4787 			ret = 0;
4788 			goto out;
4789 		}
4790 
4791 		level++;
4792 		BUG_ON(ref_path->nodes[level] != 0);
4793 		ref_path->nodes[level] = found_key.offset;
4794 		ref_path->current_level = level;
4795 
4796 		/*
4797 		 * the reference was created in the running transaction,
4798 		 * no need to continue walking up.
4799 		 */
4800 		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4801 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4802 			ref_path->root_generation =
4803 				btrfs_ref_generation(leaf, ref);
4804 			ret = 0;
4805 			goto out;
4806 		}
4807 
4808 		btrfs_release_path(extent_root, path);
4809 		cond_resched();
4810 	}
4811 	/* reached max tree level, but no tree root found. */
4812 	BUG();
4813 out:
4814 	btrfs_free_path(path);
4815 	return ret;
4816 }
4817 
4818 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4819 				struct btrfs_root *extent_root,
4820 				struct btrfs_ref_path *ref_path,
4821 				u64 extent_start)
4822 {
4823 	memset(ref_path, 0, sizeof(*ref_path));
4824 	ref_path->extent_start = extent_start;
4825 
4826 	return __next_ref_path(trans, extent_root, ref_path, 1);
4827 }
4828 
4829 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4830 			       struct btrfs_root *extent_root,
4831 			       struct btrfs_ref_path *ref_path)
4832 {
4833 	return __next_ref_path(trans, extent_root, ref_path, 0);
4834 }
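
/*
 * Illustrative sketch, not built: the intended iteration pattern, which
 * mirrors the relocation code later in this file.  process_one_reference()
 * is a hypothetical stand-in for the per-reference work:
 */
#if 0
	ret = btrfs_first_ref_path(trans, extent_root, ref_path,
				   extent_key->objectid);
	while (1) {
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;	/* no more referencing trees */
		process_one_reference(ref_path);
		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
	}
#endif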
4835 
4836 static noinline int get_new_locations(struct inode *reloc_inode,
4837 				      struct btrfs_key *extent_key,
4838 				      u64 offset, int no_fragment,
4839 				      struct disk_extent **extents,
4840 				      int *nr_extents)
4841 {
4842 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4843 	struct btrfs_path *path;
4844 	struct btrfs_file_extent_item *fi;
4845 	struct extent_buffer *leaf;
4846 	struct disk_extent *exts = *extents;
4847 	struct btrfs_key found_key;
4848 	u64 cur_pos;
4849 	u64 last_byte;
4850 	u32 nritems;
4851 	int nr = 0;
4852 	int max = *nr_extents;
4853 	int ret;
4854 
4855 	WARN_ON(!no_fragment && *extents);
4856 	if (!exts) {
4857 		max = 1;
4858 		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4859 		if (!exts)
4860 			return -ENOMEM;
4861 	}
4862 
4863 	path = btrfs_alloc_path();
4864 	BUG_ON(!path);
4865 
4866 	cur_pos = extent_key->objectid - offset;
4867 	last_byte = extent_key->objectid + extent_key->offset;
4868 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4869 				       cur_pos, 0);
4870 	if (ret < 0)
4871 		goto out;
4872 	if (ret > 0) {
4873 		ret = -ENOENT;
4874 		goto out;
4875 	}
4876 
4877 	while (1) {
4878 		leaf = path->nodes[0];
4879 		nritems = btrfs_header_nritems(leaf);
4880 		if (path->slots[0] >= nritems) {
4881 			ret = btrfs_next_leaf(root, path);
4882 			if (ret < 0)
4883 				goto out;
4884 			if (ret > 0)
4885 				break;
4886 			leaf = path->nodes[0];
4887 		}
4888 
4889 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4890 		if (found_key.offset != cur_pos ||
4891 		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
4892 		    found_key.objectid != reloc_inode->i_ino)
4893 			break;
4894 
4895 		fi = btrfs_item_ptr(leaf, path->slots[0],
4896 				    struct btrfs_file_extent_item);
4897 		if (btrfs_file_extent_type(leaf, fi) !=
4898 		    BTRFS_FILE_EXTENT_REG ||
4899 		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4900 			break;
4901 
4902 		if (nr == max) {
4903 			struct disk_extent *old = exts;
4904 			max *= 2;
4905 			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
			BUG_ON(!exts);
4906 			memcpy(exts, old, sizeof(*exts) * nr);
4907 			if (old != *extents)
4908 				kfree(old);
4909 		}
4910 
4911 		exts[nr].disk_bytenr =
4912 			btrfs_file_extent_disk_bytenr(leaf, fi);
4913 		exts[nr].disk_num_bytes =
4914 			btrfs_file_extent_disk_num_bytes(leaf, fi);
4915 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4916 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4917 		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4918 		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4919 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4920 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4921 									   fi);
4922 		BUG_ON(exts[nr].offset > 0);
4923 		BUG_ON(exts[nr].compression || exts[nr].encryption);
4924 		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4925 
4926 		cur_pos += exts[nr].num_bytes;
4927 		nr++;
4928 
4929 		if (cur_pos + offset >= last_byte)
4930 			break;
4931 
4932 		if (no_fragment) {
4933 			ret = 1;
4934 			goto out;
4935 		}
4936 		path->slots[0]++;
4937 	}
4938 
4939 	BUG_ON(cur_pos + offset > last_byte);
4940 	if (cur_pos + offset < last_byte) {
4941 		ret = -ENOENT;
4942 		goto out;
4943 	}
4944 	ret = 0;
4945 out:
4946 	btrfs_free_path(path);
4947 	if (ret) {
4948 		if (exts != *extents)
4949 			kfree(exts);
4950 	} else {
4951 		*extents = exts;
4952 		*nr_extents = nr;
4953 	}
4954 	return ret;
4955 }
4956 
4957 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4958 					struct btrfs_root *root,
4959 					struct btrfs_path *path,
4960 					struct btrfs_key *extent_key,
4961 					struct btrfs_key *leaf_key,
4962 					struct btrfs_ref_path *ref_path,
4963 					struct disk_extent *new_extents,
4964 					int nr_extents)
4965 {
4966 	struct extent_buffer *leaf;
4967 	struct btrfs_file_extent_item *fi;
4968 	struct inode *inode = NULL;
4969 	struct btrfs_key key;
4970 	u64 lock_start = 0;
4971 	u64 lock_end = 0;
4972 	u64 num_bytes;
4973 	u64 ext_offset;
4974 	u64 search_end = (u64)-1;
4975 	u32 nritems;
4976 	int nr_scanned = 0;
4977 	int extent_locked = 0;
4978 	int extent_type;
4979 	int ret;
4980 
4981 	memcpy(&key, leaf_key, sizeof(key));
4982 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4983 		if (key.objectid < ref_path->owner_objectid ||
4984 		    (key.objectid == ref_path->owner_objectid &&
4985 		     key.type < BTRFS_EXTENT_DATA_KEY)) {
4986 			key.objectid = ref_path->owner_objectid;
4987 			key.type = BTRFS_EXTENT_DATA_KEY;
4988 			key.offset = 0;
4989 		}
4990 	}
4991 
4992 	while (1) {
4993 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4994 		if (ret < 0)
4995 			goto out;
4996 
4997 		leaf = path->nodes[0];
4998 		nritems = btrfs_header_nritems(leaf);
4999 next:
5000 		if (extent_locked && ret > 0) {
5001 			/*
5002 			 * the file extent item was modified by someone
5003 			 * before the extent got locked.
5004 			 */
5005 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5006 				      lock_end, GFP_NOFS);
5007 			extent_locked = 0;
5008 		}
5009 
5010 		if (path->slots[0] >= nritems) {
5011 			if (++nr_scanned > 2)
5012 				break;
5013 
5014 			BUG_ON(extent_locked);
5015 			ret = btrfs_next_leaf(root, path);
5016 			if (ret < 0)
5017 				goto out;
5018 			if (ret > 0)
5019 				break;
5020 			leaf = path->nodes[0];
5021 			nritems = btrfs_header_nritems(leaf);
5022 		}
5023 
5024 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5025 
5026 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5027 			if ((key.objectid > ref_path->owner_objectid) ||
5028 			    (key.objectid == ref_path->owner_objectid &&
5029 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
5030 			    key.offset >= search_end)
5031 				break;
5032 		}
5033 
5034 		if (inode && key.objectid != inode->i_ino) {
5035 			BUG_ON(extent_locked);
5036 			btrfs_release_path(root, path);
5037 			mutex_unlock(&inode->i_mutex);
5038 			iput(inode);
5039 			inode = NULL;
5040 			continue;
5041 		}
5042 
5043 		if (key.type != BTRFS_EXTENT_DATA_KEY) {
5044 			path->slots[0]++;
5045 			ret = 1;
5046 			goto next;
5047 		}
5048 		fi = btrfs_item_ptr(leaf, path->slots[0],
5049 				    struct btrfs_file_extent_item);
5050 		extent_type = btrfs_file_extent_type(leaf, fi);
5051 		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
5052 		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
5053 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
5054 		     extent_key->objectid)) {
5055 			path->slots[0]++;
5056 			ret = 1;
5057 			goto next;
5058 		}
5059 
5060 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5061 		ext_offset = btrfs_file_extent_offset(leaf, fi);
5062 
5063 		if (search_end == (u64)-1) {
5064 			search_end = key.offset - ext_offset +
5065 				btrfs_file_extent_ram_bytes(leaf, fi);
5066 		}
5067 
5068 		if (!extent_locked) {
5069 			lock_start = key.offset;
5070 			lock_end = lock_start + num_bytes - 1;
5071 		} else {
5072 			if (lock_start > key.offset ||
5073 			    lock_end + 1 < key.offset + num_bytes) {
5074 				unlock_extent(&BTRFS_I(inode)->io_tree,
5075 					      lock_start, lock_end, GFP_NOFS);
5076 				extent_locked = 0;
5077 			}
5078 		}
5079 
5080 		if (!inode) {
5081 			btrfs_release_path(root, path);
5082 
5083 			inode = btrfs_iget_locked(root->fs_info->sb,
5084 						  key.objectid, root);
5085 			if (inode->i_state & I_NEW) {
5086 				BTRFS_I(inode)->root = root;
5087 				BTRFS_I(inode)->location.objectid =
5088 					key.objectid;
5089 				BTRFS_I(inode)->location.type =
5090 					BTRFS_INODE_ITEM_KEY;
5091 				BTRFS_I(inode)->location.offset = 0;
5092 				btrfs_read_locked_inode(inode);
5093 				unlock_new_inode(inode);
5094 			}
5095 			/*
5096 			 * some code paths call btrfs_commit_transaction while
5097 			 * holding the i_mutex, so we can't use mutex_lock
5098 			 * here.
5099 			 */
5100 			if (is_bad_inode(inode) ||
5101 			    !mutex_trylock(&inode->i_mutex)) {
5102 				iput(inode);
5103 				inode = NULL;
5104 				key.offset = (u64)-1;
5105 				goto skip;
5106 			}
5107 		}
5108 
5109 		if (!extent_locked) {
5110 			struct btrfs_ordered_extent *ordered;
5111 
5112 			btrfs_release_path(root, path);
5113 
5114 			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5115 				    lock_end, GFP_NOFS);
5116 			ordered = btrfs_lookup_first_ordered_extent(inode,
5117 								    lock_end);
5118 			if (ordered &&
5119 			    ordered->file_offset <= lock_end &&
5120 			    ordered->file_offset + ordered->len > lock_start) {
5121 				unlock_extent(&BTRFS_I(inode)->io_tree,
5122 					      lock_start, lock_end, GFP_NOFS);
5123 				btrfs_start_ordered_extent(inode, ordered, 1);
5124 				btrfs_put_ordered_extent(ordered);
5125 				key.offset += num_bytes;
5126 				goto skip;
5127 			}
5128 			if (ordered)
5129 				btrfs_put_ordered_extent(ordered);
5130 
5131 			extent_locked = 1;
5132 			continue;
5133 		}
5134 
5135 		if (nr_extents == 1) {
5136 			/* update extent pointer in place */
5137 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
5138 						new_extents[0].disk_bytenr);
5139 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5140 						new_extents[0].disk_num_bytes);
5141 			btrfs_mark_buffer_dirty(leaf);
5142 
5143 			btrfs_drop_extent_cache(inode, key.offset,
5144 						key.offset + num_bytes - 1, 0);
5145 
5146 			ret = btrfs_inc_extent_ref(trans, root,
5147 						new_extents[0].disk_bytenr,
5148 						new_extents[0].disk_num_bytes,
5149 						leaf->start,
5150 						root->root_key.objectid,
5151 						trans->transid,
5152 						key.objectid);
5153 			BUG_ON(ret);
5154 
5155 			ret = btrfs_free_extent(trans, root,
5156 						extent_key->objectid,
5157 						extent_key->offset,
5158 						leaf->start,
5159 						btrfs_header_owner(leaf),
5160 						btrfs_header_generation(leaf),
5161 						key.objectid, 0);
5162 			BUG_ON(ret);
5163 
5164 			btrfs_release_path(root, path);
5165 			key.offset += num_bytes;
5166 		} else {
5167 			BUG_ON(1);
5168 #if 0
5169 			u64 alloc_hint;
5170 			u64 extent_len;
5171 			int i;
5172 			/*
5173 			 * drop the old extent pointer first, then insert the
5174 			 * new pointers one by one
5175 			 */
5176 			btrfs_release_path(root, path);
5177 			ret = btrfs_drop_extents(trans, root, inode, key.offset,
5178 						 key.offset + num_bytes,
5179 						 key.offset, &alloc_hint);
5180 			BUG_ON(ret);
5181 
5182 			for (i = 0; i < nr_extents; i++) {
5183 				if (ext_offset >= new_extents[i].num_bytes) {
5184 					ext_offset -= new_extents[i].num_bytes;
5185 					continue;
5186 				}
5187 				extent_len = min(new_extents[i].num_bytes -
5188 						 ext_offset, num_bytes);
5189 
5190 				ret = btrfs_insert_empty_item(trans, root,
5191 							      path, &key,
5192 							      sizeof(*fi));
5193 				BUG_ON(ret);
5194 
5195 				leaf = path->nodes[0];
5196 				fi = btrfs_item_ptr(leaf, path->slots[0],
5197 						struct btrfs_file_extent_item);
5198 				btrfs_set_file_extent_generation(leaf, fi,
5199 							trans->transid);
5200 				btrfs_set_file_extent_type(leaf, fi,
5201 							BTRFS_FILE_EXTENT_REG);
5202 				btrfs_set_file_extent_disk_bytenr(leaf, fi,
5203 						new_extents[i].disk_bytenr);
5204 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5205 						new_extents[i].disk_num_bytes);
5206 				btrfs_set_file_extent_ram_bytes(leaf, fi,
5207 						new_extents[i].ram_bytes);
5208 
5209 				btrfs_set_file_extent_compression(leaf, fi,
5210 						new_extents[i].compression);
5211 				btrfs_set_file_extent_encryption(leaf, fi,
5212 						new_extents[i].encryption);
5213 				btrfs_set_file_extent_other_encoding(leaf, fi,
5214 						new_extents[i].other_encoding);
5215 
5216 				btrfs_set_file_extent_num_bytes(leaf, fi,
5217 							extent_len);
5218 				ext_offset += new_extents[i].offset;
5219 				btrfs_set_file_extent_offset(leaf, fi,
5220 							ext_offset);
5221 				btrfs_mark_buffer_dirty(leaf);
5222 
5223 				btrfs_drop_extent_cache(inode, key.offset,
5224 						key.offset + extent_len - 1, 0);
5225 
5226 				ret = btrfs_inc_extent_ref(trans, root,
5227 						new_extents[i].disk_bytenr,
5228 						new_extents[i].disk_num_bytes,
5229 						leaf->start,
5230 						root->root_key.objectid,
5231 						trans->transid, key.objectid);
5232 				BUG_ON(ret);
5233 				btrfs_release_path(root, path);
5234 
5235 				inode_add_bytes(inode, extent_len);
5236 
5237 				ext_offset = 0;
5238 				num_bytes -= extent_len;
5239 				key.offset += extent_len;
5240 
5241 				if (num_bytes == 0)
5242 					break;
5243 			}
5244 			BUG_ON(i >= nr_extents);
5245 #endif
5246 		}
5247 
5248 		if (extent_locked) {
5249 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5250 				      lock_end, GFP_NOFS);
5251 			extent_locked = 0;
5252 		}
5253 skip:
5254 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
5255 		    key.offset >= search_end)
5256 			break;
5257 
5258 		cond_resched();
5259 	}
5260 	ret = 0;
5261 out:
5262 	btrfs_release_path(root, path);
5263 	if (inode) {
5264 		mutex_unlock(&inode->i_mutex);
5265 		if (extent_locked) {
5266 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5267 				      lock_end, GFP_NOFS);
5268 		}
5269 		iput(inode);
5270 	}
5271 	return ret;
5272 }
5273 
5274 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
5275 			       struct btrfs_root *root,
5276 			       struct extent_buffer *buf, u64 orig_start)
5277 {
5278 	int level;
5279 	int ret;
5280 
5281 	BUG_ON(btrfs_header_generation(buf) != trans->transid);
5282 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5283 
5284 	level = btrfs_header_level(buf);
5285 	if (level == 0) {
5286 		struct btrfs_leaf_ref *ref;
5287 		struct btrfs_leaf_ref *orig_ref;
5288 
5289 		orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
5290 		if (!orig_ref)
5291 			return -ENOENT;
5292 
5293 		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
5294 		if (!ref) {
5295 			btrfs_free_leaf_ref(root, orig_ref);
5296 			return -ENOMEM;
5297 		}
5298 
5299 		ref->nritems = orig_ref->nritems;
5300 		memcpy(ref->extents, orig_ref->extents,
5301 			sizeof(ref->extents[0]) * ref->nritems);
5302 
5303 		btrfs_free_leaf_ref(root, orig_ref);
5304 
5305 		ref->root_gen = trans->transid;
5306 		ref->bytenr = buf->start;
5307 		ref->owner = btrfs_header_owner(buf);
5308 		ref->generation = btrfs_header_generation(buf);
5309 
5310 		ret = btrfs_add_leaf_ref(root, ref, 0);
5311 		WARN_ON(ret);
5312 		btrfs_free_leaf_ref(root, ref);
5313 	}
5314 	return 0;
5315 }
5316 
5317 static noinline int invalidate_extent_cache(struct btrfs_root *root,
5318 					struct extent_buffer *leaf,
5319 					struct btrfs_block_group_cache *group,
5320 					struct btrfs_root *target_root)
5321 {
5322 	struct btrfs_key key;
5323 	struct inode *inode = NULL;
5324 	struct btrfs_file_extent_item *fi;
5325 	u64 num_bytes;
5326 	u64 skip_objectid = 0;
5327 	u32 nritems;
5328 	u32 i;
5329 
5330 	nritems = btrfs_header_nritems(leaf);
5331 	for (i = 0; i < nritems; i++) {
5332 		btrfs_item_key_to_cpu(leaf, &key, i);
5333 		if (key.objectid == skip_objectid ||
5334 		    key.type != BTRFS_EXTENT_DATA_KEY)
5335 			continue;
5336 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
5337 		if (btrfs_file_extent_type(leaf, fi) ==
5338 		    BTRFS_FILE_EXTENT_INLINE)
5339 			continue;
5340 		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
5341 			continue;
5342 		if (!inode || inode->i_ino != key.objectid) {
5343 			iput(inode);
5344 			inode = btrfs_ilookup(target_root->fs_info->sb,
5345 					      key.objectid, target_root, 1);
5346 		}
5347 		if (!inode) {
5348 			skip_objectid = key.objectid;
5349 			continue;
5350 		}
5351 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5352 
5353 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5354 			    key.offset + num_bytes - 1, GFP_NOFS);
5355 		btrfs_drop_extent_cache(inode, key.offset,
5356 					key.offset + num_bytes - 1, 1);
5357 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5358 			      key.offset + num_bytes - 1, GFP_NOFS);
5359 		cond_resched();
5360 	}
5361 	iput(inode);
5362 	return 0;
5363 }
5364 
5365 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5366 					struct btrfs_root *root,
5367 					struct extent_buffer *leaf,
5368 					struct btrfs_block_group_cache *group,
5369 					struct inode *reloc_inode)
5370 {
5371 	struct btrfs_key key;
5372 	struct btrfs_key extent_key;
5373 	struct btrfs_file_extent_item *fi;
5374 	struct btrfs_leaf_ref *ref;
5375 	struct disk_extent *new_extent;
5376 	u64 bytenr;
5377 	u64 num_bytes;
5378 	u32 nritems;
5379 	u32 i;
5380 	int ext_index;
5381 	int nr_extent;
5382 	int ret;
5383 
5384 	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
5385 	BUG_ON(!new_extent);
5386 
5387 	ref = btrfs_lookup_leaf_ref(root, leaf->start);
5388 	BUG_ON(!ref);
5389 
5390 	ext_index = -1;
5391 	nritems = btrfs_header_nritems(leaf);
5392 	for (i = 0; i < nritems; i++) {
5393 		btrfs_item_key_to_cpu(leaf, &key, i);
5394 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
5395 			continue;
5396 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
5397 		if (btrfs_file_extent_type(leaf, fi) ==
5398 		    BTRFS_FILE_EXTENT_INLINE)
5399 			continue;
5400 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5401 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
5402 		if (bytenr == 0)
5403 			continue;
5404 
5405 		ext_index++;
5406 		if (bytenr >= group->key.objectid + group->key.offset ||
5407 		    bytenr + num_bytes <= group->key.objectid)
5408 			continue;
5409 
5410 		extent_key.objectid = bytenr;
5411 		extent_key.offset = num_bytes;
5412 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
5413 		nr_extent = 1;
5414 		ret = get_new_locations(reloc_inode, &extent_key,
5415 					group->key.objectid, 1,
5416 					&new_extent, &nr_extent);
5417 		if (ret > 0)
5418 			continue;
5419 		BUG_ON(ret < 0);
5420 
5421 		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
5422 		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
5423 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
5424 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
5425 
5426 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
5427 						new_extent->disk_bytenr);
5428 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5429 						new_extent->disk_num_bytes);
5430 		btrfs_mark_buffer_dirty(leaf);
5431 
5432 		ret = btrfs_inc_extent_ref(trans, root,
5433 					new_extent->disk_bytenr,
5434 					new_extent->disk_num_bytes,
5435 					leaf->start,
5436 					root->root_key.objectid,
5437 					trans->transid, key.objectid);
5438 		BUG_ON(ret);
5439 		ret = btrfs_free_extent(trans, root,
5440 					bytenr, num_bytes, leaf->start,
5441 					btrfs_header_owner(leaf),
5442 					btrfs_header_generation(leaf),
5443 					key.objectid, 0);
5444 		BUG_ON(ret);
5445 		cond_resched();
5446 	}
5447 	kfree(new_extent);
5448 	BUG_ON(ext_index + 1 != ref->nritems);
5449 	btrfs_free_leaf_ref(root, ref);
5450 	return 0;
5451 }
5452 
5453 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
5454 			  struct btrfs_root *root)
5455 {
5456 	struct btrfs_root *reloc_root;
5457 	int ret;
5458 
5459 	if (root->reloc_root) {
5460 		reloc_root = root->reloc_root;
5461 		root->reloc_root = NULL;
5462 		list_add(&reloc_root->dead_list,
5463 			 &root->fs_info->dead_reloc_roots);
5464 
5465 		btrfs_set_root_bytenr(&reloc_root->root_item,
5466 				      reloc_root->node->start);
5467 		btrfs_set_root_level(&reloc_root->root_item,
5468 				     btrfs_header_level(reloc_root->node));
5469 		memset(&reloc_root->root_item.drop_progress, 0,
5470 			sizeof(struct btrfs_disk_key));
5471 		reloc_root->root_item.drop_level = 0;
5472 
5473 		ret = btrfs_update_root(trans, root->fs_info->tree_root,
5474 					&reloc_root->root_key,
5475 					&reloc_root->root_item);
5476 		BUG_ON(ret);
5477 	}
5478 	return 0;
5479 }
5480 
5481 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
5482 {
5483 	struct btrfs_trans_handle *trans;
5484 	struct btrfs_root *reloc_root;
5485 	struct btrfs_root *prev_root = NULL;
5486 	struct list_head dead_roots;
5487 	int ret;
5488 	unsigned long nr;
5489 
5490 	INIT_LIST_HEAD(&dead_roots);
5491 	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
5492 
5493 	while (!list_empty(&dead_roots)) {
5494 		reloc_root = list_entry(dead_roots.prev,
5495 					struct btrfs_root, dead_list);
5496 		list_del_init(&reloc_root->dead_list);
5497 
5498 		BUG_ON(reloc_root->commit_root != NULL);
5499 		while (1) {
5500 			trans = btrfs_join_transaction(root, 1);
5501 			BUG_ON(!trans);
5502 
5503 			mutex_lock(&root->fs_info->drop_mutex);
5504 			ret = btrfs_drop_snapshot(trans, reloc_root);
5505 			if (ret != -EAGAIN)
5506 				break;
5507 			mutex_unlock(&root->fs_info->drop_mutex);
5508 
5509 			nr = trans->blocks_used;
5510 			ret = btrfs_end_transaction(trans, root);
5511 			BUG_ON(ret);
5512 			btrfs_btree_balance_dirty(root, nr);
5513 		}
5514 
5515 		free_extent_buffer(reloc_root->node);
5516 
5517 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
5518 				     &reloc_root->root_key);
5519 		BUG_ON(ret);
5520 		mutex_unlock(&root->fs_info->drop_mutex);
5521 
5522 		nr = trans->blocks_used;
5523 		ret = btrfs_end_transaction(trans, root);
5524 		BUG_ON(ret);
5525 		btrfs_btree_balance_dirty(root, nr);
5526 
5527 		kfree(prev_root);
5528 		prev_root = reloc_root;
5529 	}
5530 	if (prev_root) {
5531 		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5532 		kfree(prev_root);
5533 	}
5534 	return 0;
5535 }
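
/*
 * Editor's note (a sketch of the retry protocol above, not from the
 * original file): btrfs_drop_snapshot() returns -EAGAIN while more work
 * remains, so the drop is chunked across transactions, roughly:
 *
 *	while (1) {
 *		trans = btrfs_join_transaction(root, 1);
 *		ret = btrfs_drop_snapshot(trans, reloc_root);
 *		if (ret != -EAGAIN)
 *			break;		(fully dropped, or hard error)
 *		btrfs_end_transaction(trans, root);
 *	}
 *
 * Progress is recorded in root_item.drop_progress (reset to zero in
 * btrfs_free_reloc_root() above), the intent being that a partially
 * dropped reloc root can be resumed after a crash.
 */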
5536 
5537 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5538 {
5539 	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5540 	return 0;
5541 }
5542 
5543 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5544 {
5545 	struct btrfs_root *reloc_root;
5546 	struct btrfs_trans_handle *trans;
5547 	struct btrfs_key location;
5548 	int found;
5549 	int ret;
5550 
5551 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5552 	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5553 	BUG_ON(ret);
5554 	found = !list_empty(&root->fs_info->dead_reloc_roots);
5555 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5556 
5557 	if (found) {
5558 		trans = btrfs_start_transaction(root, 1);
5559 		BUG_ON(!trans);
5560 		ret = btrfs_commit_transaction(trans, root);
5561 		BUG_ON(ret);
5562 	}
5563 
5564 	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5565 	location.offset = (u64)-1;
5566 	location.type = BTRFS_ROOT_ITEM_KEY;
5567 
5568 	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5569 	BUG_ON(!reloc_root);
5570 	btrfs_orphan_cleanup(reloc_root);
5571 	return 0;
5572 }
5573 
5574 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5575 				    struct btrfs_root *root)
5576 {
5577 	struct btrfs_root *reloc_root;
5578 	struct extent_buffer *eb;
5579 	struct btrfs_root_item *root_item;
5580 	struct btrfs_key root_key;
5581 	int ret;
5582 
5583 	BUG_ON(!root->ref_cows);
5584 	if (root->reloc_root)
5585 		return 0;
5586 
5587 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5588 	BUG_ON(!root_item);
5589 
5590 	ret = btrfs_copy_root(trans, root, root->commit_root,
5591 			      &eb, BTRFS_TREE_RELOC_OBJECTID);
5592 	BUG_ON(ret);
5593 
5594 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5595 	root_key.offset = root->root_key.objectid;
5596 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5597 
5598 	memcpy(root_item, &root->root_item, sizeof(*root_item));
5599 	btrfs_set_root_refs(root_item, 0);
5600 	btrfs_set_root_bytenr(root_item, eb->start);
5601 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
5602 	btrfs_set_root_generation(root_item, trans->transid);
5603 
5604 	btrfs_tree_unlock(eb);
5605 	free_extent_buffer(eb);
5606 
5607 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5608 				&root_key, root_item);
5609 	BUG_ON(ret);
5610 	kfree(root_item);
5611 
5612 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5613 						 &root_key);
5614 	BUG_ON(!reloc_root);
5615 	reloc_root->last_trans = trans->transid;
5616 	reloc_root->commit_root = NULL;
5617 	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5618 
5619 	root->reloc_root = reloc_root;
5620 	return 0;
5621 }
5622 
5623 /*
5624  * Core function of space balance.
5625  *
5626  * The idea is to use reloc trees to relocate tree blocks in reference
5627  * counted roots. There is one reloc tree for each subvol, and all
5628  * reloc trees share the same root key objectid. Reloc trees are
5629  * snapshots of the latest committed roots of subvols (root->commit_root).
5630  *
5631  * To relocate a tree block referenced by a subvol, there are two steps:
5632  * COW the block through the subvol's reloc tree, then update the block
5633  * pointer in the subvol to point to the new block. Since all reloc trees
5634  * share the same root key objectid, special handling for tree blocks
5635  * owned by them is easy. Once a tree block has been COWed in one reloc
5636  * tree, we can use the resulting new block directly when the same block
5637  * needs to be COWed again through another reloc tree. In this way,
5638  * relocated tree blocks are shared between reloc trees, so they are
5639  * also shared between subvols.
5640  */
5641 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5642 				      struct btrfs_root *root,
5643 				      struct btrfs_path *path,
5644 				      struct btrfs_key *first_key,
5645 				      struct btrfs_ref_path *ref_path,
5646 				      struct btrfs_block_group_cache *group,
5647 				      struct inode *reloc_inode)
5648 {
5649 	struct btrfs_root *reloc_root;
5650 	struct extent_buffer *eb = NULL;
5651 	struct btrfs_key *keys;
5652 	u64 *nodes;
5653 	int level;
5654 	int shared_level;
5655 	int lowest_level = 0;
5656 	int ret;
5657 
5658 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5659 		lowest_level = ref_path->owner_objectid;
5660 
5661 	if (!root->ref_cows) {
5662 		path->lowest_level = lowest_level;
5663 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5664 		BUG_ON(ret < 0);
5665 		path->lowest_level = 0;
5666 		btrfs_release_path(root, path);
5667 		return 0;
5668 	}
5669 
5670 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5671 	ret = init_reloc_tree(trans, root);
5672 	BUG_ON(ret);
5673 	reloc_root = root->reloc_root;
5674 
5675 	shared_level = ref_path->shared_level;
5676 	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5677 
5678 	keys = ref_path->node_keys;
5679 	nodes = ref_path->new_nodes;
5680 	memset(&keys[shared_level + 1], 0,
5681 	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5682 	memset(&nodes[shared_level + 1], 0,
5683 	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5684 
5685 	if (nodes[lowest_level] == 0) {
5686 		path->lowest_level = lowest_level;
5687 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5688 					0, 1);
5689 		BUG_ON(ret);
5690 		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5691 			eb = path->nodes[level];
5692 			if (!eb || eb == reloc_root->node)
5693 				break;
5694 			nodes[level] = eb->start;
5695 			if (level == 0)
5696 				btrfs_item_key_to_cpu(eb, &keys[level], 0);
5697 			else
5698 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
5699 		}
5700 		if (nodes[0] &&
5701 		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5702 			eb = path->nodes[0];
5703 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
5704 						      group, reloc_inode);
5705 			BUG_ON(ret);
5706 		}
5707 		btrfs_release_path(reloc_root, path);
5708 	} else {
5709 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5710 				       lowest_level);
5711 		BUG_ON(ret);
5712 	}
5713 
5714 	/*
5715 	 * replace tree blocks in the fs tree with tree blocks in
5716 	 * the reloc tree.
5717 	 */
5718 	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5719 	BUG_ON(ret < 0);
5720 
5721 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5722 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5723 					0, 0);
5724 		BUG_ON(ret);
5725 		extent_buffer_get(path->nodes[0]);
5726 		eb = path->nodes[0];
5727 		btrfs_release_path(reloc_root, path);
5728 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
5729 		BUG_ON(ret);
5730 		free_extent_buffer(eb);
5731 	}
5732 
5733 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5734 	path->lowest_level = 0;
5735 	return 0;
5736 }
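
/*
 * Editor's illustration (a minimal sketch of the two steps described in
 * the comment above, using the names from the function body; it is not
 * part of the original file):
 *
 *	init_reloc_tree(trans, root);			(snapshot commit_root)
 *	btrfs_search_slot(trans, reloc_root,
 *			  first_key, path, 0, 1);	(COW the path in the
 *							 reloc tree)
 *	btrfs_merge_path(trans, root, keys,
 *			 nodes, lowest_level);		(swap the new blocks
 *							 into the fs tree)
 *
 * For paths ending in a leaf of data extents, replace_extents_in_leaf()
 * rewrites the file extent pointers to the relocated data before the
 * merge, and invalidate_extent_cache() drops stale extent state from
 * in-memory inodes afterwards.
 */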
5737 
5738 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5739 					struct btrfs_root *root,
5740 					struct btrfs_path *path,
5741 					struct btrfs_key *first_key,
5742 					struct btrfs_ref_path *ref_path)
5743 {
5744 	int ret;
5745 
5746 	ret = relocate_one_path(trans, root, path, first_key,
5747 				ref_path, NULL, NULL);
5748 	BUG_ON(ret);
5749 
5750 	if (root == root->fs_info->extent_root)
5751 		btrfs_extent_post_op(trans, root);
5752 
5753 	return 0;
5754 }
5755 
5756 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5757 				    struct btrfs_root *extent_root,
5758 				    struct btrfs_path *path,
5759 				    struct btrfs_key *extent_key)
5760 {
5761 	int ret;
5762 
5763 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5764 	if (ret)
5765 		goto out;
5766 	ret = btrfs_del_item(trans, extent_root, path);
5767 out:
5768 	btrfs_release_path(extent_root, path);
5769 	return ret;
5770 }
5771 
5772 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5773 						struct btrfs_ref_path *ref_path)
5774 {
5775 	struct btrfs_key root_key;
5776 
5777 	root_key.objectid = ref_path->root_objectid;
5778 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5779 	if (is_cowonly_root(ref_path->root_objectid))
5780 		root_key.offset = 0;
5781 	else
5782 		root_key.offset = (u64)-1;
5783 
5784 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
5785 }
5786 
5787 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5788 					struct btrfs_path *path,
5789 					struct btrfs_key *extent_key,
5790 					struct btrfs_block_group_cache *group,
5791 					struct inode *reloc_inode, int pass)
5792 {
5793 	struct btrfs_trans_handle *trans;
5794 	struct btrfs_root *found_root;
5795 	struct btrfs_ref_path *ref_path = NULL;
5796 	struct disk_extent *new_extents = NULL;
5797 	int nr_extents = 0;
5798 	int loops;
5799 	int ret;
5800 	int level;
5801 	struct btrfs_key first_key;
5802 	u64 prev_block = 0;
5803 
5805 	trans = btrfs_start_transaction(extent_root, 1);
5806 	BUG_ON(!trans);
5807 
5808 	if (extent_key->objectid == 0) {
5809 		ret = del_extent_zero(trans, extent_root, path, extent_key);
5810 		goto out;
5811 	}
5812 
5813 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5814 	if (!ref_path) {
5815 		ret = -ENOMEM;
5816 		goto out;
5817 	}
5818 
5819 	for (loops = 0; ; loops++) {
5820 		if (loops == 0) {
5821 			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5822 						   extent_key->objectid);
5823 		} else {
5824 			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5825 		}
5826 		if (ret < 0)
5827 			goto out;
5828 		if (ret > 0)
5829 			break;
5830 
5831 		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5832 		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5833 			continue;
5834 
5835 		found_root = read_ref_root(extent_root->fs_info, ref_path);
5836 		BUG_ON(!found_root);
5837 		/*
5838 		 * for reference counted trees, only process reference paths
5839 		 * rooted at the latest committed root.
5840 		 */
5841 		if (found_root->ref_cows &&
5842 		    ref_path->root_generation != found_root->root_key.offset)
5843 			continue;
5844 
5845 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5846 			if (pass == 0) {
5847 				/*
5848 				 * copy data extents to new locations
5849 				 */
5850 				u64 group_start = group->key.objectid;
5851 				ret = relocate_data_extent(reloc_inode,
5852 							   extent_key,
5853 							   group_start);
5854 				if (ret < 0)
5855 					goto out;
5856 				break;
5857 			}
5858 			level = 0;
5859 		} else {
5860 			level = ref_path->owner_objectid;
5861 		}
5862 
5863 		if (prev_block != ref_path->nodes[level]) {
5864 			struct extent_buffer *eb;
5865 			u64 block_start = ref_path->nodes[level];
5866 			u64 block_size = btrfs_level_size(found_root, level);
5867 
5868 			eb = read_tree_block(found_root, block_start,
5869 					     block_size, 0);
			BUG_ON(!eb);
5870 			btrfs_tree_lock(eb);
5871 			BUG_ON(level != btrfs_header_level(eb));
5872 
5873 			if (level == 0)
5874 				btrfs_item_key_to_cpu(eb, &first_key, 0);
5875 			else
5876 				btrfs_node_key_to_cpu(eb, &first_key, 0);
5877 
5878 			btrfs_tree_unlock(eb);
5879 			free_extent_buffer(eb);
5880 			prev_block = block_start;
5881 		}
5882 
5883 		mutex_lock(&extent_root->fs_info->trans_mutex);
5884 		btrfs_record_root_in_trans(found_root);
5885 		mutex_unlock(&extent_root->fs_info->trans_mutex);
5886 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5887 			/*
5888 			 * try to update data extent references while
5889 			 * keeping metadata shared between snapshots.
5890 			 */
5891 			if (pass == 1) {
5892 				ret = relocate_one_path(trans, found_root,
5893 						path, &first_key, ref_path,
5894 						group, reloc_inode);
5895 				if (ret < 0)
5896 					goto out;
5897 				continue;
5898 			}
5899 			/*
5900 			 * use the fallback method to process the
5901 			 * remaining references.
5902 			 */
5903 			if (!new_extents) {
5904 				u64 group_start = group->key.objectid;
5905 				new_extents = kmalloc(sizeof(*new_extents),
5906 						      GFP_NOFS);
				if (!new_extents) {
					ret = -ENOMEM;
					goto out;
				}
5907 				nr_extents = 1;
5908 				ret = get_new_locations(reloc_inode,
5909 							extent_key,
5910 							group_start, 1,
5911 							&new_extents,
5912 							&nr_extents);
5913 				if (ret)
5914 					goto out;
5915 			}
5916 			ret = replace_one_extent(trans, found_root,
5917 						path, extent_key,
5918 						&first_key, ref_path,
5919 						new_extents, nr_extents);
5920 		} else {
5921 			ret = relocate_tree_block(trans, found_root, path,
5922 						  &first_key, ref_path);
5923 		}
5924 		if (ret < 0)
5925 			goto out;
5926 	}
5927 	ret = 0;
5928 out:
5929 	btrfs_end_transaction(trans, extent_root);
5930 	kfree(new_extents);
5931 	kfree(ref_path);
5932 	return ret;
5933 }
5934 
5935 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5936 {
5937 	u64 num_devices;
5938 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5939 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5940 
5941 	num_devices = root->fs_info->fs_devices->rw_devices;
5942 	if (num_devices == 1) {
5943 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5944 		stripped = flags & ~stripped;
5945 
5946 		/* turn raid0 into single device chunks */
5947 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
5948 			return stripped;
5949 
5950 		/* turn mirroring into duplication */
5951 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5952 			     BTRFS_BLOCK_GROUP_RAID10))
5953 			return stripped | BTRFS_BLOCK_GROUP_DUP;
5954 		return flags;
5955 	} else {
5956 		/* they already had raid on here, just return */
5957 		if (flags & stripped)
5958 			return flags;
5959 
5960 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5961 		stripped = flags & ~stripped;
5962 
5963 		/* switch duplicated blocks with raid1 */
5964 		if (flags & BTRFS_BLOCK_GROUP_DUP)
5965 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
5966 
5967 		/* turn single device chunks into raid0 */
5968 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
5969 	}
5970 	return flags;
5971 }
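
/*
 * Editor's summary of the conversion rules above (not in the original):
 * with a single writable device, raid0 degrades to single and
 * raid1/raid10 degrade to dup; with multiple devices, dup is upgraded
 * to raid1 and single to raid0, while chunks that already carry a raid
 * profile are left alone.  For example, on a two-device filesystem,
 * flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_DUP) comes back
 * as (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1).
 */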
5972 
5973 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5974 		     struct btrfs_block_group_cache *shrink_block_group,
5975 		     int force)
5976 {
5977 	struct btrfs_trans_handle *trans;
5978 	u64 new_alloc_flags;
5979 	u64 calc;
5980 
5981 	spin_lock(&shrink_block_group->lock);
5982 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5983 		spin_unlock(&shrink_block_group->lock);
5984 
5985 		trans = btrfs_start_transaction(root, 1);
5986 		spin_lock(&shrink_block_group->lock);
5987 
5988 		new_alloc_flags = update_block_group_flags(root,
5989 						   shrink_block_group->flags);
5990 		if (new_alloc_flags != shrink_block_group->flags) {
5991 			calc =
5992 			     btrfs_block_group_used(&shrink_block_group->item);
5993 		} else {
5994 			calc = shrink_block_group->key.offset;
5995 		}
5996 		spin_unlock(&shrink_block_group->lock);
5997 
5998 		do_chunk_alloc(trans, root->fs_info->extent_root,
5999 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
6000 
6001 		btrfs_end_transaction(trans, root);
6002 	} else {
6003 		spin_unlock(&shrink_block_group->lock);
	}
6004 	return 0;
6005 }
6006 
6007 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6008 				 struct btrfs_root *root,
6009 				 u64 objectid, u64 size)
6010 {
6011 	struct btrfs_path *path;
6012 	struct btrfs_inode_item *item;
6013 	struct extent_buffer *leaf;
6014 	int ret;
6015 
6016 	path = btrfs_alloc_path();
6017 	if (!path)
6018 		return -ENOMEM;
6019 
6020 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6021 	if (ret)
6022 		goto out;
6023 
6024 	leaf = path->nodes[0];
6025 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6026 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6027 	btrfs_set_inode_generation(leaf, item, 1);
6028 	btrfs_set_inode_size(leaf, item, size);
6029 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6030 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6031 	btrfs_mark_buffer_dirty(leaf);
6032 	btrfs_release_path(root, path);
6033 out:
6034 	btrfs_free_path(path);
6035 	return ret;
6036 }
6037 
6038 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6039 					struct btrfs_block_group_cache *group)
6040 {
6041 	struct inode *inode = NULL;
6042 	struct btrfs_trans_handle *trans;
6043 	struct btrfs_root *root;
6044 	struct btrfs_key root_key;
6045 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
6046 	int err = 0;
6047 
6048 	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6049 	root_key.type = BTRFS_ROOT_ITEM_KEY;
6050 	root_key.offset = (u64)-1;
6051 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6052 	if (IS_ERR(root))
6053 		return ERR_CAST(root);
6054 
6055 	trans = btrfs_start_transaction(root, 1);
6056 	BUG_ON(!trans);
6057 
6058 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
6059 	if (err)
6060 		goto out;
6061 
6062 	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
6063 	BUG_ON(err);
6064 
6065 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6066 				       group->key.offset, 0, group->key.offset,
6067 				       0, 0, 0);
6068 	BUG_ON(err);
6069 
6070 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	BUG_ON(!inode);
6071 	if (inode->i_state & I_NEW) {
6072 		BTRFS_I(inode)->root = root;
6073 		BTRFS_I(inode)->location.objectid = objectid;
6074 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6075 		BTRFS_I(inode)->location.offset = 0;
6076 		btrfs_read_locked_inode(inode);
6077 		unlock_new_inode(inode);
6078 		BUG_ON(is_bad_inode(inode));
6079 	} else {
6080 		BUG_ON(1);
6081 	}
6082 	BTRFS_I(inode)->index_cnt = group->key.objectid;
6083 
6084 	err = btrfs_orphan_add(trans, inode);
6085 out:
6086 	btrfs_end_transaction(trans, root);
6087 	if (err) {
6088 		if (inode)
6089 			iput(inode);
6090 		inode = ERR_PTR(err);
6091 	}
6092 	return inode;
6093 }
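
/*
 * Editor's note (inferred from the code above): the reloc inode is an
 * orphan covering the whole block group with a single file extent
 * spanning the group, and index_cnt holds the group's starting bytenr.
 * File offset N in the reloc inode thus corresponds to bytenr
 * group->key.objectid + N inside the group being relocated, which is
 * the mapping btrfs_reloc_clone_csums() below depends on.
 */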
6094 
6095 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6096 {
6098 	struct btrfs_ordered_sum *sums;
6099 	struct btrfs_sector_sum *sector_sum;
6100 	struct btrfs_ordered_extent *ordered;
6101 	struct btrfs_root *root = BTRFS_I(inode)->root;
6102 	struct list_head list;
6103 	size_t offset;
6104 	int ret;
6105 	u64 disk_bytenr;
6106 
6107 	INIT_LIST_HEAD(&list);
6108 
6109 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(!ordered);
6110 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6111 
6112 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6113 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6114 				       disk_bytenr + len - 1, &list);
	BUG_ON(ret);
6115 
6116 	while (!list_empty(&list)) {
6117 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6118 		list_del_init(&sums->list);
6119 
6120 		sector_sum = sums->sums;
6121 		sums->bytenr = ordered->start;
6122 
6123 		offset = 0;
6124 		while (offset < sums->len) {
6125 			sector_sum->bytenr += ordered->start - disk_bytenr;
6126 			sector_sum++;
6127 			offset += root->sectorsize;
6128 		}
6129 
6130 		btrfs_add_ordered_sum(inode, ordered, sums);
6131 	}
6132 	btrfs_put_ordered_extent(ordered);
6133 	return 0;
6134 }
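
/*
 * Editor's worked example (hypothetical numbers, for illustration only):
 * if the block group being relocated starts at bytenr 1G (so index_cnt
 * == 1G) and the ordered extent at file offset 4M of the reloc inode
 * was written out at bytenr 5G, then disk_bytenr == 1G + 4M names the
 * old data whose csums are looked up, and every sector_sum->bytenr is
 * shifted by ordered->start - disk_bytenr == 5G - (1G + 4M), rebasing
 * the csums from the old extent onto the new one.
 */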
6135 
6136 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6137 {
6138 	struct btrfs_trans_handle *trans;
6139 	struct btrfs_path *path;
6140 	struct btrfs_fs_info *info = root->fs_info;
6141 	struct extent_buffer *leaf;
6142 	struct inode *reloc_inode;
6143 	struct btrfs_block_group_cache *block_group;
6144 	struct btrfs_key key;
6145 	u64 skipped;
6146 	u64 cur_byte;
6147 	u64 total_found;
6148 	u32 nritems;
6149 	int ret;
6150 	int progress;
6151 	int pass = 0;
6152 
6153 	root = root->fs_info->extent_root;
6154 
6155 	block_group = btrfs_lookup_block_group(info, group_start);
6156 	BUG_ON(!block_group);
6157 
6158 	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6159 	       (unsigned long long)block_group->key.objectid,
6160 	       (unsigned long long)block_group->flags);
6161 
6162 	path = btrfs_alloc_path();
6163 	BUG_ON(!path);
6164 
6165 	reloc_inode = create_reloc_inode(info, block_group);
6166 	BUG_ON(IS_ERR(reloc_inode));
6167 
6168 	__alloc_chunk_for_shrink(root, block_group, 1);
6169 	set_block_group_readonly(block_group);
6170 
6171 	btrfs_start_delalloc_inodes(info->tree_root);
6172 	btrfs_wait_ordered_extents(info->tree_root, 0);
6173 again:
6174 	skipped = 0;
6175 	total_found = 0;
6176 	progress = 0;
6177 	key.objectid = block_group->key.objectid;
6178 	key.offset = 0;
6179 	key.type = 0;
6180 	cur_byte = key.objectid;
6181 
6182 	trans = btrfs_start_transaction(info->tree_root, 1);
6183 	btrfs_commit_transaction(trans, info->tree_root);
6184 
6185 	mutex_lock(&root->fs_info->cleaner_mutex);
6186 	btrfs_clean_old_snapshots(info->tree_root);
6187 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6188 	mutex_unlock(&root->fs_info->cleaner_mutex);
6189 
6190 	while (1) {
6191 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6192 		if (ret < 0)
6193 			goto out;
6194 next:
6195 		leaf = path->nodes[0];
6196 		nritems = btrfs_header_nritems(leaf);
6197 		if (path->slots[0] >= nritems) {
6198 			ret = btrfs_next_leaf(root, path);
6199 			if (ret < 0)
6200 				goto out;
6201 			if (ret == 1) {
6202 				ret = 0;
6203 				break;
6204 			}
6205 			leaf = path->nodes[0];
6206 			nritems = btrfs_header_nritems(leaf);
6207 		}
6208 
6209 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6210 
6211 		if (key.objectid >= block_group->key.objectid +
6212 		    block_group->key.offset)
6213 			break;
6214 
6215 		if (progress && need_resched()) {
6216 			btrfs_release_path(root, path);
6217 			cond_resched();
6218 			progress = 0;
6219 			continue;
6220 		}
6221 		progress = 1;
6222 
6223 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
6224 		    key.objectid + key.offset <= cur_byte) {
6225 			path->slots[0]++;
6226 			goto next;
6227 		}
6228 
6229 		total_found++;
6230 		cur_byte = key.objectid + key.offset;
6231 		btrfs_release_path(root, path);
6232 
6233 		__alloc_chunk_for_shrink(root, block_group, 0);
6234 		ret = relocate_one_extent(root, path, &key, block_group,
6235 					  reloc_inode, pass);
6236 		BUG_ON(ret < 0);
6237 		if (ret > 0)
6238 			skipped++;
6239 
6240 		key.objectid = cur_byte;
6241 		key.type = 0;
6242 		key.offset = 0;
6243 	}
6244 
6245 	btrfs_release_path(root, path);
6246 
6247 	if (pass == 0) {
6248 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
6249 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
6250 	}
6251 
6252 	if (total_found > 0) {
6253 		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
6254 		       (unsigned long long)total_found, pass);
6255 		pass++;
6256 		if (total_found == skipped && pass > 2) {
6257 			iput(reloc_inode);
6258 			reloc_inode = create_reloc_inode(info, block_group);
			BUG_ON(IS_ERR(reloc_inode));
6259 			pass = 0;
6260 		}
6261 		goto again;
6262 	}
6263 
6264 	/* delete reloc_inode */
6265 	iput(reloc_inode);
6266 
6267 	/* unpin extents in this range */
6268 	trans = btrfs_start_transaction(info->tree_root, 1);
6269 	btrfs_commit_transaction(trans, info->tree_root);
6270 
6271 	spin_lock(&block_group->lock);
6272 	WARN_ON(block_group->pinned > 0);
6273 	WARN_ON(block_group->reserved > 0);
6274 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6275 	spin_unlock(&block_group->lock);
6276 	put_block_group(block_group);
6277 	ret = 0;
6278 out:
6279 	btrfs_free_path(path);
6280 	return ret;
6281 }
6282 
6283 static int find_first_block_group(struct btrfs_root *root,
6284 		struct btrfs_path *path, struct btrfs_key *key)
6285 {
6286 	int ret = 0;
6287 	struct btrfs_key found_key;
6288 	struct extent_buffer *leaf;
6289 	int slot;
6290 
6291 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
6292 	if (ret < 0)
6293 		goto out;
6294 
6295 	while (1) {
6296 		slot = path->slots[0];
6297 		leaf = path->nodes[0];
6298 		if (slot >= btrfs_header_nritems(leaf)) {
6299 			ret = btrfs_next_leaf(root, path);
6300 			if (ret == 0)
6301 				continue;
6302 			if (ret < 0)
6303 				goto out;
6304 			break;
6305 		}
6306 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6307 
6308 		if (found_key.objectid >= key->objectid &&
6309 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6310 			ret = 0;
6311 			goto out;
6312 		}
6313 		path->slots[0]++;
6314 	}
6315 	ret = -ENOENT;
6316 out:
6317 	return ret;
6318 }
6319 
6320 int btrfs_free_block_groups(struct btrfs_fs_info *info)
6321 {
6322 	struct btrfs_block_group_cache *block_group;
6323 	struct rb_node *n;
6324 
6325 	spin_lock(&info->block_group_cache_lock);
6326 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
6327 		block_group = rb_entry(n, struct btrfs_block_group_cache,
6328 				       cache_node);
6329 		rb_erase(&block_group->cache_node,
6330 			 &info->block_group_cache_tree);
6331 		spin_unlock(&info->block_group_cache_lock);
6332 
6333 		btrfs_remove_free_space_cache(block_group);
6334 		down_write(&block_group->space_info->groups_sem);
6335 		list_del(&block_group->list);
6336 		up_write(&block_group->space_info->groups_sem);
6337 
6338 		WARN_ON(atomic_read(&block_group->count) != 1);
6339 		kfree(block_group);
6340 
6341 		spin_lock(&info->block_group_cache_lock);
6342 	}
6343 	spin_unlock(&info->block_group_cache_lock);
6344 	return 0;
6345 }
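
/*
 * Editor's note: block_group_cache_lock is a spinlock, so it is dropped
 * and retaken around the per-group teardown above; taking the groups_sem
 * rwsem (and freeing the free space cache) may sleep, which is not
 * allowed while a spinlock is held.
 */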
6346 
6347 int btrfs_read_block_groups(struct btrfs_root *root)
6348 {
6349 	struct btrfs_path *path;
6350 	int ret;
6351 	struct btrfs_block_group_cache *cache;
6352 	struct btrfs_fs_info *info = root->fs_info;
6353 	struct btrfs_space_info *space_info;
6354 	struct btrfs_key key;
6355 	struct btrfs_key found_key;
6356 	struct extent_buffer *leaf;
6357 
6358 	root = info->extent_root;
6359 	key.objectid = 0;
6360 	key.offset = 0;
6361 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
6362 	path = btrfs_alloc_path();
6363 	if (!path)
6364 		return -ENOMEM;
6365 
6366 	while (1) {
6367 		ret = find_first_block_group(root, path, &key);
6368 		if (ret > 0) {
6369 			ret = 0;
6370 			goto error;
6371 		}
6372 		if (ret != 0)
6373 			goto error;
6374 
6375 		leaf = path->nodes[0];
6376 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6377 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
6378 		if (!cache) {
6379 			ret = -ENOMEM;
6380 			break;
6381 			goto error;
6382 
6383 		atomic_set(&cache->count, 1);
6384 		spin_lock_init(&cache->lock);
6385 		mutex_init(&cache->alloc_mutex);
6386 		mutex_init(&cache->cache_mutex);
6387 		INIT_LIST_HEAD(&cache->list);
6388 		read_extent_buffer(leaf, &cache->item,
6389 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
6390 				   sizeof(cache->item));
6391 		memcpy(&cache->key, &found_key, sizeof(found_key));
6392 
6393 		key.objectid = found_key.objectid + found_key.offset;
6394 		btrfs_release_path(root, path);
6395 		cache->flags = btrfs_block_group_flags(&cache->item);
6396 
6397 		ret = update_space_info(info, cache->flags, found_key.offset,
6398 					btrfs_block_group_used(&cache->item),
6399 					&space_info);
6400 		BUG_ON(ret);
6401 		cache->space_info = space_info;
6402 		down_write(&space_info->groups_sem);
6403 		list_add_tail(&cache->list, &space_info->block_groups);
6404 		up_write(&space_info->groups_sem);
6405 
6406 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
6407 		BUG_ON(ret);
6408 
6409 		set_avail_alloc_bits(root->fs_info, cache->flags);
6410 		if (btrfs_chunk_readonly(root, cache->key.objectid))
6411 			set_block_group_readonly(cache);
6412 	}
6413 	ret = 0;
6414 error:
6415 	btrfs_free_path(path);
6416 	return ret;
6417 }
6418 
6419 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6420 			   struct btrfs_root *root, u64 bytes_used,
6421 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
6422 			   u64 size)
6423 {
6424 	int ret;
6425 	struct btrfs_root *extent_root;
6426 	struct btrfs_block_group_cache *cache;
6427 
6428 	extent_root = root->fs_info->extent_root;
6429 
6430 	root->fs_info->last_trans_new_blockgroup = trans->transid;
6431 
6432 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
6433 	if (!cache)
6434 		return -ENOMEM;
6435 
6436 	cache->key.objectid = chunk_offset;
6437 	cache->key.offset = size;
6438 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6439 	atomic_set(&cache->count, 1);
6440 	spin_lock_init(&cache->lock);
6441 	mutex_init(&cache->alloc_mutex);
6442 	mutex_init(&cache->cache_mutex);
6443 	INIT_LIST_HEAD(&cache->list);
6444 
6445 	btrfs_set_block_group_used(&cache->item, bytes_used);
6446 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
6447 	cache->flags = type;
6448 	btrfs_set_block_group_flags(&cache->item, type);
6449 
6450 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
6451 				&cache->space_info);
6452 	BUG_ON(ret);
6453 	down_write(&cache->space_info->groups_sem);
6454 	list_add_tail(&cache->list, &cache->space_info->block_groups);
6455 	up_write(&cache->space_info->groups_sem);
6456 
6457 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
6458 	BUG_ON(ret);
6459 
6460 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
6461 				sizeof(cache->item));
6462 	BUG_ON(ret);
6463 
6464 	finish_current_insert(trans, extent_root, 0);
6465 	ret = del_pending_extents(trans, extent_root, 0);
6466 	BUG_ON(ret);
6467 	set_avail_alloc_bits(extent_root->fs_info, type);
6468 
6469 	return 0;
6470 }
6471 
6472 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6473 			     struct btrfs_root *root, u64 group_start)
6474 {
6475 	struct btrfs_path *path;
6476 	struct btrfs_block_group_cache *block_group;
6477 	struct btrfs_key key;
6478 	int ret;
6479 
6480 	root = root->fs_info->extent_root;
6481 
6482 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
6483 	BUG_ON(!block_group);
6484 	BUG_ON(!block_group->ro);
6485 
6486 	memcpy(&key, &block_group->key, sizeof(key));
6487 
6488 	path = btrfs_alloc_path();
6489 	BUG_ON(!path);
6490 
6491 	spin_lock(&root->fs_info->block_group_cache_lock);
6492 	rb_erase(&block_group->cache_node,
6493 		 &root->fs_info->block_group_cache_tree);
6494 	spin_unlock(&root->fs_info->block_group_cache_lock);
6495 	btrfs_remove_free_space_cache(block_group);
6496 	down_write(&block_group->space_info->groups_sem);
6497 	list_del(&block_group->list);
6498 	up_write(&block_group->space_info->groups_sem);
6499 
6500 	spin_lock(&block_group->space_info->lock);
6501 	block_group->space_info->total_bytes -= block_group->key.offset;
6502 	block_group->space_info->bytes_readonly -= block_group->key.offset;
6503 	spin_unlock(&block_group->space_info->lock);
6504 	block_group->space_info->full = 0;
6505 
	/* one ref from our lookup, one ref held by the block group cache */
6506 	put_block_group(block_group);
6507 	put_block_group(block_group);
6508 
6509 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6510 	if (ret > 0)
6511 		ret = -EIO;
6512 	if (ret < 0)
6513 		goto out;
6514 
6515 	ret = btrfs_del_item(trans, root, path);
6516 out:
6517 	btrfs_free_path(path);
6518 	return ret;
6519 }
6520