/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_reserved_extents(struct btrfs_root *root,
				   u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc,
			      int mark_free);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes, u64 parent,
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
				struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);

static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		atomic_inc(&ret->count);
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}
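
/*
 * A worked example of the contains flag (hypothetical layout): with two
 * 1G block groups starting at 0 and at 1G,
 *
 *	block_group_cache_tree_search(info, 512M, 1) returns the group at 0,
 *	since 512M falls inside it, while
 *	block_group_cache_tree_search(info, 512M, 0) returns the group at 1G,
 *	the first group whose start is at or after 512M.
 */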

/*
 * this is only called by cache_block_group.  Since we could have freed
 * extents, we need to check the pinned_extents tree for any extents that
 * can't be used yet, because their free space will not be released until
 * the transaction commits.
 */
static int add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(&info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		if (extent_start == start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret);
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret);
	}

	return 0;
}

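/*
 * Worked example (hypothetical numbers): for a range [0, 100) with one
 * pinned extent at [30, 39], the loop above adds free space (0, 30),
 * advances start to 40, and the tail adds (40, 60), keeping the pinned
 * range out of the free space cache.
 */
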
static int remove_sb_from_cache(struct btrfs_root *root,
				struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr, 0,
				       &logical, &nr, &stripe_len);
		BUG_ON(ret);
		while (nr--) {
			btrfs_remove_free_space(cache, logical[nr],
						stripe_len);
		}
		kfree(logical);
	}
	return 0;
}

static int cache_block_group(struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group)
{
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int slot;
	u64 last;

	if (!block_group)
		return 0;

	root = root->fs_info->extent_root;

	if (block_group->cached)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	/*
	 * we get into deadlocks with paths held by callers of this function.
	 * since the alloc_mutex is protecting things right now, just
	 * skip the locking here
	 */
	path->skip_locking = 1;
	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
	key.objectid = last;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			if (ret == 0)
				continue;
			else
				break;
		}
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid < block_group->key.objectid)
			goto next;

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
			add_new_free_space(block_group, root->fs_info, last,
					   key.objectid);

			last = key.objectid + key.offset;
		}
next:
		path->slots[0]++;
	}

	add_new_free_space(block_group, root->fs_info, last,
			   block_group->key.objectid +
			   block_group->key.offset);

	block_group->cached = 1;
	remove_sb_from_cache(root, block_group);
	ret = 0;
err:
	btrfs_free_path(path);
	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count))
		kfree(cache);
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags == flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				btrfs_put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

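/*
 * The return value above is passed straight through from
 * btrfs_search_slot(): 0 means the extent item exists, > 0 means it does
 * not, and < 0 is an error.  A hypothetical caller checks:
 *
 *	ret = btrfs_lookup_extent(root, bytenr, num_bytes);
 *	if (ret < 0)
 *		goto error;
 *	found = (ret == 0);
 */
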
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) Differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Full back refs are actually generic and can
 * be used in all the cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction,
 * the only way to drop a reference to it is to COW it.  So we can detect
 * the event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs and add implicit back refs
 * for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, and increase the lower level extents' reference counts.  The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer
 * in the new block, and increase the lower level extents' reference
 * counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, the key
 * type is used to differentiate between types of back refs, and the
 * meaning of the key offset depends on the type of back ref.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back ref is used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * the level of the tree block is required.  This information is stored
 * in the tree block info structure.
 */

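/*
 * A worked example of the rules above (hypothetical numbers, a sketch):
 * suppose inode 257 in subvolume 5 writes a file extent of length 8192 at
 * bytenr 13631488, at file offset 0.  The extent item is keyed:
 *
 *     (13631488, BTRFS_EXTENT_ITEM_KEY, 8192)
 *
 * and, since the leaf holding the file extent pointer is owned by tree 5,
 * the implicit back ref is keyed by the hash of (root, objectid, offset):
 *
 *     (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * If that leaf later stops being referenced by its owner tree, the
 * implicit ref is converted to a full back ref keyed by the leaf's bytenr:
 *
 *     (13631488, BTRFS_SHARED_DATA_REF_KEY, <bytenr of the leaf>)
 *
 * Note that hash_extent_data_ref() below shifts the high crc left by 31
 * bits, not 32.  The hash is stored in on-disk keys, so the shift cannot
 * be changed without breaking existing filesystems.
 */
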
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0);
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(root, path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret);

	ret = btrfs_extend_item(trans, root, path, new_size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(root, path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(root, path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(root, path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(root, path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(root, path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(root, path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

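/*
 * Summary of the mapping above:
 *
 *	tree block (owner < BTRFS_FIRST_FREE_OBJECTID):
 *		parent set   -> BTRFS_SHARED_BLOCK_REF_KEY (full)
 *		parent unset -> BTRFS_TREE_BLOCK_REF_KEY   (implicit)
 *	data extent (owner >= BTRFS_FIRST_FREE_OBJECTID):
 *		parent set   -> BTRFS_SHARED_DATA_REF_KEY  (full)
 *		parent unset -> BTRFS_EXTENT_DATA_REF_KEY  (implicit)
 */
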
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for an inline back ref.  if the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * if the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	BUG_ON(ret);

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	} else {
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add a new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add a new inline back
		 * ref if there is any kind of item for this block.
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

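/*
 * A minimal caller-side sketch of the contract above (hypothetical
 * variables; the real callers are insert_inline_extent_backref() and
 * lookup_extent_backref() below):
 *
 *	struct btrfs_extent_inline_ref *iref;
 *	int ret;
 *
 *	ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
 *					   num_bytes, parent, root_objectid,
 *					   owner, offset, 1);
 *	ret == 0:	iref points at the existing inline ref; update it
 *	ret == -ENOENT:	iref points at the insert position; add it inline
 *	ret == -EAGAIN:	no room inline; fall back to a keyed ref item
 */
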
/*
 * helper to add a new inline back ref
 */
static noinline_for_stack
int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_path *path,
				struct btrfs_extent_inline_ref *iref,
				u64 parent, u64 root_objectid,
				u64 owner, u64 offset, int refs_to_add,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;
	int ret;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	ret = btrfs_extend_item(trans, root, path, size);
	BUG_ON(ret);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(root, path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove an inline back ref
 */
static noinline_for_stack
int update_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_mod,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	int ret;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
		BUG_ON(ret);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		ret = update_inline_extent_backref(trans, root, path, iref,
						   refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		ret = setup_inline_extent_backref(trans, root, path, iref,
						  parent, root_objectid,
						  owner, offset, refs_to_add,
						  extent_op);
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		ret = update_inline_extent_backref(trans, root, path, iref,
						   -refs_to_drop, NULL);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	} else {
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

#ifdef BIO_RW_DISCARD
static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
}
#endif

static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
				u64 num_bytes)
{
#ifdef BIO_RW_DISCARD
	int ret;
	u64 map_length = num_bytes;
	struct btrfs_multi_bio *multi = NULL;

	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
			      bytenr, &map_length, &multi, 0);
	if (!ret) {
		struct btrfs_bio_stripe *stripe = multi->stripes;
		int i;

		if (map_length > num_bytes)
			map_length = num_bytes;

		for (i = 0; i < multi->num_stripes; i++, stripe++) {
			btrfs_issue_discard(stripe->dev->bdev,
					    stripe->physical,
					    map_length);
		}
		kfree(multi);
	}

	return ret;
#else
	return 0;
#endif
}

int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;
	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
					parent, root_objectid, owner, offset,
					BTRFS_ADD_DELAYED_REF, NULL);
	}
	return ret;
}

static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 bytenr, u64 num_bytes,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	u64 refs;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
					   path, bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if (ret == 0)
		goto out;

	if (ret != -EAGAIN) {
		err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root->fs_info->extent_root, path);

	path->reada = 1;
	path->leave_spinning = 1;

	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	BUG_ON(ret);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	else
		ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op) {
			BUG_ON(extent_op->update_key);
			flags |= extent_op->flags_to_set;
		}
		ret = alloc_reserved_file_extent(trans, root,
						 parent, ref_root, flags,
						 ref->objectid, ref->offset,
						 &ins, node->ref_mod);
		update_reserved_extents(root, ins.objectid, ins.offset, 0);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
					     node->num_bytes, parent,
					     ref_root, ref->objectid,
					     ref->offset, node->ref_mod,
					     extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, root, node->bytenr,
					  node->num_bytes, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_delayed_ref_node *node,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = node->bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = node->num_bytes;

	path->reada = 1;
	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
				path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
					     path, (u64)-1, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_tree_ref(node);
	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	else
		ref_root = ref->root;

	BUG_ON(node->ref_mod != 1);
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags ||
		       !extent_op->update_key);
		ret = alloc_reserved_tree_block(trans, root,
						parent, ref_root,
						extent_op->flags_to_set,
						&extent_op->key,
						ref->level, &ins);
		update_reserved_extents(root, ins.objectid, ins.offset, 0);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
					     node->num_bytes, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, root, node->bytenr,
					  node->num_bytes, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret;
	if (btrfs_delayed_ref_is_head(node)) {
		struct btrfs_delayed_ref_head *head;
		/*
		 * we've hit the end of the chain and we were supposed
		 * to insert this extent into the tree.  But, it got
		 * deleted before we ever needed to insert it, so all
		 * we have to do is clean up the accounting
		 */
		BUG_ON(extent_op);
		head = btrfs_delayed_node_to_head(node);
		if (insert_reserved) {
			if (head->is_data) {
				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
				BUG_ON(ret);
			}
			btrfs_update_pinned_extents(root, node->bytenr,
						    node->num_bytes, 1);
			update_reserved_extents(root, node->bytenr,
						node->num_bytes, 0);
		}
		mutex_unlock(&head->mutex);
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, root, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, root, node, extent_op,
					   insert_reserved);
	else
		BUG();
	return ret;
}

static noinline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	int action = BTRFS_ADD_DELAYED_REF;
again:
	/*
	 * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
	 * this prevents the ref count from going down to zero while
	 * there are still pending delayed refs.
	 */
	node = rb_prev(&head->node.rb_node);
	while (1) {
		if (!node)
			break;
		ref = rb_entry(node, struct btrfs_delayed_ref_node,
				rb_node);
		if (ref->bytenr != head->node.bytenr)
			break;
		if (ref->action == action)
			return ref;
		node = rb_prev(node);
	}
	if (action == BTRFS_ADD_DELAYED_REF) {
		action = BTRFS_DROP_DELAYED_REF;
		goto again;
	}
	return NULL;
}

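/*
 * Example of the ordering above (a sketch): if the extent currently has
 * one reference and the head has pending refs (+1, -1, +1), running the
 * two adds first takes the count to 3 before the drop brings it to 2.
 * Running a drop first could momentarily push the count to zero and free
 * an extent that still has queued adds.
 */
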
1776 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1777 				       struct btrfs_root *root,
1778 				       struct list_head *cluster)
1779 {
1780 	struct btrfs_delayed_ref_root *delayed_refs;
1781 	struct btrfs_delayed_ref_node *ref;
1782 	struct btrfs_delayed_ref_head *locked_ref = NULL;
1783 	struct btrfs_delayed_extent_op *extent_op;
1784 	int ret;
1785 	int count = 0;
1786 	int must_insert_reserved = 0;
1787 
1788 	delayed_refs = &trans->transaction->delayed_refs;
1789 	while (1) {
1790 		if (!locked_ref) {
1791 			/* pick a new head ref from the cluster list */
1792 			if (list_empty(cluster))
1793 				break;
1794 
1795 			locked_ref = list_entry(cluster->next,
1796 				     struct btrfs_delayed_ref_head, cluster);
1797 
1798 			/* grab the lock that says we are going to process
1799 			 * all the refs for this head */
1800 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
1801 
1802 			/*
1803 			 * we may have dropped the spin lock to get the head
1804 			 * mutex lock, and that might have given someone else
1805 			 * time to free the head.  If that's true, it has been
1806 			 * removed from our list and we can move on.
1807 			 */
1808 			if (ret == -EAGAIN) {
1809 				locked_ref = NULL;
1810 				count++;
1811 				continue;
1812 			}
1813 		}
1814 
1815 		/*
1816 		 * record the must insert reserved flag before we
1817 		 * drop the spin lock.
1818 		 */
1819 		must_insert_reserved = locked_ref->must_insert_reserved;
1820 		locked_ref->must_insert_reserved = 0;
1821 
1822 		extent_op = locked_ref->extent_op;
1823 		locked_ref->extent_op = NULL;
1824 
1825 		/*
1826 		 * locked_ref is the head node, so we have to go one
1827 		 * node back for any delayed ref updates
1828 		 */
1829 		ref = select_delayed_ref(locked_ref);
1830 		if (!ref) {
1831 			/* All delayed refs have been processed, go ahead
1832 			 * and send the head node to run_one_delayed_ref,
1833 			 * so that any accounting fixes can happen
1834 			 */
1835 			ref = &locked_ref->node;
1836 
1837 			if (extent_op && must_insert_reserved) {
1838 				kfree(extent_op);
1839 				extent_op = NULL;
1840 			}
1841 
1842 			if (extent_op) {
1843 				spin_unlock(&delayed_refs->lock);
1844 
1845 				ret = run_delayed_extent_op(trans, root,
1846 							    ref, extent_op);
1847 				BUG_ON(ret);
1848 				kfree(extent_op);
1849 
1850 				cond_resched();
1851 				spin_lock(&delayed_refs->lock);
1852 				continue;
1853 			}
1854 
1855 			list_del_init(&locked_ref->cluster);
1856 			locked_ref = NULL;
1857 		}
1858 
1859 		ref->in_tree = 0;
1860 		rb_erase(&ref->rb_node, &delayed_refs->root);
1861 		delayed_refs->num_entries--;
1862 
1863 		spin_unlock(&delayed_refs->lock);
1864 
1865 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
1866 					  must_insert_reserved);
1867 		BUG_ON(ret);
1868 
1869 		btrfs_put_delayed_ref(ref);
1870 		kfree(extent_op);
1871 		count++;
1872 
1873 		cond_resched();
1874 		spin_lock(&delayed_refs->lock);
1875 	}
1876 	return count;
1877 }
1878 
1879 /*
1880  * this starts processing the delayed reference count updates and
1881  * extent insertions we have queued up so far.  count can be
1882  * 0, which means to process everything in the tree at the start
1883  * of the run (but not newly added entries), or it can be some target
1884  * number you'd like to process.
1885  */
1886 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1887 			   struct btrfs_root *root, unsigned long count)
1888 {
1889 	struct rb_node *node;
1890 	struct btrfs_delayed_ref_root *delayed_refs;
1891 	struct btrfs_delayed_ref_node *ref;
1892 	struct list_head cluster;
1893 	int ret;
1894 	int run_all = count == (unsigned long)-1;
1895 	int run_most = 0;
1896 
1897 	if (root == root->fs_info->extent_root)
1898 		root = root->fs_info->tree_root;
1899 
1900 	delayed_refs = &trans->transaction->delayed_refs;
1901 	INIT_LIST_HEAD(&cluster);
1902 again:
1903 	spin_lock(&delayed_refs->lock);
1904 	if (count == 0) {
1905 		count = delayed_refs->num_entries * 2;
1906 		run_most = 1;
1907 	}
1908 	while (1) {
1909 		if (!(run_all || run_most) &&
1910 		    delayed_refs->num_heads_ready < 64)
1911 			break;
1912 
1913 		/*
1914 		 * go find something we can process in the rbtree.  We start at
1915 		 * the beginning of the tree, and then build a cluster
1916 		 * of refs to process starting at the first one we are able to
1917 		 * lock
1918 		 */
1919 		ret = btrfs_find_ref_cluster(trans, &cluster,
1920 					     delayed_refs->run_delayed_start);
1921 		if (ret)
1922 			break;
1923 
1924 		ret = run_clustered_refs(trans, root, &cluster);
1925 		BUG_ON(ret < 0);
1926 
1927 		count -= min_t(unsigned long, ret, count);
1928 
1929 		if (count == 0)
1930 			break;
1931 	}
1932 
1933 	if (run_all) {
1934 		node = rb_first(&delayed_refs->root);
1935 		if (!node)
1936 			goto out;
1937 		count = (unsigned long)-1;
1938 
1939 		while (node) {
1940 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
1941 				       rb_node);
1942 			if (btrfs_delayed_ref_is_head(ref)) {
1943 				struct btrfs_delayed_ref_head *head;
1944 
1945 				head = btrfs_delayed_node_to_head(ref);
1946 				atomic_inc(&ref->refs);
1947 
1948 				spin_unlock(&delayed_refs->lock);
1949 				mutex_lock(&head->mutex);
1950 				mutex_unlock(&head->mutex);
1951 
1952 				btrfs_put_delayed_ref(ref);
1953 				cond_resched();
1954 				goto again;
1955 			}
1956 			node = rb_next(node);
1957 		}
1958 		spin_unlock(&delayed_refs->lock);
1959 		schedule_timeout(1);
1960 		goto again;
1961 	}
1962 out:
1963 	spin_unlock(&delayed_refs->lock);
1964 	return 0;
1965 }
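
/*
 * Usage sketch (illustrative only, not a call site in this file): a
 * caller that wants every queued delayed ref flushed before committing
 * can use the (unsigned long)-1 "run everything" convention described
 * above:
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *	BUG_ON(ret);
 *	btrfs_end_transaction(trans, root);
 */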
1966 
1967 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
1968 				struct btrfs_root *root,
1969 				u64 bytenr, u64 num_bytes, u64 flags,
1970 				int is_data)
1971 {
1972 	struct btrfs_delayed_extent_op *extent_op;
1973 	int ret;
1974 
1975 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1976 	if (!extent_op)
1977 		return -ENOMEM;
1978 
1979 	extent_op->flags_to_set = flags;
1980 	extent_op->update_flags = 1;
1981 	extent_op->update_key = 0;
1982 	extent_op->is_data = is_data ? 1 : 0;
1983 
1984 	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
1985 	if (ret)
1986 		kfree(extent_op);
1987 	return ret;
1988 }
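
/*
 * Sketch of the queuing pattern above (BTRFS_BLOCK_FLAG_FULL_BACKREF is
 * just an example flag here): the caller only queues a delayed op, and
 * the extent item itself is rewritten later by run_delayed_extent_op():
 *
 *	ret = btrfs_set_disk_extent_flags(trans, root, buf->start,
 *					  buf->len,
 *					  BTRFS_BLOCK_FLAG_FULL_BACKREF, 0);
 *	BUG_ON(ret);
 */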
1989 
1990 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
1991 				      struct btrfs_root *root,
1992 				      struct btrfs_path *path,
1993 				      u64 objectid, u64 offset, u64 bytenr)
1994 {
1995 	struct btrfs_delayed_ref_head *head;
1996 	struct btrfs_delayed_ref_node *ref;
1997 	struct btrfs_delayed_data_ref *data_ref;
1998 	struct btrfs_delayed_ref_root *delayed_refs;
1999 	struct rb_node *node;
2000 	int ret = 0;
2001 
2002 	ret = -ENOENT;
2003 	delayed_refs = &trans->transaction->delayed_refs;
2004 	spin_lock(&delayed_refs->lock);
2005 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2006 	if (!head)
2007 		goto out;
2008 
2009 	if (!mutex_trylock(&head->mutex)) {
2010 		atomic_inc(&head->node.refs);
2011 		spin_unlock(&delayed_refs->lock);
2012 
2013 		btrfs_release_path(root->fs_info->extent_root, path);
2014 
2015 		mutex_lock(&head->mutex);
2016 		mutex_unlock(&head->mutex);
2017 		btrfs_put_delayed_ref(&head->node);
2018 		return -EAGAIN;
2019 	}
2020 
2021 	node = rb_prev(&head->node.rb_node);
2022 	if (!node)
2023 		goto out_unlock;
2024 
2025 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2026 
2027 	if (ref->bytenr != bytenr)
2028 		goto out_unlock;
2029 
2030 	ret = 1;
2031 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2032 		goto out_unlock;
2033 
2034 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2035 
2036 	node = rb_prev(node);
2037 	if (node) {
2038 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2039 		if (ref->bytenr == bytenr)
2040 			goto out_unlock;
2041 	}
2042 
2043 	if (data_ref->root != root->root_key.objectid ||
2044 	    data_ref->objectid != objectid || data_ref->offset != offset)
2045 		goto out_unlock;
2046 
2047 	ret = 0;
2048 out_unlock:
2049 	mutex_unlock(&head->mutex);
2050 out:
2051 	spin_unlock(&delayed_refs->lock);
2052 	return ret;
2053 }
2054 
2055 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2056 					struct btrfs_root *root,
2057 					struct btrfs_path *path,
2058 					u64 objectid, u64 offset, u64 bytenr)
2059 {
2060 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2061 	struct extent_buffer *leaf;
2062 	struct btrfs_extent_data_ref *ref;
2063 	struct btrfs_extent_inline_ref *iref;
2064 	struct btrfs_extent_item *ei;
2065 	struct btrfs_key key;
2066 	u32 item_size;
2067 	int ret;
2068 
2069 	key.objectid = bytenr;
2070 	key.offset = (u64)-1;
2071 	key.type = BTRFS_EXTENT_ITEM_KEY;
2072 
2073 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2074 	if (ret < 0)
2075 		goto out;
2076 	BUG_ON(ret == 0);
2077 
2078 	ret = -ENOENT;
2079 	if (path->slots[0] == 0)
2080 		goto out;
2081 
2082 	path->slots[0]--;
2083 	leaf = path->nodes[0];
2084 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2085 
2086 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2087 		goto out;
2088 
2089 	ret = 1;
2090 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2091 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2092 	if (item_size < sizeof(*ei)) {
2093 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2094 		goto out;
2095 	}
2096 #endif
2097 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2098 
2099 	if (item_size != sizeof(*ei) +
2100 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2101 		goto out;
2102 
2103 	if (btrfs_extent_generation(leaf, ei) <=
2104 	    btrfs_root_last_snapshot(&root->root_item))
2105 		goto out;
2106 
2107 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2108 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2109 	    BTRFS_EXTENT_DATA_REF_KEY)
2110 		goto out;
2111 
2112 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2113 	if (btrfs_extent_refs(leaf, ei) !=
2114 	    btrfs_extent_data_ref_count(leaf, ref) ||
2115 	    btrfs_extent_data_ref_root(leaf, ref) !=
2116 	    root->root_key.objectid ||
2117 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2118 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2119 		goto out;
2120 
2121 	ret = 0;
2122 out:
2123 	return ret;
2124 }
2125 
2126 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2127 			  struct btrfs_root *root,
2128 			  u64 objectid, u64 offset, u64 bytenr)
2129 {
2130 	struct btrfs_path *path;
2131 	int ret;
2132 	int ret2;
2133 
2134 	path = btrfs_alloc_path();
2135 	if (!path)
2136 		return -ENOMEM;
2137 
2138 	do {
2139 		ret = check_committed_ref(trans, root, path, objectid,
2140 					  offset, bytenr);
2141 		if (ret && ret != -ENOENT)
2142 			goto out;
2143 
2144 		ret2 = check_delayed_ref(trans, root, path, objectid,
2145 					 offset, bytenr);
2146 	} while (ret2 == -EAGAIN);
2147 
2148 	if (ret2 && ret2 != -ENOENT) {
2149 		ret = ret2;
2150 		goto out;
2151 	}
2152 
2153 	if (ret != -ENOENT || ret2 != -ENOENT)
2154 		ret = 0;
2155 out:
2156 	btrfs_free_path(path);
2157 	return ret;
2158 }
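
/*
 * Typical use (sketch only; file_offset and disk_bytenr are
 * hypothetical caller variables): decide whether an overwrite must cow
 * because some other root may still reference the file extent.  A
 * return of 0 means no cross reference was found; anything else means
 * shared, or we could not prove otherwise:
 *
 *	ret = btrfs_cross_ref_exist(trans, root, inode->i_ino,
 *				    file_offset, disk_bytenr);
 *	if (ret)
 *		goto must_cow;
 */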
2159 
2160 #if 0
2161 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2162 		    struct extent_buffer *buf, u32 nr_extents)
2163 {
2164 	struct btrfs_key key;
2165 	struct btrfs_file_extent_item *fi;
2166 	u64 root_gen;
2167 	u32 nritems;
2168 	int i;
2169 	int level;
2170 	int ret = 0;
2171 	int shared = 0;
2172 
2173 	if (!root->ref_cows)
2174 		return 0;
2175 
2176 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2177 		shared = 0;
2178 		root_gen = root->root_key.offset;
2179 	} else {
2180 		shared = 1;
2181 		root_gen = trans->transid - 1;
2182 	}
2183 
2184 	level = btrfs_header_level(buf);
2185 	nritems = btrfs_header_nritems(buf);
2186 
2187 	if (level == 0) {
2188 		struct btrfs_leaf_ref *ref;
2189 		struct btrfs_extent_info *info;
2190 
2191 		ref = btrfs_alloc_leaf_ref(root, nr_extents);
2192 		if (!ref) {
2193 			ret = -ENOMEM;
2194 			goto out;
2195 		}
2196 
2197 		ref->root_gen = root_gen;
2198 		ref->bytenr = buf->start;
2199 		ref->owner = btrfs_header_owner(buf);
2200 		ref->generation = btrfs_header_generation(buf);
2201 		ref->nritems = nr_extents;
2202 		info = ref->extents;
2203 
2204 		for (i = 0; nr_extents > 0 && i < nritems; i++) {
2205 			u64 disk_bytenr;
2206 			btrfs_item_key_to_cpu(buf, &key, i);
2207 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2208 				continue;
2209 			fi = btrfs_item_ptr(buf, i,
2210 					    struct btrfs_file_extent_item);
2211 			if (btrfs_file_extent_type(buf, fi) ==
2212 			    BTRFS_FILE_EXTENT_INLINE)
2213 				continue;
2214 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2215 			if (disk_bytenr == 0)
2216 				continue;
2217 
2218 			info->bytenr = disk_bytenr;
2219 			info->num_bytes =
2220 				btrfs_file_extent_disk_num_bytes(buf, fi);
2221 			info->objectid = key.objectid;
2222 			info->offset = key.offset;
2223 			info++;
2224 		}
2225 
2226 		ret = btrfs_add_leaf_ref(root, ref, shared);
2227 		if (ret == -EEXIST && shared) {
2228 			struct btrfs_leaf_ref *old;
2229 			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2230 			BUG_ON(!old);
2231 			btrfs_remove_leaf_ref(root, old);
2232 			btrfs_free_leaf_ref(root, old);
2233 			ret = btrfs_add_leaf_ref(root, ref, shared);
2234 		}
2235 		WARN_ON(ret);
2236 		btrfs_free_leaf_ref(root, ref);
2237 	}
2238 out:
2239 	return ret;
2240 }
2241 
2242 /* when a block goes through cow, we update the reference counts of
2243  * everything that block points to.  The internal pointers of the block
2244  * can be in just about any order, and it is likely to have clusters of
2245  * things that are close together and clusters of things that are not.
2246  *
2247  * To help reduce the seeks that come with updating all of these reference
2248  * counts, sort them by byte number before actual updates are done.
2249  *
2250  * struct refsort is used to match byte number to slot in the btree block.
2251  * we sort based on the byte number and then use the slot to actually
2252  * find the item.
2253  *
2254  * struct refsort is smaller than struct btrfs_item and smaller than
2255  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2256  * for a btree block, there's no way for a kmalloc of refsorts for a
2257  * single node to be bigger than a page.
2258  */
2259 struct refsort {
2260 	u64 bytenr;
2261 	u32 slot;
2262 };
2263 
2264 /*
2265  * for passing into sort()
2266  */
2267 static int refsort_cmp(const void *a_void, const void *b_void)
2268 {
2269 	const struct refsort *a = a_void;
2270 	const struct refsort *b = b_void;
2271 
2272 	if (a->bytenr < b->bytenr)
2273 		return -1;
2274 	if (a->bytenr > b->bytenr)
2275 		return 1;
2276 	return 0;
2277 }
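
/*
 * How the comparator would be used (this whole block is compiled out,
 * so the example is illustrative only):
 *
 *	struct refsort *sorted;
 *
 *	sorted = kmalloc(nritems * sizeof(struct refsort), GFP_NOFS);
 *	... fill in sorted[i].bytenr and sorted[i].slot for each slot ...
 *	sort(sorted, nritems, sizeof(struct refsort), refsort_cmp, NULL);
 */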
2278 #endif
2279 
2280 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2281 			   struct btrfs_root *root,
2282 			   struct extent_buffer *buf,
2283 			   int full_backref, int inc)
2284 {
2285 	u64 bytenr;
2286 	u64 num_bytes;
2287 	u64 parent;
2288 	u64 ref_root;
2289 	u32 nritems;
2290 	struct btrfs_key key;
2291 	struct btrfs_file_extent_item *fi;
2292 	int i;
2293 	int level;
2294 	int ret = 0;
2295 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2296 			    u64, u64, u64, u64, u64, u64);
2297 
2298 	ref_root = btrfs_header_owner(buf);
2299 	nritems = btrfs_header_nritems(buf);
2300 	level = btrfs_header_level(buf);
2301 
2302 	if (!root->ref_cows && level == 0)
2303 		return 0;
2304 
2305 	if (inc)
2306 		process_func = btrfs_inc_extent_ref;
2307 	else
2308 		process_func = btrfs_free_extent;
2309 
2310 	if (full_backref)
2311 		parent = buf->start;
2312 	else
2313 		parent = 0;
2314 
2315 	for (i = 0; i < nritems; i++) {
2316 		if (level == 0) {
2317 			btrfs_item_key_to_cpu(buf, &key, i);
2318 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2319 				continue;
2320 			fi = btrfs_item_ptr(buf, i,
2321 					    struct btrfs_file_extent_item);
2322 			if (btrfs_file_extent_type(buf, fi) ==
2323 			    BTRFS_FILE_EXTENT_INLINE)
2324 				continue;
2325 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2326 			if (bytenr == 0)
2327 				continue;
2328 
2329 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2330 			key.offset -= btrfs_file_extent_offset(buf, fi);
2331 			ret = process_func(trans, root, bytenr, num_bytes,
2332 					   parent, ref_root, key.objectid,
2333 					   key.offset);
2334 			if (ret)
2335 				goto fail;
2336 		} else {
2337 			bytenr = btrfs_node_blockptr(buf, i);
2338 			num_bytes = btrfs_level_size(root, level - 1);
2339 			ret = process_func(trans, root, bytenr, num_bytes,
2340 					   parent, ref_root, level - 1, 0);
2341 			if (ret)
2342 				goto fail;
2343 		}
2344 	}
2345 	return 0;
2346 fail:
2347 	BUG();
2348 	return ret;
2349 }
2350 
2351 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2352 		  struct extent_buffer *buf, int full_backref)
2353 {
2354 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2355 }
2356 
2357 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2358 		  struct extent_buffer *buf, int full_backref)
2359 {
2360 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2361 }
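
/*
 * Illustrative pairing (the real call sites are in the cow paths
 * outside this file): after copying a block, take references on
 * everything the new copy points to, then drop the old copy's:
 *
 *	ret = btrfs_inc_ref(trans, root, cow, 0);
 *	BUG_ON(ret);
 *	ret = btrfs_dec_ref(trans, root, buf, 0);
 *	BUG_ON(ret);
 */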
2362 
2363 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2364 				 struct btrfs_root *root,
2365 				 struct btrfs_path *path,
2366 				 struct btrfs_block_group_cache *cache)
2367 {
2368 	int ret;
2369 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2370 	unsigned long bi;
2371 	struct extent_buffer *leaf;
2372 
2373 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2374 	if (ret < 0)
2375 		goto fail;
2376 	BUG_ON(ret);
2377 
2378 	leaf = path->nodes[0];
2379 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2380 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2381 	btrfs_mark_buffer_dirty(leaf);
2382 	btrfs_release_path(extent_root, path);
2383 fail:
2384 	if (ret)
2385 		return ret;
2386 	return 0;
2387 
2388 }
2389 
2390 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2391 				   struct btrfs_root *root)
2392 {
2393 	struct btrfs_block_group_cache *cache, *entry;
2394 	struct rb_node *n;
2395 	int err = 0;
2396 	int werr = 0;
2397 	struct btrfs_path *path;
2398 	u64 last = 0;
2399 
2400 	path = btrfs_alloc_path();
2401 	if (!path)
2402 		return -ENOMEM;
2403 
2404 	while (1) {
2405 		cache = NULL;
2406 		spin_lock(&root->fs_info->block_group_cache_lock);
2407 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
2408 		     n; n = rb_next(n)) {
2409 			entry = rb_entry(n, struct btrfs_block_group_cache,
2410 					 cache_node);
2411 			if (entry->dirty) {
2412 				cache = entry;
2413 				break;
2414 			}
2415 		}
2416 		spin_unlock(&root->fs_info->block_group_cache_lock);
2417 
2418 		if (!cache)
2419 			break;
2420 
2421 		cache->dirty = 0;
2422 		last += cache->key.offset;
2423 
2424 		err = write_one_cache_group(trans, root,
2425 					    path, cache);
2426 		/*
2427 		 * if we fail to write the cache group, remember the
2428 		 * error and keep going so the remaining dirty groups
2429 		 * still get a chance to be written
2430 		 */
2431 		if (err) {
2432 			werr = err;
2433 			continue;
2434 		}
2435 	}
2436 	btrfs_free_path(path);
2437 	return werr;
2438 }
2439 
2440 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2441 {
2442 	struct btrfs_block_group_cache *block_group;
2443 	int readonly = 0;
2444 
2445 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2446 	if (!block_group || block_group->ro)
2447 		readonly = 1;
2448 	if (block_group)
2449 		btrfs_put_block_group(block_group);
2450 	return readonly;
2451 }
2452 
2453 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2454 			     u64 total_bytes, u64 bytes_used,
2455 			     struct btrfs_space_info **space_info)
2456 {
2457 	struct btrfs_space_info *found;
2458 
2459 	found = __find_space_info(info, flags);
2460 	if (found) {
2461 		spin_lock(&found->lock);
2462 		found->total_bytes += total_bytes;
2463 		found->bytes_used += bytes_used;
2464 		found->full = 0;
2465 		spin_unlock(&found->lock);
2466 		*space_info = found;
2467 		return 0;
2468 	}
2469 	found = kzalloc(sizeof(*found), GFP_NOFS);
2470 	if (!found)
2471 		return -ENOMEM;
2472 
2473 	INIT_LIST_HEAD(&found->block_groups);
2474 	init_rwsem(&found->groups_sem);
2475 	spin_lock_init(&found->lock);
2476 	found->flags = flags;
2477 	found->total_bytes = total_bytes;
2478 	found->bytes_used = bytes_used;
2479 	found->bytes_pinned = 0;
2480 	found->bytes_reserved = 0;
2481 	found->bytes_readonly = 0;
2482 	found->bytes_delalloc = 0;
2483 	found->full = 0;
2484 	found->force_alloc = 0;
2485 	*space_info = found;
2486 	list_add_rcu(&found->list, &info->space_info);
2487 	return 0;
2488 }
2489 
2490 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2491 {
2492 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2493 				   BTRFS_BLOCK_GROUP_RAID1 |
2494 				   BTRFS_BLOCK_GROUP_RAID10 |
2495 				   BTRFS_BLOCK_GROUP_DUP);
2496 	if (extra_flags) {
2497 		if (flags & BTRFS_BLOCK_GROUP_DATA)
2498 			fs_info->avail_data_alloc_bits |= extra_flags;
2499 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
2500 			fs_info->avail_metadata_alloc_bits |= extra_flags;
2501 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2502 			fs_info->avail_system_alloc_bits |= extra_flags;
2503 	}
2504 }
2505 
2506 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2507 {
2508 	spin_lock(&cache->space_info->lock);
2509 	spin_lock(&cache->lock);
2510 	if (!cache->ro) {
2511 		cache->space_info->bytes_readonly += cache->key.offset -
2512 					btrfs_block_group_used(&cache->item);
2513 		cache->ro = 1;
2514 	}
2515 	spin_unlock(&cache->lock);
2516 	spin_unlock(&cache->space_info->lock);
2517 }
2518 
2519 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2520 {
2521 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
2522 
2523 	if (num_devices == 1)
2524 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2525 	if (num_devices < 4)
2526 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2527 
2528 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2529 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2530 		      BTRFS_BLOCK_GROUP_RAID10))) {
2531 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
2532 	}
2533 
2534 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2535 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2536 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2537 	}
2538 
2539 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2540 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2541 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
2542 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
2543 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2544 	return flags;
2545 }
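
/*
 * Worked example: with a single rw device, a request for
 * (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 |
 * BTRFS_BLOCK_GROUP_RAID0) loses both raid bits in the first check and
 * reduces to plain BTRFS_BLOCK_GROUP_DATA.  With two devices the RAID1
 * bit survives and then knocks out RAID0 further down, giving
 * (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1).
 */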
2546 
2547 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2548 {
2549 	struct btrfs_fs_info *info = root->fs_info;
2550 	u64 alloc_profile;
2551 
2552 	if (data) {
2553 		alloc_profile = info->avail_data_alloc_bits &
2554 			info->data_alloc_profile;
2555 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2556 	} else if (root == root->fs_info->chunk_root) {
2557 		alloc_profile = info->avail_system_alloc_bits &
2558 			info->system_alloc_profile;
2559 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2560 	} else {
2561 		alloc_profile = info->avail_metadata_alloc_bits &
2562 			info->metadata_alloc_profile;
2563 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2564 	}
2565 
2566 	return btrfs_reduce_alloc_profile(root, data);
2567 }
2568 
2569 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2570 {
2571 	u64 alloc_target;
2572 
2573 	alloc_target = btrfs_get_alloc_profile(root, 1);
2574 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2575 						       alloc_target);
2576 }
2577 
2578 /*
2579  * for now this just makes sure we have at least 5% of our metadata space free
2580  * for use.
2581  */
2582 int btrfs_check_metadata_free_space(struct btrfs_root *root)
2583 {
2584 	struct btrfs_fs_info *info = root->fs_info;
2585 	struct btrfs_space_info *meta_sinfo;
2586 	u64 alloc_target, thresh;
2587 	int committed = 0, ret;
2588 
2589 	/* get the space info for where the metadata will live */
2590 	alloc_target = btrfs_get_alloc_profile(root, 0);
2591 	meta_sinfo = __find_space_info(info, alloc_target);
2592 
2593 again:
2594 	spin_lock(&meta_sinfo->lock);
2595 	if (!meta_sinfo->full)
2596 		thresh = meta_sinfo->total_bytes * 80;
2597 	else
2598 		thresh = meta_sinfo->total_bytes * 95;
2599 
2600 	do_div(thresh, 100);
2601 
2602 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2603 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2604 		struct btrfs_trans_handle *trans;
2605 		if (!meta_sinfo->full) {
2606 			meta_sinfo->force_alloc = 1;
2607 			spin_unlock(&meta_sinfo->lock);
2608 
2609 			trans = btrfs_start_transaction(root, 1);
2610 			if (!trans)
2611 				return -ENOMEM;
2612 
2613 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2614 					     2 * 1024 * 1024, alloc_target, 0);
2615 			btrfs_end_transaction(trans, root);
2616 			goto again;
2617 		}
2618 		spin_unlock(&meta_sinfo->lock);
2619 
2620 		if (!committed) {
2621 			committed = 1;
2622 			trans = btrfs_join_transaction(root, 1);
2623 			if (!trans)
2624 				return -ENOMEM;
2625 			ret = btrfs_commit_transaction(trans, root);
2626 			if (ret)
2627 				return ret;
2628 			goto again;
2629 		}
2630 		return -ENOSPC;
2631 	}
2632 	spin_unlock(&meta_sinfo->lock);
2633 
2634 	return 0;
2635 }
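
/*
 * Numerical example of the thresholds above: with 10GB of metadata
 * space and the space info not yet full, thresh is 8GB (80%); once
 * used + reserved + pinned + readonly crosses that, we try to allocate
 * another chunk.  Once the space info is full the bar moves to 9.5GB
 * (95%), after which we fall back to committing and finally -ENOSPC.
 */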
2636 
2637 /*
2638  * This will check the space that the inode allocates from to make sure we have
2639  * enough space for bytes.
2640  */
2641 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2642 				u64 bytes)
2643 {
2644 	struct btrfs_space_info *data_sinfo;
2645 	int ret = 0, committed = 0;
2646 
2647 	/* make sure bytes are sectorsize aligned */
2648 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2649 
2650 	data_sinfo = BTRFS_I(inode)->space_info;
2651 again:
2652 	/* make sure we have enough space to handle the data first */
2653 	spin_lock(&data_sinfo->lock);
2654 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2655 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2656 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2657 	    data_sinfo->bytes_may_use < bytes) {
2658 		struct btrfs_trans_handle *trans;
2659 
2660 		/*
2661 		 * if we don't have enough free bytes in this space then we need
2662 		 * to alloc a new chunk.
2663 		 */
2664 		if (!data_sinfo->full) {
2665 			u64 alloc_target;
2666 
2667 			data_sinfo->force_alloc = 1;
2668 			spin_unlock(&data_sinfo->lock);
2669 
2670 			alloc_target = btrfs_get_alloc_profile(root, 1);
2671 			trans = btrfs_start_transaction(root, 1);
2672 			if (!trans)
2673 				return -ENOMEM;
2674 
2675 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2676 					     bytes + 2 * 1024 * 1024,
2677 					     alloc_target, 0);
2678 			btrfs_end_transaction(trans, root);
2679 			if (ret)
2680 				return ret;
2681 			goto again;
2682 		}
2683 		spin_unlock(&data_sinfo->lock);
2684 
2685 		/* commit the current transaction and try again */
2686 		if (!committed) {
2687 			committed = 1;
2688 			trans = btrfs_join_transaction(root, 1);
2689 			if (!trans)
2690 				return -ENOMEM;
2691 			ret = btrfs_commit_transaction(trans, root);
2692 			if (ret)
2693 				return ret;
2694 			goto again;
2695 		}
2696 
2697 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2698 		       ", %llu bytes_used, %llu bytes_reserved, "
2699 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
2700 		       "%llu total\n", (unsigned long long)bytes,
2701 		       (unsigned long long)data_sinfo->bytes_delalloc,
2702 		       (unsigned long long)data_sinfo->bytes_used,
2703 		       (unsigned long long)data_sinfo->bytes_reserved,
2704 		       (unsigned long long)data_sinfo->bytes_pinned,
2705 		       (unsigned long long)data_sinfo->bytes_readonly,
2706 		       (unsigned long long)data_sinfo->bytes_may_use,
2707 		       (unsigned long long)data_sinfo->total_bytes);
2708 		return -ENOSPC;
2709 	}
2710 	data_sinfo->bytes_may_use += bytes;
2711 	BTRFS_I(inode)->reserved_bytes += bytes;
2712 	spin_unlock(&data_sinfo->lock);
2713 
2714 	return btrfs_check_metadata_free_space(root);
2715 }
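
/*
 * The reservation is meant to be paired with a release on failure
 * (sketch; do_the_write() stands in for a hypothetical caller):
 *
 *	ret = btrfs_check_data_free_space(root, inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, num_bytes);
 *	if (ret)
 *		btrfs_free_reserved_data_space(root, inode, num_bytes);
 */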
2716 
2717 /*
2718  * if there was an error for whatever reason after calling
2719  * btrfs_check_data_free_space, call this so we can cleanup the counters.
2720  */
2721 void btrfs_free_reserved_data_space(struct btrfs_root *root,
2722 				    struct inode *inode, u64 bytes)
2723 {
2724 	struct btrfs_space_info *data_sinfo;
2725 
2726 	/* make sure bytes are sectorsize aligned */
2727 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2728 
2729 	data_sinfo = BTRFS_I(inode)->space_info;
2730 	spin_lock(&data_sinfo->lock);
2731 	data_sinfo->bytes_may_use -= bytes;
2732 	BTRFS_I(inode)->reserved_bytes -= bytes;
2733 	spin_unlock(&data_sinfo->lock);
2734 }
2735 
2736 /* called when we are adding a delalloc extent to the inode's io_tree */
2737 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2738 				  u64 bytes)
2739 {
2740 	struct btrfs_space_info *data_sinfo;
2741 
2742 	/* get the space info for where this inode will be storing its data */
2743 	data_sinfo = BTRFS_I(inode)->space_info;
2744 
2745 	/* make sure we have enough space to handle the data first */
2746 	spin_lock(&data_sinfo->lock);
2747 	data_sinfo->bytes_delalloc += bytes;
2748 
2749 	/*
2750 	 * we are adding a delalloc extent without calling
2751 	 * btrfs_check_data_free_space first.  This happens on a weird
2752 	 * writepage condition, but shouldn't hurt our accounting
2753 	 */
2754 	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2755 		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2756 		BTRFS_I(inode)->reserved_bytes = 0;
2757 	} else {
2758 		data_sinfo->bytes_may_use -= bytes;
2759 		BTRFS_I(inode)->reserved_bytes -= bytes;
2760 	}
2761 
2762 	spin_unlock(&data_sinfo->lock);
2763 }
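
/*
 * Example of the clamp above: if writepage adds an 8192 byte delalloc
 * extent but only 4096 bytes were ever reserved, bytes_may_use drops by
 * the 4096 we actually had and reserved_bytes goes to zero, instead of
 * letting either counter underflow.
 */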
2764 
2765 /* called when we are clearing a delalloc extent from the inode's io_tree */
2766 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2767 			      u64 bytes)
2768 {
2769 	struct btrfs_space_info *info;
2770 
2771 	info = BTRFS_I(inode)->space_info;
2772 
2773 	spin_lock(&info->lock);
2774 	info->bytes_delalloc -= bytes;
2775 	spin_unlock(&info->lock);
2776 }
2777 
2778 static void force_metadata_allocation(struct btrfs_fs_info *info)
2779 {
2780 	struct list_head *head = &info->space_info;
2781 	struct btrfs_space_info *found;
2782 
2783 	rcu_read_lock();
2784 	list_for_each_entry_rcu(found, head, list) {
2785 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2786 			found->force_alloc = 1;
2787 	}
2788 	rcu_read_unlock();
2789 }
2790 
2791 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2792 			  struct btrfs_root *extent_root, u64 alloc_bytes,
2793 			  u64 flags, int force)
2794 {
2795 	struct btrfs_space_info *space_info;
2796 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
2797 	u64 thresh;
2798 	int ret = 0;
2799 
2800 	mutex_lock(&fs_info->chunk_mutex);
2801 
2802 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
2803 
2804 	space_info = __find_space_info(extent_root->fs_info, flags);
2805 	if (!space_info) {
2806 		ret = update_space_info(extent_root->fs_info, flags,
2807 					0, 0, &space_info);
2808 		BUG_ON(ret);
2809 	}
2810 	BUG_ON(!space_info);
2811 
2812 	spin_lock(&space_info->lock);
2813 	if (space_info->force_alloc) {
2814 		force = 1;
2815 		space_info->force_alloc = 0;
2816 	}
2817 	if (space_info->full) {
2818 		spin_unlock(&space_info->lock);
2819 		goto out;
2820 	}
2821 
2822 	thresh = space_info->total_bytes - space_info->bytes_readonly;
2823 	thresh = div_factor(thresh, 6);
2824 	if (!force &&
2825 	   (space_info->bytes_used + space_info->bytes_pinned +
2826 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
2827 		spin_unlock(&space_info->lock);
2828 		goto out;
2829 	}
2830 	spin_unlock(&space_info->lock);
2831 
2832 	/*
2833 	 * if we're doing a data chunk, go ahead and make sure that
2834 	 * we keep a reasonable number of metadata chunks allocated in the
2835 	 * FS as well.
2836 	 */
2837 	if (flags & BTRFS_BLOCK_GROUP_DATA) {
2838 		fs_info->data_chunk_allocations++;
2839 		if (!(fs_info->data_chunk_allocations %
2840 		      fs_info->metadata_ratio))
2841 			force_metadata_allocation(fs_info);
2842 	}
2843 
2844 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
2845 	if (ret)
2846 		space_info->full = 1;
2847 out:
2848 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
2849 	return ret;
2850 }
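
/*
 * The div_factor(thresh, 6) above is a 60% watermark: with 100GB of
 * writable space in a profile, a non-forced allocation is skipped until
 * used + pinned + reserved + alloc_bytes passes 60GB.  Callers that
 * must have a new chunk pass force = 1 and bypass the check entirely.
 */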
2851 
2852 static int update_block_group(struct btrfs_trans_handle *trans,
2853 			      struct btrfs_root *root,
2854 			      u64 bytenr, u64 num_bytes, int alloc,
2855 			      int mark_free)
2856 {
2857 	struct btrfs_block_group_cache *cache;
2858 	struct btrfs_fs_info *info = root->fs_info;
2859 	u64 total = num_bytes;
2860 	u64 old_val;
2861 	u64 byte_in_group;
2862 
2863 	/* block accounting for super block */
2864 	spin_lock(&info->delalloc_lock);
2865 	old_val = btrfs_super_bytes_used(&info->super_copy);
2866 	if (alloc)
2867 		old_val += num_bytes;
2868 	else
2869 		old_val -= num_bytes;
2870 	btrfs_set_super_bytes_used(&info->super_copy, old_val);
2871 
2872 	/* block accounting for root item */
2873 	old_val = btrfs_root_used(&root->root_item);
2874 	if (alloc)
2875 		old_val += num_bytes;
2876 	else
2877 		old_val -= num_bytes;
2878 	btrfs_set_root_used(&root->root_item, old_val);
2879 	spin_unlock(&info->delalloc_lock);
2880 
2881 	while (total) {
2882 		cache = btrfs_lookup_block_group(info, bytenr);
2883 		if (!cache)
2884 			return -1;
2885 		byte_in_group = bytenr - cache->key.objectid;
2886 		WARN_ON(byte_in_group > cache->key.offset);
2887 
2888 		spin_lock(&cache->space_info->lock);
2889 		spin_lock(&cache->lock);
2890 		cache->dirty = 1;
2891 		old_val = btrfs_block_group_used(&cache->item);
2892 		num_bytes = min(total, cache->key.offset - byte_in_group);
2893 		if (alloc) {
2894 			old_val += num_bytes;
2895 			cache->space_info->bytes_used += num_bytes;
2896 			if (cache->ro)
2897 				cache->space_info->bytes_readonly -= num_bytes;
2898 			btrfs_set_block_group_used(&cache->item, old_val);
2899 			spin_unlock(&cache->lock);
2900 			spin_unlock(&cache->space_info->lock);
2901 		} else {
2902 			old_val -= num_bytes;
2903 			cache->space_info->bytes_used -= num_bytes;
2904 			if (cache->ro)
2905 				cache->space_info->bytes_readonly += num_bytes;
2906 			btrfs_set_block_group_used(&cache->item, old_val);
2907 			spin_unlock(&cache->lock);
2908 			spin_unlock(&cache->space_info->lock);
2909 			if (mark_free) {
2910 				int ret;
2911 
2912 				ret = btrfs_discard_extent(root, bytenr,
2913 							   num_bytes);
2914 				WARN_ON(ret);
2915 
2916 				ret = btrfs_add_free_space(cache, bytenr,
2917 							   num_bytes);
2918 				WARN_ON(ret);
2919 			}
2920 		}
2921 		btrfs_put_block_group(cache);
2922 		total -= num_bytes;
2923 		bytenr += num_bytes;
2924 	}
2925 	return 0;
2926 }
2927 
2928 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2929 {
2930 	struct btrfs_block_group_cache *cache;
2931 	u64 bytenr;
2932 
2933 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2934 	if (!cache)
2935 		return 0;
2936 
2937 	bytenr = cache->key.objectid;
2938 	btrfs_put_block_group(cache);
2939 
2940 	return bytenr;
2941 }
2942 
2943 int btrfs_update_pinned_extents(struct btrfs_root *root,
2944 				u64 bytenr, u64 num, int pin)
2945 {
2946 	u64 len;
2947 	struct btrfs_block_group_cache *cache;
2948 	struct btrfs_fs_info *fs_info = root->fs_info;
2949 
2950 	if (pin) {
2951 		set_extent_dirty(&fs_info->pinned_extents,
2952 				bytenr, bytenr + num - 1, GFP_NOFS);
2953 	} else {
2954 		clear_extent_dirty(&fs_info->pinned_extents,
2955 				bytenr, bytenr + num - 1, GFP_NOFS);
2956 	}
2957 
2958 	while (num > 0) {
2959 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2960 		BUG_ON(!cache);
2961 		len = min(num, cache->key.offset -
2962 			  (bytenr - cache->key.objectid));
2963 		if (pin) {
2964 			spin_lock(&cache->space_info->lock);
2965 			spin_lock(&cache->lock);
2966 			cache->pinned += len;
2967 			cache->space_info->bytes_pinned += len;
2968 			spin_unlock(&cache->lock);
2969 			spin_unlock(&cache->space_info->lock);
2970 			fs_info->total_pinned += len;
2971 		} else {
2972 			spin_lock(&cache->space_info->lock);
2973 			spin_lock(&cache->lock);
2974 			cache->pinned -= len;
2975 			cache->space_info->bytes_pinned -= len;
2976 			spin_unlock(&cache->lock);
2977 			spin_unlock(&cache->space_info->lock);
2978 			fs_info->total_pinned -= len;
2979 			if (cache->cached)
2980 				btrfs_add_free_space(cache, bytenr, len);
2981 		}
2982 		btrfs_put_block_group(cache);
2983 		bytenr += len;
2984 		num -= len;
2985 	}
2986 	return 0;
2987 }
2988 
2989 static int update_reserved_extents(struct btrfs_root *root,
2990 				   u64 bytenr, u64 num, int reserve)
2991 {
2992 	u64 len;
2993 	struct btrfs_block_group_cache *cache;
2994 	struct btrfs_fs_info *fs_info = root->fs_info;
2995 
2996 	while (num > 0) {
2997 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2998 		BUG_ON(!cache);
2999 		len = min(num, cache->key.offset -
3000 			  (bytenr - cache->key.objectid));
3001 
3002 		spin_lock(&cache->space_info->lock);
3003 		spin_lock(&cache->lock);
3004 		if (reserve) {
3005 			cache->reserved += len;
3006 			cache->space_info->bytes_reserved += len;
3007 		} else {
3008 			cache->reserved -= len;
3009 			cache->space_info->bytes_reserved -= len;
3010 		}
3011 		spin_unlock(&cache->lock);
3012 		spin_unlock(&cache->space_info->lock);
3013 		btrfs_put_block_group(cache);
3014 		bytenr += len;
3015 		num -= len;
3016 	}
3017 	return 0;
3018 }
3019 
3020 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3021 {
3022 	u64 last = 0;
3023 	u64 start;
3024 	u64 end;
3025 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3026 	int ret;
3027 
3028 	while (1) {
3029 		ret = find_first_extent_bit(pinned_extents, last,
3030 					    &start, &end, EXTENT_DIRTY);
3031 		if (ret)
3032 			break;
3033 		set_extent_dirty(copy, start, end, GFP_NOFS);
3034 		last = end + 1;
3035 	}
3036 	return 0;
3037 }
3038 
3039 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3040 			       struct btrfs_root *root,
3041 			       struct extent_io_tree *unpin)
3042 {
3043 	u64 start;
3044 	u64 end;
3045 	int ret;
3046 
3047 	while (1) {
3048 		ret = find_first_extent_bit(unpin, 0, &start, &end,
3049 					    EXTENT_DIRTY);
3050 		if (ret)
3051 			break;
3052 
3053 		ret = btrfs_discard_extent(root, start, end + 1 - start);
3054 
3055 		/* unlocks the pinned mutex */
3056 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3057 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
3058 
3059 		cond_resched();
3060 	}
3061 	return ret;
3062 }
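
/*
 * Commit-time usage sketch (pinned_copy lives in the transaction commit
 * path, not in this file): snapshot the pinned extents before the super
 * block is written, then unpin and discard from the copy:
 *
 *	btrfs_copy_pinned(root, pinned_copy);
 *	... write and wait on the super block ...
 *	btrfs_finish_extent_commit(trans, root, pinned_copy);
 */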
3063 
3064 static int pin_down_bytes(struct btrfs_trans_handle *trans,
3065 			  struct btrfs_root *root,
3066 			  struct btrfs_path *path,
3067 			  u64 bytenr, u64 num_bytes, int is_data,
3068 			  struct extent_buffer **must_clean)
3069 {
3070 	int err = 0;
3071 	struct extent_buffer *buf;
3072 
3073 	if (is_data)
3074 		goto pinit;
3075 
3076 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3077 	if (!buf)
3078 		goto pinit;
3079 
3080 	/* we can reuse a block if it hasn't been written
3081 	 * and it is from this transaction.  We can't
3082 	 * reuse anything from the tree log root because
3083 	 * it has tiny sub-transactions.
3084 	 */
3085 	if (btrfs_buffer_uptodate(buf, 0) &&
3086 	    btrfs_try_tree_lock(buf)) {
3087 		u64 header_owner = btrfs_header_owner(buf);
3088 		u64 header_transid = btrfs_header_generation(buf);
3089 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3090 		    header_transid == trans->transid &&
3091 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3092 			*must_clean = buf;
3093 			return 1;
3094 		}
3095 		btrfs_tree_unlock(buf);
3096 	}
3097 	free_extent_buffer(buf);
3098 pinit:
3099 	btrfs_set_path_blocking(path);
3100 	/* unlocks the pinned mutex */
3101 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3102 
3103 	BUG_ON(err < 0);
3104 	return 0;
3105 }
3106 
3107 
3108 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3109 				struct btrfs_root *root,
3110 				u64 bytenr, u64 num_bytes, u64 parent,
3111 				u64 root_objectid, u64 owner_objectid,
3112 				u64 owner_offset, int refs_to_drop,
3113 				struct btrfs_delayed_extent_op *extent_op)
3114 {
3115 	struct btrfs_key key;
3116 	struct btrfs_path *path;
3117 	struct btrfs_fs_info *info = root->fs_info;
3118 	struct btrfs_root *extent_root = info->extent_root;
3119 	struct extent_buffer *leaf;
3120 	struct btrfs_extent_item *ei;
3121 	struct btrfs_extent_inline_ref *iref;
3122 	int ret;
3123 	int is_data;
3124 	int extent_slot = 0;
3125 	int found_extent = 0;
3126 	int num_to_del = 1;
3127 	u32 item_size;
3128 	u64 refs;
3129 
3130 	path = btrfs_alloc_path();
3131 	if (!path)
3132 		return -ENOMEM;
3133 
3134 	path->reada = 1;
3135 	path->leave_spinning = 1;
3136 
3137 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3138 	BUG_ON(!is_data && refs_to_drop != 1);
3139 
3140 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
3141 				    bytenr, num_bytes, parent,
3142 				    root_objectid, owner_objectid,
3143 				    owner_offset);
3144 	if (ret == 0) {
3145 		extent_slot = path->slots[0];
3146 		while (extent_slot >= 0) {
3147 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3148 					      extent_slot);
3149 			if (key.objectid != bytenr)
3150 				break;
3151 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3152 			    key.offset == num_bytes) {
3153 				found_extent = 1;
3154 				break;
3155 			}
3156 			if (path->slots[0] - extent_slot > 5)
3157 				break;
3158 			extent_slot--;
3159 		}
3160 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3161 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3162 		if (found_extent && item_size < sizeof(*ei))
3163 			found_extent = 0;
3164 #endif
3165 		if (!found_extent) {
3166 			BUG_ON(iref);
3167 			ret = remove_extent_backref(trans, extent_root, path,
3168 						    NULL, refs_to_drop,
3169 						    is_data);
3170 			BUG_ON(ret);
3171 			btrfs_release_path(extent_root, path);
3172 			path->leave_spinning = 1;
3173 
3174 			key.objectid = bytenr;
3175 			key.type = BTRFS_EXTENT_ITEM_KEY;
3176 			key.offset = num_bytes;
3177 
3178 			ret = btrfs_search_slot(trans, extent_root,
3179 						&key, path, -1, 1);
3180 			if (ret) {
3181 				printk(KERN_ERR "umm, got %d back from search"
3182 				       ", was looking for %llu\n", ret,
3183 				       (unsigned long long)bytenr);
3184 				btrfs_print_leaf(extent_root, path->nodes[0]);
3185 			}
3186 			BUG_ON(ret);
3187 			extent_slot = path->slots[0];
3188 		}
3189 	} else {
3190 		btrfs_print_leaf(extent_root, path->nodes[0]);
3191 		WARN_ON(1);
3192 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
3193 		       "parent %llu root %llu  owner %llu offset %llu\n",
3194 		       (unsigned long long)bytenr,
3195 		       (unsigned long long)parent,
3196 		       (unsigned long long)root_objectid,
3197 		       (unsigned long long)owner_objectid,
3198 		       (unsigned long long)owner_offset);
3199 	}
3200 
3201 	leaf = path->nodes[0];
3202 	item_size = btrfs_item_size_nr(leaf, extent_slot);
3203 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3204 	if (item_size < sizeof(*ei)) {
3205 		BUG_ON(found_extent || extent_slot != path->slots[0]);
3206 		ret = convert_extent_item_v0(trans, extent_root, path,
3207 					     owner_objectid, 0);
3208 		BUG_ON(ret < 0);
3209 
3210 		btrfs_release_path(extent_root, path);
3211 		path->leave_spinning = 1;
3212 
3213 		key.objectid = bytenr;
3214 		key.type = BTRFS_EXTENT_ITEM_KEY;
3215 		key.offset = num_bytes;
3216 
3217 		ret = btrfs_search_slot(trans, extent_root, &key, path,
3218 					-1, 1);
3219 		if (ret) {
3220 			printk(KERN_ERR "umm, got %d back from search"
3221 			       ", was looking for %llu\n", ret,
3222 			       (unsigned long long)bytenr);
3223 			btrfs_print_leaf(extent_root, path->nodes[0]);
3224 		}
3225 		BUG_ON(ret);
3226 		extent_slot = path->slots[0];
3227 		leaf = path->nodes[0];
3228 		item_size = btrfs_item_size_nr(leaf, extent_slot);
3229 	}
3230 #endif
3231 	BUG_ON(item_size < sizeof(*ei));
3232 	ei = btrfs_item_ptr(leaf, extent_slot,
3233 			    struct btrfs_extent_item);
3234 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3235 		struct btrfs_tree_block_info *bi;
3236 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3237 		bi = (struct btrfs_tree_block_info *)(ei + 1);
3238 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3239 	}
3240 
3241 	refs = btrfs_extent_refs(leaf, ei);
3242 	BUG_ON(refs < refs_to_drop);
3243 	refs -= refs_to_drop;
3244 
3245 	if (refs > 0) {
3246 		if (extent_op)
3247 			__run_delayed_extent_op(extent_op, leaf, ei);
3248 		/*
3249 		 * in the case of an inline back ref, the reference count
3250 		 * will be updated by remove_extent_backref
3251 		 */
3252 		if (iref) {
3253 			BUG_ON(!found_extent);
3254 		} else {
3255 			btrfs_set_extent_refs(leaf, ei, refs);
3256 			btrfs_mark_buffer_dirty(leaf);
3257 		}
3258 		if (found_extent) {
3259 			ret = remove_extent_backref(trans, extent_root, path,
3260 						    iref, refs_to_drop,
3261 						    is_data);
3262 			BUG_ON(ret);
3263 		}
3264 	} else {
3265 		int mark_free = 0;
3266 		struct extent_buffer *must_clean = NULL;
3267 
3268 		if (found_extent) {
3269 			BUG_ON(is_data && refs_to_drop !=
3270 			       extent_data_ref_count(root, path, iref));
3271 			if (iref) {
3272 				BUG_ON(path->slots[0] != extent_slot);
3273 			} else {
3274 				BUG_ON(path->slots[0] != extent_slot + 1);
3275 				path->slots[0] = extent_slot;
3276 				num_to_del = 2;
3277 			}
3278 		}
3279 
3280 		ret = pin_down_bytes(trans, root, path, bytenr,
3281 				     num_bytes, is_data, &must_clean);
3282 		if (ret > 0)
3283 			mark_free = 1;
3284 		BUG_ON(ret < 0);
3285 		/*
3286 		 * it is going to be very rare for someone to be waiting
3287 		 * on the block we're freeing.  del_items might need to
3288 		 * schedule, so rather than get fancy, just force it
3289 		 * to be blocking here
3290 		 */
3291 		if (must_clean)
3292 			btrfs_set_lock_blocking(must_clean);
3293 
3294 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3295 				      num_to_del);
3296 		BUG_ON(ret);
3297 		btrfs_release_path(extent_root, path);
3298 
3299 		if (must_clean) {
3300 			clean_tree_block(NULL, root, must_clean);
3301 			btrfs_tree_unlock(must_clean);
3302 			free_extent_buffer(must_clean);
3303 		}
3304 
3305 		if (is_data) {
3306 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3307 			BUG_ON(ret);
3308 		} else {
3309 			invalidate_mapping_pages(info->btree_inode->i_mapping,
3310 			     bytenr >> PAGE_CACHE_SHIFT,
3311 			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3312 		}
3313 
3314 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
3315 					 mark_free);
3316 		BUG_ON(ret);
3317 	}
3318 	btrfs_free_path(path);
3319 	return ret;
3320 }
3321 
3322 /*
3323  * when we free an extent, it is possible (and likely) that we free the last
3324  * delayed ref for that extent as well.  This searches the delayed ref tree for
3325  * a given extent, and if there are no other delayed refs to be processed, it
3326  * removes it from the tree.
3327  */
3328 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3329 				      struct btrfs_root *root, u64 bytenr)
3330 {
3331 	struct btrfs_delayed_ref_head *head;
3332 	struct btrfs_delayed_ref_root *delayed_refs;
3333 	struct btrfs_delayed_ref_node *ref;
3334 	struct rb_node *node;
3335 	int ret;
3336 
3337 	delayed_refs = &trans->transaction->delayed_refs;
3338 	spin_lock(&delayed_refs->lock);
3339 	head = btrfs_find_delayed_ref_head(trans, bytenr);
3340 	if (!head)
3341 		goto out;
3342 
3343 	node = rb_prev(&head->node.rb_node);
3344 	if (!node)
3345 		goto out;
3346 
3347 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3348 
3349 	/* there are still entries for this ref, we can't drop it */
3350 	if (ref->bytenr == bytenr)
3351 		goto out;
3352 
3353 	if (head->extent_op) {
3354 		if (!head->must_insert_reserved)
3355 			goto out;
3356 		kfree(head->extent_op);
3357 		head->extent_op = NULL;
3358 	}
3359 
3360 	/*
3361 	 * waiting for the lock here would deadlock.  If someone else has it
3362 	 * locked, they are already in the process of dropping it anyway
3363 	 */
3364 	if (!mutex_trylock(&head->mutex))
3365 		goto out;
3366 
3367 	/*
3368 	 * at this point we have a head with no other entries.  Go
3369 	 * ahead and process it.
3370 	 */
3371 	head->node.in_tree = 0;
3372 	rb_erase(&head->node.rb_node, &delayed_refs->root);
3373 
3374 	delayed_refs->num_entries--;
3375 
3376 	/*
3377 	 * we don't take a ref on the node because we're removing it from the
3378 	 * tree, so we just steal the ref the tree was holding.
3379 	 */
3380 	delayed_refs->num_heads--;
3381 	if (list_empty(&head->cluster))
3382 		delayed_refs->num_heads_ready--;
3383 
3384 	list_del_init(&head->cluster);
3385 	spin_unlock(&delayed_refs->lock);
3386 
3387 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
3388 				  &head->node, head->extent_op,
3389 				  head->must_insert_reserved);
3390 	BUG_ON(ret);
3391 	btrfs_put_delayed_ref(&head->node);
3392 	return 0;
3393 out:
3394 	spin_unlock(&delayed_refs->lock);
3395 	return 0;
3396 }
3397 
3398 int btrfs_free_extent(struct btrfs_trans_handle *trans,
3399 		      struct btrfs_root *root,
3400 		      u64 bytenr, u64 num_bytes, u64 parent,
3401 		      u64 root_objectid, u64 owner, u64 offset)
3402 {
3403 	int ret;
3404 
3405 	/*
3406 	 * tree log blocks never actually go into the extent allocation
3407 	 * tree, just update pinning info and exit early.
3408 	 */
3409 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3410 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3411 		/* unlocks the pinned mutex */
3412 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3413 		update_reserved_extents(root, bytenr, num_bytes, 0);
3414 		ret = 0;
3415 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3416 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
3417 					parent, root_objectid, (int)owner,
3418 					BTRFS_DROP_DELAYED_REF, NULL);
3419 		BUG_ON(ret);
3420 		ret = check_ref_cleanup(trans, root, bytenr);
3421 		BUG_ON(ret);
3422 	} else {
3423 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
3424 					parent, root_objectid, owner,
3425 					offset, BTRFS_DROP_DELAYED_REF, NULL);
3426 		BUG_ON(ret);
3427 	}
3428 	return ret;
3429 }
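
/*
 * Sketch of the common data case (this mirrors the process_func call in
 * __btrfs_mod_ref above): dropping the reference a file extent item
 * holds on its on-disk extent, with no shared parent:
 *
 *	ret = btrfs_free_extent(trans, root, disk_bytenr, disk_num_bytes,
 *				0, root->root_key.objectid,
 *				key.objectid, key.offset);
 */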
3430 
3431 static u64 stripe_align(struct btrfs_root *root, u64 val)
3432 {
3433 	u64 mask = ((u64)root->stripesize - 1);
3434 	u64 ret = (val + mask) & ~mask;
3435 	return ret;
3436 }
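
/*
 * e.g. with a 64K stripesize, stripe_align(root, 100000) returns
 * (100000 + 65535) & ~65535 = 131072, the next stripe boundary at or
 * above val; already-aligned values map to themselves.
 */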
3437 
3438 /*
3439  * walks the btree of allocated extents and finds a hole of a given size.
3440  * The key ins is changed to record the hole:
3441  * ins->objectid == block start
3442  * ins->flags == BTRFS_EXTENT_ITEM_KEY
3443  * ins->offset == number of blocks
3444  * Any available blocks before search_start are skipped.
3445  */
3446 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3447 				     struct btrfs_root *orig_root,
3448 				     u64 num_bytes, u64 empty_size,
3449 				     u64 search_start, u64 search_end,
3450 				     u64 hint_byte, struct btrfs_key *ins,
3451 				     u64 exclude_start, u64 exclude_nr,
3452 				     int data)
3453 {
3454 	int ret = 0;
3455 	struct btrfs_root *root = orig_root->fs_info->extent_root;
3456 	struct btrfs_free_cluster *last_ptr = NULL;
3457 	struct btrfs_block_group_cache *block_group = NULL;
3458 	int empty_cluster = 2 * 1024 * 1024;
3459 	int allowed_chunk_alloc = 0;
3460 	struct btrfs_space_info *space_info;
3461 	int last_ptr_loop = 0;
3462 	int loop = 0;
3463 
3464 	WARN_ON(num_bytes < root->sectorsize);
3465 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3466 	ins->objectid = 0;
3467 	ins->offset = 0;
3468 
3469 	space_info = __find_space_info(root->fs_info, data);
3470 
3471 	if (orig_root->ref_cows || empty_size)
3472 		allowed_chunk_alloc = 1;
3473 
3474 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
3475 		last_ptr = &root->fs_info->meta_alloc_cluster;
3476 		if (!btrfs_test_opt(root, SSD))
3477 			empty_cluster = 64 * 1024;
3478 	}
3479 
3480 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3481 		last_ptr = &root->fs_info->data_alloc_cluster;
3482 	}
3483 
3484 	if (last_ptr) {
3485 		spin_lock(&last_ptr->lock);
3486 		if (last_ptr->block_group)
3487 			hint_byte = last_ptr->window_start;
3488 		spin_unlock(&last_ptr->lock);
3489 	}
3490 
3491 	search_start = max(search_start, first_logical_byte(root, 0));
3492 	search_start = max(search_start, hint_byte);
3493 
3494 	if (!last_ptr) {
3495 		empty_cluster = 0;
3496 		loop = 1;
3497 	}
3498 
3499 	if (search_start == hint_byte) {
3500 		block_group = btrfs_lookup_block_group(root->fs_info,
3501 						       search_start);
3502 		if (block_group && block_group_bits(block_group, data)) {
3503 			down_read(&space_info->groups_sem);
3504 			if (list_empty(&block_group->list) ||
3505 			    block_group->ro) {
3506 				/*
3507 				 * someone is removing this block group,
3508 				 * we can't jump into the have_block_group
3509 				 * target because our list pointers are not
3510 				 * valid
3511 				 */
3512 				btrfs_put_block_group(block_group);
3513 				up_read(&space_info->groups_sem);
3514 			} else
3515 				goto have_block_group;
3516 		} else if (block_group) {
3517 			btrfs_put_block_group(block_group);
3518 		}
3519 	}
3520 
3521 search:
3522 	down_read(&space_info->groups_sem);
3523 	list_for_each_entry(block_group, &space_info->block_groups, list) {
3524 		u64 offset;
3525 
3526 		atomic_inc(&block_group->count);
3527 		search_start = block_group->key.objectid;
3528 
3529 have_block_group:
3530 		if (unlikely(!block_group->cached)) {
3531 			mutex_lock(&block_group->cache_mutex);
3532 			ret = cache_block_group(root, block_group);
3533 			mutex_unlock(&block_group->cache_mutex);
3534 			if (ret) {
3535 				btrfs_put_block_group(block_group);
3536 				break;
3537 			}
3538 		}
3539 
3540 		if (unlikely(block_group->ro))
3541 			goto loop;
3542 
3543 		if (last_ptr) {
3544 			/*
3545 			 * the refill lock keeps out other
3546 			 * people trying to start a new cluster
3547 			 */
3548 			spin_lock(&last_ptr->refill_lock);
3549 			if (last_ptr->block_group &&
3550 			    (last_ptr->block_group->ro ||
3551 			    !block_group_bits(last_ptr->block_group, data))) {
3552 				offset = 0;
3553 				goto refill_cluster;
3554 			}
3555 
3556 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
3557 						 num_bytes, search_start);
3558 			if (offset) {
3559 				/* we have a block, we're done */
3560 				spin_unlock(&last_ptr->refill_lock);
3561 				goto checks;
3562 			}
3563 
3564 			spin_lock(&last_ptr->lock);
3565 			/*
3566 			 * whoops, this cluster doesn't actually point to
3567 			 * this block group.  Get a ref on the block
3568 			 * group it does point to and try again
3569 			 */
3570 			if (!last_ptr_loop && last_ptr->block_group &&
3571 			    last_ptr->block_group != block_group) {
3572 
3573 				btrfs_put_block_group(block_group);
3574 				block_group = last_ptr->block_group;
3575 				atomic_inc(&block_group->count);
3576 				spin_unlock(&last_ptr->lock);
3577 				spin_unlock(&last_ptr->refill_lock);
3578 
3579 				last_ptr_loop = 1;
3580 				search_start = block_group->key.objectid;
3581 				/*
3582 				 * we know this block group is properly
3583 				 * in the list because
3584 				 * btrfs_remove_block_group, drops the
3585 				 * cluster before it removes the block
3586 				 * group from the list
3587 				 */
3588 				goto have_block_group;
3589 			}
3590 			spin_unlock(&last_ptr->lock);
3591 refill_cluster:
3592 			/*
3593 			 * this cluster didn't work out, free it and
3594 			 * start over
3595 			 */
3596 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
3597 
3598 			last_ptr_loop = 0;
3599 
3600 			/* allocate a cluster in this block group */
3601 			ret = btrfs_find_space_cluster(trans, root,
3602 					       block_group, last_ptr,
3603 					       offset, num_bytes,
3604 					       empty_cluster + empty_size);
3605 			if (ret == 0) {
3606 				/*
3607 				 * now pull our allocation out of this
3608 				 * cluster
3609 				 */
3610 				offset = btrfs_alloc_from_cluster(block_group,
3611 						  last_ptr, num_bytes,
3612 						  search_start);
3613 				if (offset) {
3614 					/* we found one, proceed */
3615 					spin_unlock(&last_ptr->refill_lock);
3616 					goto checks;
3617 				}
3618 			}
3619 			/*
3620 			 * at this point we either didn't find a cluster
3621 			 * or we weren't able to allocate a block from our
3622 			 * cluster.  Free the cluster we've been trying
3623 			 * to use, and go to the next block group
3624 			 */
3625 			if (loop < 2) {
3626 				btrfs_return_cluster_to_free_space(NULL,
3627 								   last_ptr);
3628 				spin_unlock(&last_ptr->refill_lock);
3629 				goto loop;
3630 			}
3631 			spin_unlock(&last_ptr->refill_lock);
3632 		}
3633 
3634 		offset = btrfs_find_space_for_alloc(block_group, search_start,
3635 						    num_bytes, empty_size);
3636 		if (!offset)
3637 			goto loop;
3638 checks:
3639 		search_start = stripe_align(root, offset);
3640 
3641 		/* move on to the next group */
3642 		if (search_start + num_bytes >= search_end) {
3643 			btrfs_add_free_space(block_group, offset, num_bytes);
3644 			goto loop;
3645 		}
3646 
3647 		/* move on to the next group */
3648 		if (search_start + num_bytes >
3649 		    block_group->key.objectid + block_group->key.offset) {
3650 			btrfs_add_free_space(block_group, offset, num_bytes);
3651 			goto loop;
3652 		}
3653 
3654 		if (exclude_nr > 0 &&
3655 		    (search_start + num_bytes > exclude_start &&
3656 		     search_start < exclude_start + exclude_nr)) {
3657 			search_start = exclude_start + exclude_nr;
3658 
3659 			btrfs_add_free_space(block_group, offset, num_bytes);
3660 			/*
3661 			 * if search_start is still in this block group
3662 			 * then we just re-search this block group
3663 			 */
3664 			if (search_start >= block_group->key.objectid &&
3665 			    search_start < (block_group->key.objectid +
3666 					    block_group->key.offset))
3667 				goto have_block_group;
3668 			goto loop;
3669 		}
3670 
3671 		ins->objectid = search_start;
3672 		ins->offset = num_bytes;
3673 
3674 		if (offset < search_start)
3675 			btrfs_add_free_space(block_group, offset,
3676 					     search_start - offset);
3677 		BUG_ON(offset > search_start);
3678 
3679 		/* we are all good, let's return */
3680 		break;
3681 loop:
3682 		btrfs_put_block_group(block_group);
3683 	}
3684 	up_read(&space_info->groups_sem);
3685 
3686 	/* loop == 0, try to find a clustered alloc in every block group
3687 	 * loop == 1, try again after forcing a chunk allocation
3688 	 * loop == 2, set empty_size and empty_cluster to 0 and try again
3689 	 */
3690 	if (!ins->objectid && loop < 3 &&
3691 	    (empty_size || empty_cluster || allowed_chunk_alloc)) {
3692 		if (loop >= 2) {
3693 			empty_size = 0;
3694 			empty_cluster = 0;
3695 		}
3696 
3697 		if (allowed_chunk_alloc) {
3698 			ret = do_chunk_alloc(trans, root, num_bytes +
3699 					     2 * 1024 * 1024, data, 1);
3700 			allowed_chunk_alloc = 0;
3701 		} else {
3702 			space_info->force_alloc = 1;
3703 		}
3704 
3705 		if (loop < 3) {
3706 			loop++;
3707 			goto search;
3708 		}
3709 		ret = -ENOSPC;
3710 	} else if (!ins->objectid) {
3711 		ret = -ENOSPC;
3712 	}
3713 
3714 	/* we found what we needed */
3715 	if (ins->objectid) {
3716 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
3717 			trans->block_group = block_group->key.objectid;
3718 
3719 		btrfs_put_block_group(block_group);
3720 		ret = 0;
3721 	}
3722 
3723 	return ret;
3724 }
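
/*
 * Editor's sketch of the retry ladder above (illustrative only; the
 * helper names are invented, the real logic is the search loop and
 * gotos in find_free_extent):
 *
 *	loop = 0;
 *	while (search_all_block_groups() != 0) {	// nothing found
 *		if (loop >= 3)
 *			return -ENOSPC;
 *		if (loop >= 2)		// last-ditch pass: no padding
 *			empty_size = empty_cluster = 0;
 *		allocate_or_force_new_chunk();
 *		loop++;
 *	}
 *	return 0;
 */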
3725 
3726 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3727 {
3728 	struct btrfs_block_group_cache *cache;
3729 
3730 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3731 	       (unsigned long long)(info->total_bytes - info->bytes_used -
3732 				    info->bytes_pinned - info->bytes_reserved),
3733 	       (info->full) ? "" : "not ");
3734 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3735 	       " may_use=%llu, used=%llu\n",
3736 	       (unsigned long long)info->total_bytes,
3737 	       (unsigned long long)info->bytes_pinned,
3738 	       (unsigned long long)info->bytes_delalloc,
3739 	       (unsigned long long)info->bytes_may_use,
3740 	       (unsigned long long)info->bytes_used);
3741 
3742 	down_read(&info->groups_sem);
3743 	list_for_each_entry(cache, &info->block_groups, list) {
3744 		spin_lock(&cache->lock);
3745 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3746 		       "%llu pinned %llu reserved\n",
3747 		       (unsigned long long)cache->key.objectid,
3748 		       (unsigned long long)cache->key.offset,
3749 		       (unsigned long long)btrfs_block_group_used(&cache->item),
3750 		       (unsigned long long)cache->pinned,
3751 		       (unsigned long long)cache->reserved);
3752 		btrfs_dump_free_space(cache, bytes);
3753 		spin_unlock(&cache->lock);
3754 	}
3755 	up_read(&info->groups_sem);
3756 }
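
/*
 * For reference, the "free" figure printed by dump_space_info() above
 * is derived from the space_info counters as:
 *
 *	free = total_bytes - bytes_used - bytes_pinned - bytes_reserved;
 */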
3757 
3758 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3759 				  struct btrfs_root *root,
3760 				  u64 num_bytes, u64 min_alloc_size,
3761 				  u64 empty_size, u64 hint_byte,
3762 				  u64 search_end, struct btrfs_key *ins,
3763 				  u64 data)
3764 {
3765 	int ret;
3766 	u64 search_start = 0;
3767 	struct btrfs_fs_info *info = root->fs_info;
3768 
3769 	data = btrfs_get_alloc_profile(root, data);
3770 again:
3771 	/*
3772 	 * the only place that sets empty_size is btrfs_realloc_node, which
3773 	 * is not called recursively on allocations
3774 	 */
3775 	if (empty_size || root->ref_cows) {
3776 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3777 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3778 				     2 * 1024 * 1024,
3779 				     BTRFS_BLOCK_GROUP_METADATA |
3780 				     (info->metadata_alloc_profile &
3781 				      info->avail_metadata_alloc_bits), 0);
3782 		}
3783 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3784 				     num_bytes + 2 * 1024 * 1024, data, 0);
3785 	}
3786 
3787 	WARN_ON(num_bytes < root->sectorsize);
3788 	ret = find_free_extent(trans, root, num_bytes, empty_size,
3789 			       search_start, search_end, hint_byte, ins,
3790 			       trans->alloc_exclude_start,
3791 			       trans->alloc_exclude_nr, data);
3792 
3793 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3794 		num_bytes = num_bytes >> 1;
3795 		num_bytes = num_bytes & ~(root->sectorsize - 1);
3796 		num_bytes = max(num_bytes, min_alloc_size);
3797 		do_chunk_alloc(trans, root->fs_info->extent_root,
3798 			       num_bytes, data, 1);
3799 		goto again;
3800 	}
3801 	if (ret) {
3802 		struct btrfs_space_info *sinfo;
3803 
3804 		sinfo = __find_space_info(root->fs_info, data);
3805 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
3806 		       "wanted %llu\n", (unsigned long long)data,
3807 		       (unsigned long long)num_bytes);
3808 		dump_space_info(sinfo, num_bytes);
3809 		BUG();
3810 	}
3811 
3812 	return ret;
3813 }
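
/*
 * A minimal sketch of the ENOSPC size fallback above (assuming
 * sectorsize is a power of two; the helper name is illustrative):
 *
 *	static u64 shrink_request(u64 num_bytes, u64 min_alloc_size,
 *				  u64 sectorsize)
 *	{
 *		num_bytes >>= 1;			// halve the request
 *		num_bytes &= ~(sectorsize - 1);		// round down to a sector
 *		return max(num_bytes, min_alloc_size);	// respect the minimum
 *	}
 *
 * The loop keeps shrinking and retrying; a failure once the request
 * has reached min_alloc_size dumps the space info and hits the BUG()
 * above.
 */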
3814 
3815 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3816 {
3817 	struct btrfs_block_group_cache *cache;
3818 	int ret = 0;
3819 
3820 	cache = btrfs_lookup_block_group(root->fs_info, start);
3821 	if (!cache) {
3822 		printk(KERN_ERR "Unable to find block group for %llu\n",
3823 		       (unsigned long long)start);
3824 		return -ENOSPC;
3825 	}
3826 
3827 	ret = btrfs_discard_extent(root, start, len);
3828 
3829 	btrfs_add_free_space(cache, start, len);
3830 	btrfs_put_block_group(cache);
3831 	update_reserved_extents(root, start, len, 0);
3832 
3833 	return ret;
3834 }
3835 
3836 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3837 				  struct btrfs_root *root,
3838 				  u64 num_bytes, u64 min_alloc_size,
3839 				  u64 empty_size, u64 hint_byte,
3840 				  u64 search_end, struct btrfs_key *ins,
3841 				  u64 data)
3842 {
3843 	int ret;
3844 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3845 				     empty_size, hint_byte, search_end, ins,
3846 				     data);
3847 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3848 	return ret;
3849 }
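
/*
 * Usage note (illustrative, not taken from an actual caller): a
 * reservation made here that ends up unused is handed back with
 * btrfs_free_reserved_extent() above:
 *
 *	struct btrfs_key ins;
 *	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
 *				   0, 0, (u64)-1, &ins, data);
 *	if (!ret && need_to_undo)
 *		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 */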
3850 
3851 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
3852 				      struct btrfs_root *root,
3853 				      u64 parent, u64 root_objectid,
3854 				      u64 flags, u64 owner, u64 offset,
3855 				      struct btrfs_key *ins, int ref_mod)
3856 {
3857 	int ret;
3858 	struct btrfs_fs_info *fs_info = root->fs_info;
3859 	struct btrfs_extent_item *extent_item;
3860 	struct btrfs_extent_inline_ref *iref;
3861 	struct btrfs_path *path;
3862 	struct extent_buffer *leaf;
3863 	int type;
3864 	u32 size;
3865 
3866 	if (parent > 0)
3867 		type = BTRFS_SHARED_DATA_REF_KEY;
3868 	else
3869 		type = BTRFS_EXTENT_DATA_REF_KEY;
3870 
3871 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
3872 
3873 	path = btrfs_alloc_path();
3874 	BUG_ON(!path);
3875 
3876 	path->leave_spinning = 1;
3877 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3878 				      ins, size);
3879 	BUG_ON(ret);
3880 
3881 	leaf = path->nodes[0];
3882 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
3883 				     struct btrfs_extent_item);
3884 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
3885 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3886 	btrfs_set_extent_flags(leaf, extent_item,
3887 			       flags | BTRFS_EXTENT_FLAG_DATA);
3888 
3889 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
3890 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
3891 	if (parent > 0) {
3892 		struct btrfs_shared_data_ref *ref;
3893 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
3894 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
3895 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
3896 	} else {
3897 		struct btrfs_extent_data_ref *ref;
3898 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3899 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
3900 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
3901 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
3902 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
3903 	}
3904 
3905 	btrfs_mark_buffer_dirty(path->nodes[0]);
3906 	btrfs_free_path(path);
3907 
3908 	ret = update_block_group(trans, root, ins->objectid, ins->offset,
3909 				 1, 0);
3910 	if (ret) {
3911 		printk(KERN_ERR "btrfs update block group failed for %llu "
3912 		       "%llu\n", (unsigned long long)ins->objectid,
3913 		       (unsigned long long)ins->offset);
3914 		BUG();
3915 	}
3916 	return ret;
3917 }
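
/*
 * Layout note: the item inserted by alloc_reserved_file_extent() packs
 * the extent item and a single inline backref back to back, matching
 * the size computed from btrfs_extent_inline_ref_size(type):
 *
 *	[ btrfs_extent_item | inline ref type | ref payload ]
 *
 * The payload is a btrfs_shared_data_ref when parent > 0 and a
 * btrfs_extent_data_ref (root/objectid/offset/count) otherwise.
 */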
3918 
3919 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
3920 				     struct btrfs_root *root,
3921 				     u64 parent, u64 root_objectid,
3922 				     u64 flags, struct btrfs_disk_key *key,
3923 				     int level, struct btrfs_key *ins)
3924 {
3925 	int ret;
3926 	struct btrfs_fs_info *fs_info = root->fs_info;
3927 	struct btrfs_extent_item *extent_item;
3928 	struct btrfs_tree_block_info *block_info;
3929 	struct btrfs_extent_inline_ref *iref;
3930 	struct btrfs_path *path;
3931 	struct extent_buffer *leaf;
3932 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
3933 
3934 	path = btrfs_alloc_path();
3935 	BUG_ON(!path);
3936 
3937 	path->leave_spinning = 1;
3938 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
3939 				      ins, size);
3940 	BUG_ON(ret);
3941 
3942 	leaf = path->nodes[0];
3943 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
3944 				     struct btrfs_extent_item);
3945 	btrfs_set_extent_refs(leaf, extent_item, 1);
3946 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
3947 	btrfs_set_extent_flags(leaf, extent_item,
3948 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
3949 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
3950 
3951 	btrfs_set_tree_block_key(leaf, block_info, key);
3952 	btrfs_set_tree_block_level(leaf, block_info, level);
3953 
3954 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
3955 	if (parent > 0) {
3956 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
3957 		btrfs_set_extent_inline_ref_type(leaf, iref,
3958 						 BTRFS_SHARED_BLOCK_REF_KEY);
3959 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
3960 	} else {
3961 		btrfs_set_extent_inline_ref_type(leaf, iref,
3962 						 BTRFS_TREE_BLOCK_REF_KEY);
3963 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
3964 	}
3965 
3966 	btrfs_mark_buffer_dirty(leaf);
3967 	btrfs_free_path(path);
3968 
3969 	ret = update_block_group(trans, root, ins->objectid, ins->offset,
3970 				 1, 0);
3971 	if (ret) {
3972 		printk(KERN_ERR "btrfs update block group failed for %llu "
3973 		       "%llu\n", (unsigned long long)ins->objectid,
3974 		       (unsigned long long)ins->offset);
3975 		BUG();
3976 	}
3977 	return ret;
3978 }
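
/*
 * Layout note: the tree block variant packs three structures back to
 * back, matching the size computation at the top of the function:
 *
 *	[ btrfs_extent_item | btrfs_tree_block_info | inline ref ]
 *
 * The inline ref is SHARED_BLOCK_REF (offset = parent) for full-backref
 * blocks and TREE_BLOCK_REF (offset = root_objectid) otherwise.
 */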
3979 
3980 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
3981 				     struct btrfs_root *root,
3982 				     u64 root_objectid, u64 owner,
3983 				     u64 offset, struct btrfs_key *ins)
3984 {
3985 	int ret;
3986 
3987 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
3988 
3989 	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
3990 					 0, root_objectid, owner, offset,
3991 					 BTRFS_ADD_DELAYED_EXTENT, NULL);
3992 	return ret;
3993 }
3994 
3995 /*
3996  * this is used by the tree logging recovery code.  It records that
3997  * an extent has been allocated and makes sure to clear the free
3998  * space cache bits as well
3999  */
4000 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4001 				   struct btrfs_root *root,
4002 				   u64 root_objectid, u64 owner, u64 offset,
4003 				   struct btrfs_key *ins)
4004 {
4005 	int ret;
4006 	struct btrfs_block_group_cache *block_group;
4007 
4008 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4009 	mutex_lock(&block_group->cache_mutex);
4010 	cache_block_group(root, block_group);
4011 	mutex_unlock(&block_group->cache_mutex);
4012 
4013 	ret = btrfs_remove_free_space(block_group, ins->objectid,
4014 				      ins->offset);
4015 	BUG_ON(ret);
4016 	btrfs_put_block_group(block_group);
4017 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4018 					 0, owner, offset, ins, 1);
4019 	return ret;
4020 }
4021 
4022 /*
4023  * finds a free extent and does all the dirty work required for allocation
4024  * returns the key for the first block of the newly allocated extent
4025  * through ins.
4026  *
4027  * returns 0 if everything worked, non-zero otherwise.
4028  */
4029 static int alloc_tree_block(struct btrfs_trans_handle *trans,
4030 			    struct btrfs_root *root,
4031 			    u64 num_bytes, u64 parent, u64 root_objectid,
4032 			    struct btrfs_disk_key *key, int level,
4033 			    u64 empty_size, u64 hint_byte, u64 search_end,
4034 			    struct btrfs_key *ins)
4035 {
4036 	int ret;
4037 	u64 flags = 0;
4038 
4039 	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4040 				     empty_size, hint_byte, search_end,
4041 				     ins, 0);
4042 	BUG_ON(ret);
4043 
4044 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4045 		if (parent == 0)
4046 			parent = ins->objectid;
4047 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4048 	} else
4049 		BUG_ON(parent > 0);
4050 
4051 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
4052 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4053 		struct btrfs_delayed_extent_op *extent_op;
4054 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4055 		BUG_ON(!extent_op);
4056 		if (key)
4057 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
4058 		else
4059 			memset(&extent_op->key, 0, sizeof(extent_op->key));
4060 		extent_op->flags_to_set = flags;
4061 		extent_op->update_key = 1;
4062 		extent_op->update_flags = 1;
4063 		extent_op->is_data = 0;
4064 
4065 		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4066 					ins->offset, parent, root_objectid,
4067 					level, BTRFS_ADD_DELAYED_EXTENT,
4068 					extent_op);
4069 		BUG_ON(ret);
4070 	}
4071 	return ret;
4072 }
4073 
4074 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4075 					    struct btrfs_root *root,
4076 					    u64 bytenr, u32 blocksize,
4077 					    int level)
4078 {
4079 	struct extent_buffer *buf;
4080 
4081 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
4082 	if (!buf)
4083 		return ERR_PTR(-ENOMEM);
4084 	btrfs_set_header_generation(buf, trans->transid);
4085 	btrfs_set_buffer_lockdep_class(buf, level);
4086 	btrfs_tree_lock(buf);
4087 	clean_tree_block(trans, root, buf);
4088 
4089 	btrfs_set_lock_blocking(buf);
4090 	btrfs_set_buffer_uptodate(buf);
4091 
4092 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4093 		set_extent_dirty(&root->dirty_log_pages, buf->start,
4094 			 buf->start + buf->len - 1, GFP_NOFS);
4095 	} else {
4096 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4097 			 buf->start + buf->len - 1, GFP_NOFS);
4098 	}
4099 	trans->blocks_used++;
4100 	/* this returns a buffer locked for blocking */
4101 	return buf;
4102 }
4103 
4104 /*
4105  * helper function to allocate a block for a given tree
4106  * returns the tree buffer or an ERR_PTR on error.
4107  */
4108 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4109 					struct btrfs_root *root, u32 blocksize,
4110 					u64 parent, u64 root_objectid,
4111 					struct btrfs_disk_key *key, int level,
4112 					u64 hint, u64 empty_size)
4113 {
4114 	struct btrfs_key ins;
4115 	int ret;
4116 	struct extent_buffer *buf;
4117 
4118 	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
4119 			       key, level, empty_size, hint, (u64)-1, &ins);
4120 	if (ret) {
4121 		BUG_ON(ret > 0);
4122 		return ERR_PTR(ret);
4123 	}
4124 
4125 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4126 				    blocksize, level);
4127 	return buf;
4128 }
4129 
4130 #if 0
4131 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
4132 			struct btrfs_root *root, struct extent_buffer *leaf)
4133 {
4134 	u64 disk_bytenr;
4135 	u64 num_bytes;
4136 	struct btrfs_key key;
4137 	struct btrfs_file_extent_item *fi;
4138 	u32 nritems;
4139 	int i;
4140 	int ret;
4141 
4142 	BUG_ON(!btrfs_is_leaf(leaf));
4143 	nritems = btrfs_header_nritems(leaf);
4144 
4145 	for (i = 0; i < nritems; i++) {
4146 		cond_resched();
4147 		btrfs_item_key_to_cpu(leaf, &key, i);
4148 
4149 		/* only extents have references, skip everything else */
4150 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4151 			continue;
4152 
4153 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4154 
4155 		/* inline extents live in the btree, they don't have refs */
4156 		if (btrfs_file_extent_type(leaf, fi) ==
4157 		    BTRFS_FILE_EXTENT_INLINE)
4158 			continue;
4159 
4160 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4161 
4162 		/* holes don't have refs */
4163 		if (disk_bytenr == 0)
4164 			continue;
4165 
4166 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4167 		ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
4168 					leaf->start, 0, key.objectid, 0);
4169 		BUG_ON(ret);
4170 	}
4171 	return 0;
4172 }
4173 
4174 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4175 					struct btrfs_root *root,
4176 					struct btrfs_leaf_ref *ref)
4177 {
4178 	int i;
4179 	int ret;
4180 	struct btrfs_extent_info *info;
4181 	struct refsort *sorted;
4182 
4183 	if (ref->nritems == 0)
4184 		return 0;
4185 
4186 	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
4187 	for (i = 0; i < ref->nritems; i++) {
4188 		sorted[i].bytenr = ref->extents[i].bytenr;
4189 		sorted[i].slot = i;
4190 	}
4191 	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
4192 
4193 	/*
4194 	 * the items in the ref were sorted when the ref was inserted
4195 	 * into the ref cache, so this is already in order
4196 	 */
4197 	for (i = 0; i < ref->nritems; i++) {
4198 		info = ref->extents + sorted[i].slot;
4199 		ret = btrfs_free_extent(trans, root, info->bytenr,
4200 					  info->num_bytes, ref->bytenr,
4201 					  ref->owner, ref->generation,
4202 					  info->objectid, 0);
4203 
4204 		atomic_inc(&root->fs_info->throttle_gen);
4205 		wake_up(&root->fs_info->transaction_throttle);
4206 		cond_resched();
4207 
4208 		BUG_ON(ret);
4209 		info++;
4210 	}
4211 
4212 	kfree(sorted);
4213 	return 0;
4214 }
4215 
4216 
4217 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
4218 				     struct btrfs_root *root, u64 start,
4219 				     u64 len, u32 *refs)
4220 {
4221 	int ret;
4222 
4223 	ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
4224 	BUG_ON(ret);
4225 
4226 #if 0 /* some debugging code in case we see problems here */
4227 	/* if the refs count is one, it won't get increased again.  But
4228 	 * if the ref count is > 1, someone may be decreasing it at
4229 	 * the same time we are.
4230 	 */
4231 	if (*refs != 1) {
4232 		struct extent_buffer *eb = NULL;
4233 		eb = btrfs_find_create_tree_block(root, start, len);
4234 		if (eb)
4235 			btrfs_tree_lock(eb);
4236 
4237 		mutex_lock(&root->fs_info->alloc_mutex);
4238 		ret = lookup_extent_ref(NULL, root, start, len, refs);
4239 		BUG_ON(ret);
4240 		mutex_unlock(&root->fs_info->alloc_mutex);
4241 
4242 		if (eb) {
4243 			btrfs_tree_unlock(eb);
4244 			free_extent_buffer(eb);
4245 		}
4246 		if (*refs == 1) {
4247 			printk(KERN_ERR "btrfs block %llu went down to one "
4248 			       "during drop_snap\n", (unsigned long long)start);
4249 		}
4250 
4251 	}
4252 #endif
4253 
4254 	cond_resched();
4255 	return ret;
4256 }
4257 
4258 
4259 /*
4260  * this is used while deleting old snapshots, and it drops the refs
4261  * on a whole subtree starting from a level 1 node.
4262  *
4263  * The idea is to sort all the leaf pointers, and then drop the
4264  * ref on all the leaves in order.  Most of the time the leaves
4265  * will have ref cache entries, so no leaf IOs will be required to
4266  * find the extents they have references on.
4267  *
4268  * For each leaf, any references it has are also dropped in order
4269  *
4270  * This ends up dropping the references in something close to optimal
4271  * order for reading and modifying the extent allocation tree.
4272  */
4273 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4274 					struct btrfs_root *root,
4275 					struct btrfs_path *path)
4276 {
4277 	u64 bytenr;
4278 	u64 root_owner;
4279 	u64 root_gen;
4280 	struct extent_buffer *eb = path->nodes[1];
4281 	struct extent_buffer *leaf;
4282 	struct btrfs_leaf_ref *ref;
4283 	struct refsort *sorted = NULL;
4284 	int nritems = btrfs_header_nritems(eb);
4285 	int ret;
4286 	int i;
4287 	int refi = 0;
4288 	int slot = path->slots[1];
4289 	u32 blocksize = btrfs_level_size(root, 0);
4290 	u32 refs;
4291 
4292 	if (nritems == 0)
4293 		goto out;
4294 
4295 	root_owner = btrfs_header_owner(eb);
4296 	root_gen = btrfs_header_generation(eb);
4297 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
4298 
4299 	/*
4300 	 * step one, sort all the leaf pointers so we don't scribble
4301 	 * randomly into the extent allocation tree
4302 	 */
4303 	for (i = slot; i < nritems; i++) {
4304 		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
4305 		sorted[refi].slot = i;
4306 		refi++;
4307 	}
4308 
4309 	/*
4310 	 * nritems won't be zero, but if we're picking up drop_snapshot
4311 	 * after a crash, slot might be > 0, so double check things
4312 	 * just in case.
4313 	 */
4314 	if (refi == 0)
4315 		goto out;
4316 
4317 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
4318 
4319 	/*
4320 	 * the first loop frees everything the leaves point to
4321 	 */
4322 	for (i = 0; i < refi; i++) {
4323 		u64 ptr_gen;
4324 
4325 		bytenr = sorted[i].bytenr;
4326 
4327 		/*
4328 		 * check the reference count on this leaf.  If it is > 1
4329 		 * we just decrement it below and don't update any
4330 		 * of the refs the leaf points to.
4331 		 */
4332 		ret = drop_snap_lookup_refcount(trans, root, bytenr,
4333 						blocksize, &refs);
4334 		BUG_ON(ret);
4335 		if (refs != 1)
4336 			continue;
4337 
4338 		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
4339 
4340 		/*
4341 		 * the leaf only had one reference, which means the
4342 		 * only thing pointing to this leaf is the snapshot
4343 		 * we're deleting.  It isn't possible for the reference
4344 		 * count to increase again later
4345 		 *
4346 		 * The reference cache is checked for the leaf,
4347 		 * and if found we'll be able to drop any refs held by
4348 		 * the leaf without needing to read it in.
4349 		 */
4350 		ref = btrfs_lookup_leaf_ref(root, bytenr);
4351 		if (ref && ref->generation != ptr_gen) {
4352 			btrfs_free_leaf_ref(root, ref);
4353 			ref = NULL;
4354 		}
4355 		if (ref) {
4356 			ret = cache_drop_leaf_ref(trans, root, ref);
4357 			BUG_ON(ret);
4358 			btrfs_remove_leaf_ref(root, ref);
4359 			btrfs_free_leaf_ref(root, ref);
4360 		} else {
4361 			/*
4362 			 * the leaf wasn't in the reference cache, so
4363 			 * we have to read it.
4364 			 */
4365 			leaf = read_tree_block(root, bytenr, blocksize,
4366 					       ptr_gen);
4367 			ret = btrfs_drop_leaf_ref(trans, root, leaf);
4368 			BUG_ON(ret);
4369 			free_extent_buffer(leaf);
4370 		}
4371 		atomic_inc(&root->fs_info->throttle_gen);
4372 		wake_up(&root->fs_info->transaction_throttle);
4373 		cond_resched();
4374 	}
4375 
4376 	/*
4377 	 * run through the loop again to free the refs on the leaves.
4378 	 * This is faster than doing it in the loop above because
4379 	 * the leaves are likely to be clustered together.  We end up
4380 	 * working in nice chunks on the extent allocation tree.
4381 	 */
4382 	for (i = 0; i < refi; i++) {
4383 		bytenr = sorted[i].bytenr;
4384 		ret = btrfs_free_extent(trans, root, bytenr,
4385 					blocksize, eb->start,
4386 					root_owner, root_gen, 0, 1);
4387 		BUG_ON(ret);
4388 
4389 		atomic_inc(&root->fs_info->throttle_gen);
4390 		wake_up(&root->fs_info->transaction_throttle);
4391 		cond_resched();
4392 	}
4393 out:
4394 	kfree(sorted);
4395 
4396 	/*
4397 	 * update the path to show we've processed the entire level 1
4398 	 * node.  This will get saved into the root's drop_snapshot_progress
4399 	 * field so these drops are not repeated again if this transaction
4400 	 * commits.
4401 	 */
4402 	path->slots[1] = nritems;
4403 	return 0;
4404 }
4405 
4406 /*
4407  * helper function for drop_snapshot, this walks down the tree dropping ref
4408  * counts as it goes.
4409  */
4410 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4411 				   struct btrfs_root *root,
4412 				   struct btrfs_path *path, int *level)
4413 {
4414 	u64 root_owner;
4415 	u64 root_gen;
4416 	u64 bytenr;
4417 	u64 ptr_gen;
4418 	struct extent_buffer *next;
4419 	struct extent_buffer *cur;
4420 	struct extent_buffer *parent;
4421 	u32 blocksize;
4422 	int ret;
4423 	u32 refs;
4424 
4425 	WARN_ON(*level < 0);
4426 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4427 	ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4428 				path->nodes[*level]->len, &refs);
4429 	BUG_ON(ret);
4430 	if (refs > 1)
4431 		goto out;
4432 
4433 	/*
4434 	 * walk down to the last node level and free all the leaves
4435 	 */
4436 	while (*level >= 0) {
4437 		WARN_ON(*level < 0);
4438 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
4439 		cur = path->nodes[*level];
4440 
4441 		if (btrfs_header_level(cur) != *level)
4442 			WARN_ON(1);
4443 
4444 		if (path->slots[*level] >=
4445 		    btrfs_header_nritems(cur))
4446 			break;
4447 
4448 		/* the new code goes down to level 1 and does all the
4449 		 * leaves pointed to by that node in bulk.  So, this check
4450 		 * for level 0 will always be false.
4451 		 *
4452 		 * But, the disk format allows the drop_snapshot_progress
4453 		 * field in the root to leave things in a state where
4454 		 * a leaf will need cleaning up here.  If someone crashes
4455 		 * with the old code and then boots with the new code,
4456 		 * we might find a leaf here.
4457 		 */
4458 		if (*level == 0) {
4459 			ret = btrfs_drop_leaf_ref(trans, root, cur);
4460 			BUG_ON(ret);
4461 			break;
4462 		}
4463 
4464 		/*
4465 		 * once we get to level one, process the whole node
4466 		 * at once, including everything below it.
4467 		 */
4468 		if (*level == 1) {
4469 			ret = drop_level_one_refs(trans, root, path);
4470 			BUG_ON(ret);
4471 			break;
4472 		}
4473 
4474 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4475 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4476 		blocksize = btrfs_level_size(root, *level - 1);
4477 
4478 		ret = drop_snap_lookup_refcount(trans, root, bytenr,
4479 						blocksize, &refs);
4480 		BUG_ON(ret);
4481 
4482 		/*
4483 		 * if there is more than one reference, we don't need
4484 		 * to read that node to drop any references it has.  We
4485 		 * just drop the ref we hold on that node and move on to the
4486 		 * next slot in this level.
4487 		 */
4488 		if (refs != 1) {
4489 			parent = path->nodes[*level];
4490 			root_owner = btrfs_header_owner(parent);
4491 			root_gen = btrfs_header_generation(parent);
4492 			path->slots[*level]++;
4493 
4494 			ret = btrfs_free_extent(trans, root, bytenr,
4495 						blocksize, parent->start,
4496 						root_owner, root_gen,
4497 						*level - 1, 1);
4498 			BUG_ON(ret);
4499 
4500 			atomic_inc(&root->fs_info->throttle_gen);
4501 			wake_up(&root->fs_info->transaction_throttle);
4502 			cond_resched();
4503 
4504 			continue;
4505 		}
4506 
4507 		/*
4508 		 * we need to keep freeing things in the next level down.
4509 		 * read the block and loop around to process it
4510 		 */
4511 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4512 		WARN_ON(*level <= 0);
4513 		if (path->nodes[*level-1])
4514 			free_extent_buffer(path->nodes[*level-1]);
4515 		path->nodes[*level-1] = next;
4516 		*level = btrfs_header_level(next);
4517 		path->slots[*level] = 0;
4518 		cond_resched();
4519 	}
4520 out:
4521 	WARN_ON(*level < 0);
4522 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4523 
4524 	if (path->nodes[*level] == root->node) {
4525 		parent = path->nodes[*level];
4526 		bytenr = path->nodes[*level]->start;
4527 	} else {
4528 		parent = path->nodes[*level + 1];
4529 		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4530 	}
4531 
4532 	blocksize = btrfs_level_size(root, *level);
4533 	root_owner = btrfs_header_owner(parent);
4534 	root_gen = btrfs_header_generation(parent);
4535 
4536 	/*
4537 	 * cleanup and free the reference on the last node
4538 	 * we processed
4539 	 */
4540 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4541 				  parent->start, root_owner, root_gen,
4542 				  *level, 1);
4543 	free_extent_buffer(path->nodes[*level]);
4544 	path->nodes[*level] = NULL;
4545 
4546 	*level += 1;
4547 	BUG_ON(ret);
4548 
4549 	cond_resched();
4550 	return 0;
4551 }
4552 #endif
4553 
4554 struct walk_control {
4555 	u64 refs[BTRFS_MAX_LEVEL];
4556 	u64 flags[BTRFS_MAX_LEVEL];
4557 	struct btrfs_key update_progress;
4558 	int stage;
4559 	int level;
4560 	int shared_level;
4561 	int update_ref;
4562 	int keep_locks;
4563 };
4564 
4565 #define DROP_REFERENCE	1
4566 #define UPDATE_BACKREF	2
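
/*
 * Editor's sketch of the two-stage walk driven by struct walk_control
 * (descriptive only; the authoritative logic is in walk_down_proc and
 * walk_up_proc below):
 *
 *	DROP_REFERENCE:  descend while blocks have refs == 1 and free
 *			 them on the way back up.  if a shared block
 *			 needs its backrefs updated, switch stages:
 *				wc->stage = UPDATE_BACKREF;
 *				wc->shared_level = level;
 *	UPDATE_BACKREF:  update backrefs for pointers in the shared
 *			 subtree.  once the walk climbs back up to
 *			 wc->shared_level, switch back:
 *				wc->stage = DROP_REFERENCE;
 *				wc->shared_level = -1;
 */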
4567 
4568 /*
4569  * helper to process a tree block while walking down the tree.
4570  *
4571  * when wc->stage == DROP_REFERENCE, this function checks the
4572  * reference count of the block. if the block is shared and
4573  * we need to update back refs for the subtree rooted at the
4574  * block, this function changes wc->stage to UPDATE_BACKREF
4575  *
4576  * when wc->stage == UPDATE_BACKREF, this function updates
4577  * back refs for pointers in the block.
4578  *
4579  * NOTE: return value 1 means we should stop walking down.
4580  */
4581 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4582 				   struct btrfs_root *root,
4583 				   struct btrfs_path *path,
4584 				   struct walk_control *wc)
4585 {
4586 	int level = wc->level;
4587 	struct extent_buffer *eb = path->nodes[level];
4588 	struct btrfs_key key;
4589 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4590 	int ret;
4591 
4592 	if (wc->stage == UPDATE_BACKREF &&
4593 	    btrfs_header_owner(eb) != root->root_key.objectid)
4594 		return 1;
4595 
4596 	/*
4597 	 * when the reference count of a tree block is 1, it won't increase
4598 	 * again. once the full backref flag is set, we never clear it.
4599 	 */
4600 	if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4601 	    (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4602 		BUG_ON(!path->locks[level]);
4603 		ret = btrfs_lookup_extent_info(trans, root,
4604 					       eb->start, eb->len,
4605 					       &wc->refs[level],
4606 					       &wc->flags[level]);
4607 		BUG_ON(ret);
4608 		BUG_ON(wc->refs[level] == 0);
4609 	}
4610 
4611 	if (wc->stage == DROP_REFERENCE &&
4612 	    wc->update_ref && wc->refs[level] > 1) {
4613 		BUG_ON(eb == root->node);
4614 		BUG_ON(path->slots[level] > 0);
4615 		if (level == 0)
4616 			btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4617 		else
4618 			btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4619 		if (btrfs_header_owner(eb) == root->root_key.objectid &&
4620 		    btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4621 			wc->stage = UPDATE_BACKREF;
4622 			wc->shared_level = level;
4623 		}
4624 	}
4625 
4626 	if (wc->stage == DROP_REFERENCE) {
4627 		if (wc->refs[level] > 1)
4628 			return 1;
4629 
4630 		if (path->locks[level] && !wc->keep_locks) {
4631 			btrfs_tree_unlock(eb);
4632 			path->locks[level] = 0;
4633 		}
4634 		return 0;
4635 	}
4636 
4637 	/* wc->stage == UPDATE_BACKREF */
4638 	if (!(wc->flags[level] & flag)) {
4639 		BUG_ON(!path->locks[level]);
4640 		ret = btrfs_inc_ref(trans, root, eb, 1);
4641 		BUG_ON(ret);
4642 		ret = btrfs_dec_ref(trans, root, eb, 0);
4643 		BUG_ON(ret);
4644 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
4645 						  eb->len, flag, 0);
4646 		BUG_ON(ret);
4647 		wc->flags[level] |= flag;
4648 	}
4649 
4650 	/*
4651 	 * the block is shared by multiple trees, so it's not good to
4652 	 * keep the tree lock
4653 	 */
4654 	if (path->locks[level] && level > 0) {
4655 		btrfs_tree_unlock(eb);
4656 		path->locks[level] = 0;
4657 	}
4658 	return 0;
4659 }
4660 
4661 /*
4662  * helper to process a tree block while walking up the tree.
4663  *
4664  * when wc->stage == DROP_REFERENCE, this function drops
4665  * reference count on the block.
4666  *
4667  * when wc->stage == UPDATE_BACKREF, this function changes
4668  * wc->stage back to DROP_REFERENCE if we changed wc->stage
4669  * to UPDATE_BACKREF previously while processing the block.
4670  *
4671  * NOTE: return value 1 means we should stop walking up.
4672  */
4673 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4674 				 struct btrfs_root *root,
4675 				 struct btrfs_path *path,
4676 				 struct walk_control *wc)
4677 {
4678 	int ret = 0;
4679 	int level = wc->level;
4680 	struct extent_buffer *eb = path->nodes[level];
4681 	u64 parent = 0;
4682 
4683 	if (wc->stage == UPDATE_BACKREF) {
4684 		BUG_ON(wc->shared_level < level);
4685 		if (level < wc->shared_level)
4686 			goto out;
4687 
4688 		BUG_ON(wc->refs[level] <= 1);
4689 		ret = find_next_key(path, level + 1, &wc->update_progress);
4690 		if (ret > 0)
4691 			wc->update_ref = 0;
4692 
4693 		wc->stage = DROP_REFERENCE;
4694 		wc->shared_level = -1;
4695 		path->slots[level] = 0;
4696 
4697 		/*
4698 		 * check reference count again if the block isn't locked.
4699 		 * we should start walking down the tree again if reference
4700 		 * count is one.
4701 		 */
4702 		if (!path->locks[level]) {
4703 			BUG_ON(level == 0);
4704 			btrfs_tree_lock(eb);
4705 			btrfs_set_lock_blocking(eb);
4706 			path->locks[level] = 1;
4707 
4708 			ret = btrfs_lookup_extent_info(trans, root,
4709 						       eb->start, eb->len,
4710 						       &wc->refs[level],
4711 						       &wc->flags[level]);
4712 			BUG_ON(ret);
4713 			BUG_ON(wc->refs[level] == 0);
4714 			if (wc->refs[level] == 1) {
4715 				btrfs_tree_unlock(eb);
4716 				path->locks[level] = 0;
4717 				return 1;
4718 			}
4719 		} else {
4720 			BUG_ON(level != 0);
4721 		}
4722 	}
4723 
4724 	/* wc->stage == DROP_REFERENCE */
4725 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
4726 
4727 	if (wc->refs[level] == 1) {
4728 		if (level == 0) {
4729 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4730 				ret = btrfs_dec_ref(trans, root, eb, 1);
4731 			else
4732 				ret = btrfs_dec_ref(trans, root, eb, 0);
4733 			BUG_ON(ret);
4734 		}
4735 		/* make the block-locked assertion in clean_tree_block happy */
4736 		if (!path->locks[level] &&
4737 		    btrfs_header_generation(eb) == trans->transid) {
4738 			btrfs_tree_lock(eb);
4739 			btrfs_set_lock_blocking(eb);
4740 			path->locks[level] = 1;
4741 		}
4742 		clean_tree_block(trans, root, eb);
4743 	}
4744 
4745 	if (eb == root->node) {
4746 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4747 			parent = eb->start;
4748 		else
4749 			BUG_ON(root->root_key.objectid !=
4750 			       btrfs_header_owner(eb));
4751 	} else {
4752 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4753 			parent = path->nodes[level + 1]->start;
4754 		else
4755 			BUG_ON(root->root_key.objectid !=
4756 			       btrfs_header_owner(path->nodes[level + 1]));
4757 	}
4758 
4759 	ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
4760 				root->root_key.objectid, level, 0);
4761 	BUG_ON(ret);
4762 out:
4763 	wc->refs[level] = 0;
4764 	wc->flags[level] = 0;
4765 	return ret;
4766 }
4767 
4768 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4769 				   struct btrfs_root *root,
4770 				   struct btrfs_path *path,
4771 				   struct walk_control *wc)
4772 {
4773 	struct extent_buffer *next;
4774 	struct extent_buffer *cur;
4775 	u64 bytenr;
4776 	u64 ptr_gen;
4777 	u32 blocksize;
4778 	int level = wc->level;
4779 	int ret;
4780 
4781 	while (level >= 0) {
4782 		cur = path->nodes[level];
4783 		BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
4784 
4785 		ret = walk_down_proc(trans, root, path, wc);
4786 		if (ret > 0)
4787 			break;
4788 
4789 		if (level == 0)
4790 			break;
4791 
4792 		bytenr = btrfs_node_blockptr(cur, path->slots[level]);
4793 		blocksize = btrfs_level_size(root, level - 1);
4794 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
4795 
4796 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4797 		btrfs_tree_lock(next);
4798 		btrfs_set_lock_blocking(next);
4799 
4800 		level--;
4801 		BUG_ON(level != btrfs_header_level(next));
4802 		path->nodes[level] = next;
4803 		path->slots[level] = 0;
4804 		path->locks[level] = 1;
4805 		wc->level = level;
4806 	}
4807 	return 0;
4808 }
4809 
4810 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
4811 				 struct btrfs_root *root,
4812 				 struct btrfs_path *path,
4813 				 struct walk_control *wc, int max_level)
4814 {
4815 	int level = wc->level;
4816 	int ret;
4817 
4818 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
4819 	while (level < max_level && path->nodes[level]) {
4820 		wc->level = level;
4821 		if (path->slots[level] + 1 <
4822 		    btrfs_header_nritems(path->nodes[level])) {
4823 			path->slots[level]++;
4824 			return 0;
4825 		} else {
4826 			ret = walk_up_proc(trans, root, path, wc);
4827 			if (ret > 0)
4828 				return 0;
4829 
4830 			if (path->locks[level]) {
4831 				btrfs_tree_unlock(path->nodes[level]);
4832 				path->locks[level] = 0;
4833 			}
4834 			free_extent_buffer(path->nodes[level]);
4835 			path->nodes[level] = NULL;
4836 			level++;
4837 		}
4838 	}
4839 	return 1;
4840 }
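
/*
 * The two walkers above are driven by a simple loop (see
 * btrfs_drop_snapshot below); simplified, with error handling elided:
 *
 *	while (1) {
 *		walk_down_tree(trans, root, path, wc);
 *		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 *		if (ret > 0)		// the whole tree was processed
 *			break;
 *	}
 */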
4841 
4842 /*
4843  * drop a subvolume tree.
4844  *
4845  * this function traverses the tree freeing any blocks that are only
4846  * referenced by the tree.
4847  *
4848  * when a shared tree block is found, this function decreases its
4849  * reference count by one. if update_ref is true, this function
4850  * also makes sure backrefs for the shared block and all lower level
4851  * blocks are properly updated.
4852  */
4853 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4854 {
4855 	struct btrfs_path *path;
4856 	struct btrfs_trans_handle *trans;
4857 	struct btrfs_root *tree_root = root->fs_info->tree_root;
4858 	struct btrfs_root_item *root_item = &root->root_item;
4859 	struct walk_control *wc;
4860 	struct btrfs_key key;
4861 	int err = 0;
4862 	int ret;
4863 	int level;
4864 
4865 	path = btrfs_alloc_path();
4866 	BUG_ON(!path);
4867 
4868 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
4869 	BUG_ON(!wc);
4870 
4871 	trans = btrfs_start_transaction(tree_root, 1);
4872 
4873 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
4874 		level = btrfs_header_level(root->node);
4875 		path->nodes[level] = btrfs_lock_root_node(root);
4876 		btrfs_set_lock_blocking(path->nodes[level]);
4877 		path->slots[level] = 0;
4878 		path->locks[level] = 1;
4879 		memset(&wc->update_progress, 0,
4880 		       sizeof(wc->update_progress));
4881 	} else {
4882 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
4883 		memcpy(&wc->update_progress, &key,
4884 		       sizeof(wc->update_progress));
4885 
4886 		level = root_item->drop_level;
4887 		BUG_ON(level == 0);
4888 		path->lowest_level = level;
4889 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4890 		path->lowest_level = 0;
4891 		if (ret < 0) {
4892 			err = ret;
4893 			goto out;
4894 		}
4895 		btrfs_node_key_to_cpu(path->nodes[level], &key,
4896 				      path->slots[level]);
4897 		WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
4898 
4899 		/*
4900 		 * unlock our path, this is safe because only this
4901 		 * function is allowed to delete this snapshot
4902 		 */
4903 		btrfs_unlock_up_safe(path, 0);
4904 
4905 		level = btrfs_header_level(root->node);
4906 		while (1) {
4907 			btrfs_tree_lock(path->nodes[level]);
4908 			btrfs_set_lock_blocking(path->nodes[level]);
4909 
4910 			ret = btrfs_lookup_extent_info(trans, root,
4911 						path->nodes[level]->start,
4912 						path->nodes[level]->len,
4913 						&wc->refs[level],
4914 						&wc->flags[level]);
4915 			BUG_ON(ret);
4916 			BUG_ON(wc->refs[level] == 0);
4917 
4918 			if (level == root_item->drop_level)
4919 				break;
4920 
4921 			btrfs_tree_unlock(path->nodes[level]);
4922 			WARN_ON(wc->refs[level] != 1);
4923 			level--;
4924 		}
4925 	}
4926 
4927 	wc->level = level;
4928 	wc->shared_level = -1;
4929 	wc->stage = DROP_REFERENCE;
4930 	wc->update_ref = update_ref;
4931 	wc->keep_locks = 0;
4932 
4933 	while (1) {
4934 		ret = walk_down_tree(trans, root, path, wc);
4935 		if (ret < 0) {
4936 			err = ret;
4937 			break;
4938 		}
4939 
4940 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
4941 		if (ret < 0) {
4942 			err = ret;
4943 			break;
4944 		}
4945 
4946 		if (ret > 0) {
4947 			BUG_ON(wc->stage != DROP_REFERENCE);
4948 			break;
4949 		}
4950 
4951 		if (wc->stage == DROP_REFERENCE) {
4952 			level = wc->level;
4953 			btrfs_node_key(path->nodes[level],
4954 				       &root_item->drop_progress,
4955 				       path->slots[level]);
4956 			root_item->drop_level = level;
4957 		}
4958 
4959 		BUG_ON(wc->level == 0);
4960 		if (trans->transaction->in_commit ||
4961 		    trans->transaction->delayed_refs.flushing) {
4962 			ret = btrfs_update_root(trans, tree_root,
4963 						&root->root_key,
4964 						root_item);
4965 			BUG_ON(ret);
4966 
4967 			btrfs_end_transaction(trans, tree_root);
4968 			trans = btrfs_start_transaction(tree_root, 1);
4969 		} else {
4970 			unsigned long update;
4971 			update = trans->delayed_ref_updates;
4972 			trans->delayed_ref_updates = 0;
4973 			if (update)
4974 				btrfs_run_delayed_refs(trans, tree_root,
4975 						       update);
4976 		}
4977 	}
4978 	btrfs_release_path(root, path);
4979 	BUG_ON(err);
4980 
4981 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
4982 	BUG_ON(ret);
4983 
4984 	free_extent_buffer(root->node);
4985 	free_extent_buffer(root->commit_root);
4986 	kfree(root);
4987 out:
4988 	btrfs_end_transaction(trans, tree_root);
4989 	kfree(wc);
4990 	btrfs_free_path(path);
4991 	return err;
4992 }
4993 
4994 /*
4995  * drop subtree rooted at tree block 'node'.
4996  *
4997  * NOTE: this function will unlock and release tree block 'node'
4998  */
4999 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5000 			struct btrfs_root *root,
5001 			struct extent_buffer *node,
5002 			struct extent_buffer *parent)
5003 {
5004 	struct btrfs_path *path;
5005 	struct walk_control *wc;
5006 	int level;
5007 	int parent_level;
5008 	int ret = 0;
5009 	int wret;
5010 
5011 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5012 
5013 	path = btrfs_alloc_path();
5014 	BUG_ON(!path);
5015 
5016 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5017 	BUG_ON(!wc);
5018 
5019 	btrfs_assert_tree_locked(parent);
5020 	parent_level = btrfs_header_level(parent);
5021 	extent_buffer_get(parent);
5022 	path->nodes[parent_level] = parent;
5023 	path->slots[parent_level] = btrfs_header_nritems(parent);
5024 
5025 	btrfs_assert_tree_locked(node);
5026 	level = btrfs_header_level(node);
5027 	path->nodes[level] = node;
5028 	path->slots[level] = 0;
5029 	path->locks[level] = 1;
5030 
5031 	wc->refs[parent_level] = 1;
5032 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5033 	wc->level = level;
5034 	wc->shared_level = -1;
5035 	wc->stage = DROP_REFERENCE;
5036 	wc->update_ref = 0;
5037 	wc->keep_locks = 1;
5038 
5039 	while (1) {
5040 		wret = walk_down_tree(trans, root, path, wc);
5041 		if (wret < 0) {
5042 			ret = wret;
5043 			break;
5044 		}
5045 
5046 		wret = walk_up_tree(trans, root, path, wc, parent_level);
5047 		if (wret < 0)
5048 			ret = wret;
5049 		if (wret != 0)
5050 			break;
5051 	}
5052 
5053 	kfree(wc);
5054 	btrfs_free_path(path);
5055 	return ret;
5056 }
5057 
5058 #if 0
5059 static unsigned long calc_ra(unsigned long start, unsigned long last,
5060 			     unsigned long nr)
5061 {
5062 	return min(last, start + nr - 1);
5063 }
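
/*
 * calc_ra() clamps a readahead window of nr pages so it never extends
 * past the last page of interest, e.g. calc_ra(10, 12, 8) returns
 * min(12, 10 + 8 - 1) == 12.
 */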
5064 
5065 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
5066 					 u64 len)
5067 {
5068 	u64 page_start;
5069 	u64 page_end;
5070 	unsigned long first_index;
5071 	unsigned long last_index;
5072 	unsigned long i;
5073 	struct page *page;
5074 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5075 	struct file_ra_state *ra;
5076 	struct btrfs_ordered_extent *ordered;
5077 	unsigned int total_read = 0;
5078 	unsigned int total_dirty = 0;
5079 	int ret = 0;
5080 
5081 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
5082 
5083 	mutex_lock(&inode->i_mutex);
5084 	first_index = start >> PAGE_CACHE_SHIFT;
5085 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
5086 
5087 	/* make sure the dirty trick played by the caller works */
5088 	ret = invalidate_inode_pages2_range(inode->i_mapping,
5089 					    first_index, last_index);
5090 	if (ret)
5091 		goto out_unlock;
5092 
5093 	file_ra_state_init(ra, inode->i_mapping);
5094 
5095 	for (i = first_index; i <= last_index; i++) {
5096 		if (total_read % ra->ra_pages == 0) {
5097 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
5098 				       calc_ra(i, last_index, ra->ra_pages));
5099 		}
5100 		total_read++;
5101 again:
5102 		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
5103 			BUG();
5104 		page = grab_cache_page(inode->i_mapping, i);
5105 		if (!page) {
5106 			ret = -ENOMEM;
5107 			goto out_unlock;
5108 		}
5109 		if (!PageUptodate(page)) {
5110 			btrfs_readpage(NULL, page);
5111 			lock_page(page);
5112 			if (!PageUptodate(page)) {
5113 				unlock_page(page);
5114 				page_cache_release(page);
5115 				ret = -EIO;
5116 				goto out_unlock;
5117 			}
5118 		}
5119 		wait_on_page_writeback(page);
5120 
5121 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
5122 		page_end = page_start + PAGE_CACHE_SIZE - 1;
5123 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
5124 
5125 		ordered = btrfs_lookup_ordered_extent(inode, page_start);
5126 		if (ordered) {
5127 			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5128 			unlock_page(page);
5129 			page_cache_release(page);
5130 			btrfs_start_ordered_extent(inode, ordered, 1);
5131 			btrfs_put_ordered_extent(ordered);
5132 			goto again;
5133 		}
5134 		set_page_extent_mapped(page);
5135 
5136 		if (i == first_index)
5137 			set_extent_bits(io_tree, page_start, page_end,
5138 					EXTENT_BOUNDARY, GFP_NOFS);
5139 		btrfs_set_extent_delalloc(inode, page_start, page_end);
5140 
5141 		set_page_dirty(page);
5142 		total_dirty++;
5143 
5144 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5145 		unlock_page(page);
5146 		page_cache_release(page);
5147 	}
5148 
5149 out_unlock:
5150 	kfree(ra);
5151 	mutex_unlock(&inode->i_mutex);
5152 	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
5153 	return ret;
5154 }
5155 
5156 static noinline int relocate_data_extent(struct inode *reloc_inode,
5157 					 struct btrfs_key *extent_key,
5158 					 u64 offset)
5159 {
5160 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5161 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
5162 	struct extent_map *em;
5163 	u64 start = extent_key->objectid - offset;
5164 	u64 end = start + extent_key->offset - 1;
5165 
5166 	em = alloc_extent_map(GFP_NOFS);
5167 	BUG_ON(!em || IS_ERR(em));
5168 
5169 	em->start = start;
5170 	em->len = extent_key->offset;
5171 	em->block_len = extent_key->offset;
5172 	em->block_start = extent_key->objectid;
5173 	em->bdev = root->fs_info->fs_devices->latest_bdev;
5174 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
5175 
5176 	/* setup extent map to cheat btrfs_readpage */
5177 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5178 	while (1) {
5179 		int ret;
5180 		spin_lock(&em_tree->lock);
5181 		ret = add_extent_mapping(em_tree, em);
5182 		spin_unlock(&em_tree->lock);
5183 		if (ret != -EEXIST) {
5184 			free_extent_map(em);
5185 			break;
5186 		}
5187 		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
5188 	}
5189 	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5190 
5191 	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
5192 }
5193 
5194 struct btrfs_ref_path {
5195 	u64 extent_start;
5196 	u64 nodes[BTRFS_MAX_LEVEL];
5197 	u64 root_objectid;
5198 	u64 root_generation;
5199 	u64 owner_objectid;
5200 	u32 num_refs;
5201 	int lowest_level;
5202 	int current_level;
5203 	int shared_level;
5204 
5205 	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
5206 	u64 new_nodes[BTRFS_MAX_LEVEL];
5207 };
5208 
5209 struct disk_extent {
5210 	u64 ram_bytes;
5211 	u64 disk_bytenr;
5212 	u64 disk_num_bytes;
5213 	u64 offset;
5214 	u64 num_bytes;
5215 	u8 compression;
5216 	u8 encryption;
5217 	u16 other_encoding;
5218 };
5219 
5220 static int is_cowonly_root(u64 root_objectid)
5221 {
5222 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
5223 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
5224 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
5225 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
5226 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5227 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
5228 		return 1;
5229 	return 0;
5230 }
5231 
5232 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
5233 				    struct btrfs_root *extent_root,
5234 				    struct btrfs_ref_path *ref_path,
5235 				    int first_time)
5236 {
5237 	struct extent_buffer *leaf;
5238 	struct btrfs_path *path;
5239 	struct btrfs_extent_ref *ref;
5240 	struct btrfs_key key;
5241 	struct btrfs_key found_key;
5242 	u64 bytenr;
5243 	u32 nritems;
5244 	int level;
5245 	int ret = 1;
5246 
5247 	path = btrfs_alloc_path();
5248 	if (!path)
5249 		return -ENOMEM;
5250 
5251 	if (first_time) {
5252 		ref_path->lowest_level = -1;
5253 		ref_path->current_level = -1;
5254 		ref_path->shared_level = -1;
5255 		goto walk_up;
5256 	}
5257 walk_down:
5258 	level = ref_path->current_level - 1;
5259 	while (level >= -1) {
5260 		u64 parent;
5261 		if (level < ref_path->lowest_level)
5262 			break;
5263 
5264 		if (level >= 0)
5265 			bytenr = ref_path->nodes[level];
5266 		else
5267 			bytenr = ref_path->extent_start;
5268 		BUG_ON(bytenr == 0);
5269 
5270 		parent = ref_path->nodes[level + 1];
5271 		ref_path->nodes[level + 1] = 0;
5272 		ref_path->current_level = level;
5273 		BUG_ON(parent == 0);
5274 
5275 		key.objectid = bytenr;
5276 		key.offset = parent + 1;
5277 		key.type = BTRFS_EXTENT_REF_KEY;
5278 
5279 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5280 		if (ret < 0)
5281 			goto out;
5282 		BUG_ON(ret == 0);
5283 
5284 		leaf = path->nodes[0];
5285 		nritems = btrfs_header_nritems(leaf);
5286 		if (path->slots[0] >= nritems) {
5287 			ret = btrfs_next_leaf(extent_root, path);
5288 			if (ret < 0)
5289 				goto out;
5290 			if (ret > 0)
5291 				goto next;
5292 			leaf = path->nodes[0];
5293 		}
5294 
5295 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5296 		if (found_key.objectid == bytenr &&
5297 		    found_key.type == BTRFS_EXTENT_REF_KEY) {
5298 			if (level < ref_path->shared_level)
5299 				ref_path->shared_level = level;
5300 			goto found;
5301 		}
5302 next:
5303 		level--;
5304 		btrfs_release_path(extent_root, path);
5305 		cond_resched();
5306 	}
5307 	/* reached lowest level */
5308 	ret = 1;
5309 	goto out;
5310 walk_up:
5311 	level = ref_path->current_level;
5312 	while (level < BTRFS_MAX_LEVEL - 1) {
5313 		u64 ref_objectid;
5314 
5315 		if (level >= 0)
5316 			bytenr = ref_path->nodes[level];
5317 		else
5318 			bytenr = ref_path->extent_start;
5319 
5320 		BUG_ON(bytenr == 0);
5321 
5322 		key.objectid = bytenr;
5323 		key.offset = 0;
5324 		key.type = BTRFS_EXTENT_REF_KEY;
5325 
5326 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5327 		if (ret < 0)
5328 			goto out;
5329 
5330 		leaf = path->nodes[0];
5331 		nritems = btrfs_header_nritems(leaf);
5332 		if (path->slots[0] >= nritems) {
5333 			ret = btrfs_next_leaf(extent_root, path);
5334 			if (ret < 0)
5335 				goto out;
5336 			if (ret > 0) {
5337 				/* the extent was freed by someone */
5338 				if (ref_path->lowest_level == level)
5339 					goto out;
5340 				btrfs_release_path(extent_root, path);
5341 				goto walk_down;
5342 			}
5343 			leaf = path->nodes[0];
5344 		}
5345 
5346 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5347 		if (found_key.objectid != bytenr ||
5348 				found_key.type != BTRFS_EXTENT_REF_KEY) {
5349 			/* the extent was freed by someone */
5350 			if (ref_path->lowest_level == level) {
5351 				ret = 1;
5352 				goto out;
5353 			}
5354 			btrfs_release_path(extent_root, path);
5355 			goto walk_down;
5356 		}
5357 found:
5358 		ref = btrfs_item_ptr(leaf, path->slots[0],
5359 				struct btrfs_extent_ref);
5360 		ref_objectid = btrfs_ref_objectid(leaf, ref);
5361 		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5362 			if (first_time) {
5363 				level = (int)ref_objectid;
5364 				BUG_ON(level >= BTRFS_MAX_LEVEL);
5365 				ref_path->lowest_level = level;
5366 				ref_path->current_level = level;
5367 				ref_path->nodes[level] = bytenr;
5368 			} else {
5369 				WARN_ON(ref_objectid != level);
5370 			}
5371 		} else {
5372 			WARN_ON(level != -1);
5373 		}
5374 		first_time = 0;
5375 
5376 		if (ref_path->lowest_level == level) {
5377 			ref_path->owner_objectid = ref_objectid;
5378 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
5379 		}
5380 
5381 		/*
5382 		 * the block is a tree root or the block isn't in a
5383 		 * reference counted tree.
5384 		 */
5385 		if (found_key.objectid == found_key.offset ||
5386 		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
5387 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
5388 			ref_path->root_generation =
5389 				btrfs_ref_generation(leaf, ref);
5390 			if (level < 0) {
5391 				/* special reference from the tree log */
5392 				ref_path->nodes[0] = found_key.offset;
5393 				ref_path->current_level = 0;
5394 			}
5395 			ret = 0;
5396 			goto out;
5397 		}
5398 
5399 		level++;
5400 		BUG_ON(ref_path->nodes[level] != 0);
5401 		ref_path->nodes[level] = found_key.offset;
5402 		ref_path->current_level = level;
5403 
5404 		/*
5405 		 * the reference was created in the running transaction,
5406 		 * no need to continue walking up.
5407 		 */
5408 		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
5409 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
5410 			ref_path->root_generation =
5411 				btrfs_ref_generation(leaf, ref);
5412 			ret = 0;
5413 			goto out;
5414 		}
5415 
5416 		btrfs_release_path(extent_root, path);
5417 		cond_resched();
5418 	}
5419 	/* reached max tree level, but no tree root found. */
5420 	BUG();
5421 out:
5422 	btrfs_free_path(path);
5423 	return ret;
5424 }
5425 
5426 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
5427 				struct btrfs_root *extent_root,
5428 				struct btrfs_ref_path *ref_path,
5429 				u64 extent_start)
5430 {
5431 	memset(ref_path, 0, sizeof(*ref_path));
5432 	ref_path->extent_start = extent_start;
5433 
5434 	return __next_ref_path(trans, extent_root, ref_path, 1);
5435 }
5436 
5437 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
5438 			       struct btrfs_root *extent_root,
5439 			       struct btrfs_ref_path *ref_path)
5440 {
5441 	return __next_ref_path(trans, extent_root, ref_path, 0);
5442 }
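
/*
 * A minimal usage sketch of the two helpers above (not compiled; the
 * variable names are illustrative only): callers walk every reference
 * path of an extent with the first/next pair, exactly as
 * relocate_one_extent() does below.
 */
#if 0
	ret = btrfs_first_ref_path(trans, extent_root, ref_path,
				   extent_key->objectid);
	while (ret == 0) {
		/* ref_path now describes one path from the extent to a root */
		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
	}
	/* ret > 0 means no more paths; ret < 0 is an error */
#endif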
5443 
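/*
 * Collect the extents in @reloc_inode that now back the data of
 * @extent_key.  On success, *@extents points to an array of *@nr_extents
 * disk_extent entries.  When @no_fragment is set and the data no longer
 * fits in a single extent, return 1 instead of building the array.
 */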
5444 static noinline int get_new_locations(struct inode *reloc_inode,
5445 				      struct btrfs_key *extent_key,
5446 				      u64 offset, int no_fragment,
5447 				      struct disk_extent **extents,
5448 				      int *nr_extents)
5449 {
5450 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5451 	struct btrfs_path *path;
5452 	struct btrfs_file_extent_item *fi;
5453 	struct extent_buffer *leaf;
5454 	struct disk_extent *exts = *extents;
5455 	struct btrfs_key found_key;
5456 	u64 cur_pos;
5457 	u64 last_byte;
5458 	u32 nritems;
5459 	int nr = 0;
5460 	int max = *nr_extents;
5461 	int ret;
5462 
5463 	WARN_ON(!no_fragment && *extents);
5464 	if (!exts) {
5465 		max = 1;
5466 		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
5467 		if (!exts)
5468 			return -ENOMEM;
5469 	}
5470 
5471 	path = btrfs_alloc_path();
5472 	BUG_ON(!path);
5473 
5474 	cur_pos = extent_key->objectid - offset;
5475 	last_byte = extent_key->objectid + extent_key->offset;
5476 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
5477 				       cur_pos, 0);
5478 	if (ret < 0)
5479 		goto out;
5480 	if (ret > 0) {
5481 		ret = -ENOENT;
5482 		goto out;
5483 	}
5484 
5485 	while (1) {
5486 		leaf = path->nodes[0];
5487 		nritems = btrfs_header_nritems(leaf);
5488 		if (path->slots[0] >= nritems) {
5489 			ret = btrfs_next_leaf(root, path);
5490 			if (ret < 0)
5491 				goto out;
5492 			if (ret > 0)
5493 				break;
5494 			leaf = path->nodes[0];
5495 		}
5496 
5497 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5498 		if (found_key.offset != cur_pos ||
5499 		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
5500 		    found_key.objectid != reloc_inode->i_ino)
5501 			break;
5502 
5503 		fi = btrfs_item_ptr(leaf, path->slots[0],
5504 				    struct btrfs_file_extent_item);
5505 		if (btrfs_file_extent_type(leaf, fi) !=
5506 		    BTRFS_FILE_EXTENT_REG ||
5507 		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
5508 			break;
5509 
		if (nr == max) {
			struct disk_extent *old = exts;
			max *= 2;
			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
			if (!exts) {
				/* keep the old array so the error path frees it */
				exts = old;
				ret = -ENOMEM;
				goto out;
			}
			memcpy(exts, old, sizeof(*exts) * nr);
			if (old != *extents)
				kfree(old);
		}
5518 
5519 		exts[nr].disk_bytenr =
5520 			btrfs_file_extent_disk_bytenr(leaf, fi);
5521 		exts[nr].disk_num_bytes =
5522 			btrfs_file_extent_disk_num_bytes(leaf, fi);
5523 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
5524 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5525 		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
5526 		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
5527 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
5528 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
5529 									   fi);
5530 		BUG_ON(exts[nr].offset > 0);
5531 		BUG_ON(exts[nr].compression || exts[nr].encryption);
5532 		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
5533 
5534 		cur_pos += exts[nr].num_bytes;
5535 		nr++;
5536 
5537 		if (cur_pos + offset >= last_byte)
5538 			break;
5539 
5540 		if (no_fragment) {
5541 			ret = 1;
5542 			goto out;
5543 		}
5544 		path->slots[0]++;
5545 	}
5546 
5547 	BUG_ON(cur_pos + offset > last_byte);
5548 	if (cur_pos + offset < last_byte) {
5549 		ret = -ENOENT;
5550 		goto out;
5551 	}
5552 	ret = 0;
5553 out:
5554 	btrfs_free_path(path);
5555 	if (ret) {
5556 		if (exts != *extents)
5557 			kfree(exts);
5558 	} else {
5559 		*extents = exts;
5560 		*nr_extents = nr;
5561 	}
5562 	return ret;
5563 }
5564 
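/*
 * Walk the file extent items in the tree described by @ref_path that
 * point at @extent_key, lock the corresponding file ranges and rewrite
 * each pointer to the new location in @new_extents.  Only the in-place
 * single extent case (@nr_extents == 1) is currently supported; the
 * multi-extent fallback below is disabled.
 */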
5565 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
5566 					struct btrfs_root *root,
5567 					struct btrfs_path *path,
5568 					struct btrfs_key *extent_key,
5569 					struct btrfs_key *leaf_key,
5570 					struct btrfs_ref_path *ref_path,
5571 					struct disk_extent *new_extents,
5572 					int nr_extents)
5573 {
5574 	struct extent_buffer *leaf;
5575 	struct btrfs_file_extent_item *fi;
5576 	struct inode *inode = NULL;
5577 	struct btrfs_key key;
5578 	u64 lock_start = 0;
5579 	u64 lock_end = 0;
5580 	u64 num_bytes;
5581 	u64 ext_offset;
5582 	u64 search_end = (u64)-1;
5583 	u32 nritems;
	int nr_scanned = 0;
5585 	int extent_locked = 0;
5586 	int extent_type;
5587 	int ret;
5588 
5589 	memcpy(&key, leaf_key, sizeof(key));
5590 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5591 		if (key.objectid < ref_path->owner_objectid ||
5592 		    (key.objectid == ref_path->owner_objectid &&
5593 		     key.type < BTRFS_EXTENT_DATA_KEY)) {
5594 			key.objectid = ref_path->owner_objectid;
5595 			key.type = BTRFS_EXTENT_DATA_KEY;
5596 			key.offset = 0;
5597 		}
5598 	}
5599 
5600 	while (1) {
5601 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5602 		if (ret < 0)
5603 			goto out;
5604 
5605 		leaf = path->nodes[0];
5606 		nritems = btrfs_header_nritems(leaf);
5607 next:
5608 		if (extent_locked && ret > 0) {
5609 			/*
5610 			 * the file extent item was modified by someone
5611 			 * before the extent got locked.
5612 			 */
5613 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5614 				      lock_end, GFP_NOFS);
5615 			extent_locked = 0;
5616 		}
5617 
5618 		if (path->slots[0] >= nritems) {
			if (++nr_scanned > 2)
5620 				break;
5621 
5622 			BUG_ON(extent_locked);
5623 			ret = btrfs_next_leaf(root, path);
5624 			if (ret < 0)
5625 				goto out;
5626 			if (ret > 0)
5627 				break;
5628 			leaf = path->nodes[0];
5629 			nritems = btrfs_header_nritems(leaf);
5630 		}
5631 
5632 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5633 
5634 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5635 			if ((key.objectid > ref_path->owner_objectid) ||
5636 			    (key.objectid == ref_path->owner_objectid &&
5637 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
5638 			    key.offset >= search_end)
5639 				break;
5640 		}
5641 
5642 		if (inode && key.objectid != inode->i_ino) {
5643 			BUG_ON(extent_locked);
5644 			btrfs_release_path(root, path);
5645 			mutex_unlock(&inode->i_mutex);
5646 			iput(inode);
5647 			inode = NULL;
5648 			continue;
5649 		}
5650 
5651 		if (key.type != BTRFS_EXTENT_DATA_KEY) {
5652 			path->slots[0]++;
5653 			ret = 1;
5654 			goto next;
5655 		}
5656 		fi = btrfs_item_ptr(leaf, path->slots[0],
5657 				    struct btrfs_file_extent_item);
5658 		extent_type = btrfs_file_extent_type(leaf, fi);
5659 		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
5660 		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
5661 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
5662 		     extent_key->objectid)) {
5663 			path->slots[0]++;
5664 			ret = 1;
5665 			goto next;
5666 		}
5667 
5668 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5669 		ext_offset = btrfs_file_extent_offset(leaf, fi);
5670 
5671 		if (search_end == (u64)-1) {
5672 			search_end = key.offset - ext_offset +
5673 				btrfs_file_extent_ram_bytes(leaf, fi);
5674 		}
5675 
5676 		if (!extent_locked) {
5677 			lock_start = key.offset;
5678 			lock_end = lock_start + num_bytes - 1;
5679 		} else {
5680 			if (lock_start > key.offset ||
5681 			    lock_end + 1 < key.offset + num_bytes) {
5682 				unlock_extent(&BTRFS_I(inode)->io_tree,
5683 					      lock_start, lock_end, GFP_NOFS);
5684 				extent_locked = 0;
5685 			}
5686 		}
5687 
5688 		if (!inode) {
5689 			btrfs_release_path(root, path);
5690 
			inode = btrfs_iget_locked(root->fs_info->sb,
						  key.objectid, root);
			if (inode && (inode->i_state & I_NEW)) {
5694 				BTRFS_I(inode)->root = root;
5695 				BTRFS_I(inode)->location.objectid =
5696 					key.objectid;
5697 				BTRFS_I(inode)->location.type =
5698 					BTRFS_INODE_ITEM_KEY;
5699 				BTRFS_I(inode)->location.offset = 0;
5700 				btrfs_read_locked_inode(inode);
5701 				unlock_new_inode(inode);
5702 			}
			/*
			 * some code paths call btrfs_commit_transaction
			 * while holding the i_mutex, so we can't block on
			 * the mutex here.
			 */
			if (!inode || is_bad_inode(inode) ||
			    !mutex_trylock(&inode->i_mutex)) {
5710 				iput(inode);
5711 				inode = NULL;
5712 				key.offset = (u64)-1;
5713 				goto skip;
5714 			}
5715 		}
5716 
5717 		if (!extent_locked) {
5718 			struct btrfs_ordered_extent *ordered;
5719 
5720 			btrfs_release_path(root, path);
5721 
5722 			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5723 				    lock_end, GFP_NOFS);
5724 			ordered = btrfs_lookup_first_ordered_extent(inode,
5725 								    lock_end);
5726 			if (ordered &&
5727 			    ordered->file_offset <= lock_end &&
5728 			    ordered->file_offset + ordered->len > lock_start) {
5729 				unlock_extent(&BTRFS_I(inode)->io_tree,
5730 					      lock_start, lock_end, GFP_NOFS);
5731 				btrfs_start_ordered_extent(inode, ordered, 1);
5732 				btrfs_put_ordered_extent(ordered);
5733 				key.offset += num_bytes;
5734 				goto skip;
5735 			}
5736 			if (ordered)
5737 				btrfs_put_ordered_extent(ordered);
5738 
5739 			extent_locked = 1;
5740 			continue;
5741 		}
5742 
5743 		if (nr_extents == 1) {
5744 			/* update extent pointer in place */
5745 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
5746 						new_extents[0].disk_bytenr);
5747 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5748 						new_extents[0].disk_num_bytes);
5749 			btrfs_mark_buffer_dirty(leaf);
5750 
5751 			btrfs_drop_extent_cache(inode, key.offset,
5752 						key.offset + num_bytes - 1, 0);
5753 
5754 			ret = btrfs_inc_extent_ref(trans, root,
5755 						new_extents[0].disk_bytenr,
5756 						new_extents[0].disk_num_bytes,
5757 						leaf->start,
5758 						root->root_key.objectid,
5759 						trans->transid,
5760 						key.objectid);
5761 			BUG_ON(ret);
5762 
5763 			ret = btrfs_free_extent(trans, root,
5764 						extent_key->objectid,
5765 						extent_key->offset,
5766 						leaf->start,
5767 						btrfs_header_owner(leaf),
5768 						btrfs_header_generation(leaf),
5769 						key.objectid, 0);
5770 			BUG_ON(ret);
5771 
5772 			btrfs_release_path(root, path);
5773 			key.offset += num_bytes;
5774 		} else {
5775 			BUG_ON(1);
5776 #if 0
5777 			u64 alloc_hint;
5778 			u64 extent_len;
5779 			int i;
5780 			/*
			 * drop the old extent pointer first, then insert the
			 * new pointers one by one
5783 			 */
5784 			btrfs_release_path(root, path);
5785 			ret = btrfs_drop_extents(trans, root, inode, key.offset,
5786 						 key.offset + num_bytes,
5787 						 key.offset, &alloc_hint);
5788 			BUG_ON(ret);
5789 
5790 			for (i = 0; i < nr_extents; i++) {
5791 				if (ext_offset >= new_extents[i].num_bytes) {
5792 					ext_offset -= new_extents[i].num_bytes;
5793 					continue;
5794 				}
5795 				extent_len = min(new_extents[i].num_bytes -
5796 						 ext_offset, num_bytes);
5797 
5798 				ret = btrfs_insert_empty_item(trans, root,
5799 							      path, &key,
5800 							      sizeof(*fi));
5801 				BUG_ON(ret);
5802 
5803 				leaf = path->nodes[0];
5804 				fi = btrfs_item_ptr(leaf, path->slots[0],
5805 						struct btrfs_file_extent_item);
5806 				btrfs_set_file_extent_generation(leaf, fi,
5807 							trans->transid);
5808 				btrfs_set_file_extent_type(leaf, fi,
5809 							BTRFS_FILE_EXTENT_REG);
5810 				btrfs_set_file_extent_disk_bytenr(leaf, fi,
5811 						new_extents[i].disk_bytenr);
5812 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5813 						new_extents[i].disk_num_bytes);
5814 				btrfs_set_file_extent_ram_bytes(leaf, fi,
5815 						new_extents[i].ram_bytes);
5816 
5817 				btrfs_set_file_extent_compression(leaf, fi,
5818 						new_extents[i].compression);
5819 				btrfs_set_file_extent_encryption(leaf, fi,
5820 						new_extents[i].encryption);
5821 				btrfs_set_file_extent_other_encoding(leaf, fi,
5822 						new_extents[i].other_encoding);
5823 
5824 				btrfs_set_file_extent_num_bytes(leaf, fi,
5825 							extent_len);
5826 				ext_offset += new_extents[i].offset;
5827 				btrfs_set_file_extent_offset(leaf, fi,
5828 							ext_offset);
5829 				btrfs_mark_buffer_dirty(leaf);
5830 
5831 				btrfs_drop_extent_cache(inode, key.offset,
5832 						key.offset + extent_len - 1, 0);
5833 
5834 				ret = btrfs_inc_extent_ref(trans, root,
5835 						new_extents[i].disk_bytenr,
5836 						new_extents[i].disk_num_bytes,
5837 						leaf->start,
5838 						root->root_key.objectid,
5839 						trans->transid, key.objectid);
5840 				BUG_ON(ret);
5841 				btrfs_release_path(root, path);
5842 
5843 				inode_add_bytes(inode, extent_len);
5844 
5845 				ext_offset = 0;
5846 				num_bytes -= extent_len;
5847 				key.offset += extent_len;
5848 
5849 				if (num_bytes == 0)
5850 					break;
5851 			}
5852 			BUG_ON(i >= nr_extents);
5853 #endif
5854 		}
5855 
5856 		if (extent_locked) {
5857 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5858 				      lock_end, GFP_NOFS);
5859 			extent_locked = 0;
5860 		}
5861 skip:
5862 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
5863 		    key.offset >= search_end)
5864 			break;
5865 
5866 		cond_resched();
5867 	}
5868 	ret = 0;
5869 out:
5870 	btrfs_release_path(root, path);
5871 	if (inode) {
5872 		mutex_unlock(&inode->i_mutex);
5873 		if (extent_locked) {
5874 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5875 				      lock_end, GFP_NOFS);
5876 		}
5877 		iput(inode);
5878 	}
5879 	return ret;
5880 }
5881 
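/*
 * @buf is a relocated copy of the leaf that used to live at @orig_start.
 * Clone the cached leaf ref of the original block so the copy gets an
 * up to date ref cache entry as well.
 */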
5882 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
5883 			       struct btrfs_root *root,
5884 			       struct extent_buffer *buf, u64 orig_start)
5885 {
5886 	int level;
5887 	int ret;
5888 
5889 	BUG_ON(btrfs_header_generation(buf) != trans->transid);
5890 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5891 
5892 	level = btrfs_header_level(buf);
5893 	if (level == 0) {
5894 		struct btrfs_leaf_ref *ref;
5895 		struct btrfs_leaf_ref *orig_ref;
5896 
5897 		orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
5898 		if (!orig_ref)
5899 			return -ENOENT;
5900 
5901 		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
5902 		if (!ref) {
5903 			btrfs_free_leaf_ref(root, orig_ref);
5904 			return -ENOMEM;
5905 		}
5906 
5907 		ref->nritems = orig_ref->nritems;
5908 		memcpy(ref->extents, orig_ref->extents,
5909 			sizeof(ref->extents[0]) * ref->nritems);
5910 
5911 		btrfs_free_leaf_ref(root, orig_ref);
5912 
5913 		ref->root_gen = trans->transid;
5914 		ref->bytenr = buf->start;
5915 		ref->owner = btrfs_header_owner(buf);
5916 		ref->generation = btrfs_header_generation(buf);
5917 
5918 		ret = btrfs_add_leaf_ref(root, ref, 0);
5919 		WARN_ON(ret);
5920 		btrfs_free_leaf_ref(root, ref);
5921 	}
5922 	return 0;
5923 }
5924 
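/*
 * Drop the cached extent mappings of every file extent in @leaf for
 * inodes that live in @target_root, forcing the new extent locations
 * to be read back from the tree.
 */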
5925 static noinline int invalidate_extent_cache(struct btrfs_root *root,
5926 					struct extent_buffer *leaf,
5927 					struct btrfs_block_group_cache *group,
5928 					struct btrfs_root *target_root)
5929 {
5930 	struct btrfs_key key;
5931 	struct inode *inode = NULL;
5932 	struct btrfs_file_extent_item *fi;
5933 	u64 num_bytes;
5934 	u64 skip_objectid = 0;
5935 	u32 nritems;
5936 	u32 i;
5937 
5938 	nritems = btrfs_header_nritems(leaf);
5939 	for (i = 0; i < nritems; i++) {
5940 		btrfs_item_key_to_cpu(leaf, &key, i);
5941 		if (key.objectid == skip_objectid ||
5942 		    key.type != BTRFS_EXTENT_DATA_KEY)
5943 			continue;
5944 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
5945 		if (btrfs_file_extent_type(leaf, fi) ==
5946 		    BTRFS_FILE_EXTENT_INLINE)
5947 			continue;
5948 		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
5949 			continue;
5950 		if (!inode || inode->i_ino != key.objectid) {
5951 			iput(inode);
5952 			inode = btrfs_ilookup(target_root->fs_info->sb,
5953 					      key.objectid, target_root, 1);
5954 		}
5955 		if (!inode) {
5956 			skip_objectid = key.objectid;
5957 			continue;
5958 		}
5959 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5960 
5961 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5962 			    key.offset + num_bytes - 1, GFP_NOFS);
5963 		btrfs_drop_extent_cache(inode, key.offset,
5964 					key.offset + num_bytes - 1, 1);
5965 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5966 			      key.offset + num_bytes - 1, GFP_NOFS);
5967 		cond_resched();
5968 	}
5969 	iput(inode);
5970 	return 0;
5971 }
5972 
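/*
 * For each file extent item in @leaf that falls inside @group, point the
 * item at the data's new location in @reloc_inode and update the cached
 * leaf ref to match.
 */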
5973 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5974 					struct btrfs_root *root,
5975 					struct extent_buffer *leaf,
5976 					struct btrfs_block_group_cache *group,
5977 					struct inode *reloc_inode)
5978 {
5979 	struct btrfs_key key;
5980 	struct btrfs_key extent_key;
5981 	struct btrfs_file_extent_item *fi;
5982 	struct btrfs_leaf_ref *ref;
5983 	struct disk_extent *new_extent;
5984 	u64 bytenr;
5985 	u64 num_bytes;
5986 	u32 nritems;
5987 	u32 i;
5988 	int ext_index;
5989 	int nr_extent;
5990 	int ret;
5991 
5992 	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
5993 	BUG_ON(!new_extent);
5994 
5995 	ref = btrfs_lookup_leaf_ref(root, leaf->start);
5996 	BUG_ON(!ref);
5997 
5998 	ext_index = -1;
5999 	nritems = btrfs_header_nritems(leaf);
6000 	for (i = 0; i < nritems; i++) {
6001 		btrfs_item_key_to_cpu(leaf, &key, i);
6002 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6003 			continue;
6004 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6005 		if (btrfs_file_extent_type(leaf, fi) ==
6006 		    BTRFS_FILE_EXTENT_INLINE)
6007 			continue;
6008 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6009 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6010 		if (bytenr == 0)
6011 			continue;
6012 
6013 		ext_index++;
6014 		if (bytenr >= group->key.objectid + group->key.offset ||
6015 		    bytenr + num_bytes <= group->key.objectid)
6016 			continue;
6017 
6018 		extent_key.objectid = bytenr;
6019 		extent_key.offset = num_bytes;
6020 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6021 		nr_extent = 1;
6022 		ret = get_new_locations(reloc_inode, &extent_key,
6023 					group->key.objectid, 1,
6024 					&new_extent, &nr_extent);
6025 		if (ret > 0)
6026 			continue;
6027 		BUG_ON(ret < 0);
6028 
6029 		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
6030 		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
6031 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
6032 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
6033 
6034 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
6035 						new_extent->disk_bytenr);
6036 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6037 						new_extent->disk_num_bytes);
6038 		btrfs_mark_buffer_dirty(leaf);
6039 
6040 		ret = btrfs_inc_extent_ref(trans, root,
6041 					new_extent->disk_bytenr,
6042 					new_extent->disk_num_bytes,
6043 					leaf->start,
6044 					root->root_key.objectid,
6045 					trans->transid, key.objectid);
6046 		BUG_ON(ret);
6047 
6048 		ret = btrfs_free_extent(trans, root,
6049 					bytenr, num_bytes, leaf->start,
6050 					btrfs_header_owner(leaf),
6051 					btrfs_header_generation(leaf),
6052 					key.objectid, 0);
6053 		BUG_ON(ret);
6054 		cond_resched();
6055 	}
6056 	kfree(new_extent);
6057 	BUG_ON(ext_index + 1 != ref->nritems);
6058 	btrfs_free_leaf_ref(root, ref);
6059 	return 0;
6060 }
6061 
6062 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
6063 			  struct btrfs_root *root)
6064 {
6065 	struct btrfs_root *reloc_root;
6066 	int ret;
6067 
6068 	if (root->reloc_root) {
6069 		reloc_root = root->reloc_root;
6070 		root->reloc_root = NULL;
6071 		list_add(&reloc_root->dead_list,
6072 			 &root->fs_info->dead_reloc_roots);
6073 
		btrfs_set_root_bytenr(&reloc_root->root_item,
				      reloc_root->node->start);
		btrfs_set_root_level(&reloc_root->root_item,
				     btrfs_header_level(reloc_root->node));
6078 		memset(&reloc_root->root_item.drop_progress, 0,
6079 			sizeof(struct btrfs_disk_key));
6080 		reloc_root->root_item.drop_level = 0;
6081 
6082 		ret = btrfs_update_root(trans, root->fs_info->tree_root,
6083 					&reloc_root->root_key,
6084 					&reloc_root->root_item);
6085 		BUG_ON(ret);
6086 	}
6087 	return 0;
6088 }
6089 
6090 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
6091 {
6092 	struct btrfs_trans_handle *trans;
6093 	struct btrfs_root *reloc_root;
6094 	struct btrfs_root *prev_root = NULL;
6095 	struct list_head dead_roots;
6096 	int ret;
6097 	unsigned long nr;
6098 
6099 	INIT_LIST_HEAD(&dead_roots);
6100 	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
6101 
6102 	while (!list_empty(&dead_roots)) {
6103 		reloc_root = list_entry(dead_roots.prev,
6104 					struct btrfs_root, dead_list);
6105 		list_del_init(&reloc_root->dead_list);
6106 
6107 		BUG_ON(reloc_root->commit_root != NULL);
6108 		while (1) {
6109 			trans = btrfs_join_transaction(root, 1);
6110 			BUG_ON(!trans);
6111 
6112 			mutex_lock(&root->fs_info->drop_mutex);
6113 			ret = btrfs_drop_snapshot(trans, reloc_root);
6114 			if (ret != -EAGAIN)
6115 				break;
6116 			mutex_unlock(&root->fs_info->drop_mutex);
6117 
6118 			nr = trans->blocks_used;
6119 			ret = btrfs_end_transaction(trans, root);
6120 			BUG_ON(ret);
6121 			btrfs_btree_balance_dirty(root, nr);
6122 		}
6123 
6124 		free_extent_buffer(reloc_root->node);
6125 
6126 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
6127 				     &reloc_root->root_key);
6128 		BUG_ON(ret);
6129 		mutex_unlock(&root->fs_info->drop_mutex);
6130 
6131 		nr = trans->blocks_used;
6132 		ret = btrfs_end_transaction(trans, root);
6133 		BUG_ON(ret);
6134 		btrfs_btree_balance_dirty(root, nr);
6135 
6136 		kfree(prev_root);
6137 		prev_root = reloc_root;
6138 	}
6139 	if (prev_root) {
6140 		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
6141 		kfree(prev_root);
6142 	}
6143 	return 0;
6144 }
6145 
6146 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
6147 {
6148 	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
6149 	return 0;
6150 }
6151 
6152 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
6153 {
6154 	struct btrfs_root *reloc_root;
6155 	struct btrfs_trans_handle *trans;
6156 	struct btrfs_key location;
6157 	int found;
6158 	int ret;
6159 
6160 	mutex_lock(&root->fs_info->tree_reloc_mutex);
6161 	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
6162 	BUG_ON(ret);
6163 	found = !list_empty(&root->fs_info->dead_reloc_roots);
6164 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
6165 
6166 	if (found) {
6167 		trans = btrfs_start_transaction(root, 1);
6168 		BUG_ON(!trans);
6169 		ret = btrfs_commit_transaction(trans, root);
6170 		BUG_ON(ret);
6171 	}
6172 
6173 	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6174 	location.offset = (u64)-1;
6175 	location.type = BTRFS_ROOT_ITEM_KEY;
6176 
6177 	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
6178 	BUG_ON(!reloc_root);
6179 	btrfs_orphan_cleanup(reloc_root);
6180 	return 0;
6181 }
6182 
6183 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
6184 				    struct btrfs_root *root)
6185 {
6186 	struct btrfs_root *reloc_root;
6187 	struct extent_buffer *eb;
6188 	struct btrfs_root_item *root_item;
6189 	struct btrfs_key root_key;
6190 	int ret;
6191 
6192 	BUG_ON(!root->ref_cows);
6193 	if (root->reloc_root)
6194 		return 0;
6195 
6196 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
6197 	BUG_ON(!root_item);
6198 
6199 	ret = btrfs_copy_root(trans, root, root->commit_root,
6200 			      &eb, BTRFS_TREE_RELOC_OBJECTID);
6201 	BUG_ON(ret);
6202 
6203 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
6204 	root_key.offset = root->root_key.objectid;
6205 	root_key.type = BTRFS_ROOT_ITEM_KEY;
6206 
	memcpy(root_item, &root->root_item, sizeof(*root_item));
6208 	btrfs_set_root_refs(root_item, 0);
6209 	btrfs_set_root_bytenr(root_item, eb->start);
6210 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
6211 	btrfs_set_root_generation(root_item, trans->transid);
6212 
6213 	btrfs_tree_unlock(eb);
6214 	free_extent_buffer(eb);
6215 
6216 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
6217 				&root_key, root_item);
6218 	BUG_ON(ret);
6219 	kfree(root_item);
6220 
6221 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
6222 						 &root_key);
6223 	BUG_ON(!reloc_root);
6224 	reloc_root->last_trans = trans->transid;
6225 	reloc_root->commit_root = NULL;
6226 	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
6227 
6228 	root->reloc_root = reloc_root;
6229 	return 0;
6230 }
6231 
6232 /*
6233  * Core function of space balance.
6234  *
 * The idea is to use reloc trees to relocate tree blocks in reference
 * counted roots. There is one reloc tree for each subvol, and all
 * reloc trees share the same root key objectid. Reloc trees are
 * snapshots of the latest committed roots of subvols
 * (root->commit_root).
 *
 * Relocating a tree block referenced by a subvol takes two steps:
 * COW the block through the subvol's reloc tree, then update the block
 * pointer in the subvol to point to the new block. Since all reloc
 * trees share the same root key objectid, special handling for tree
 * blocks owned by them is easy. Once a tree block has been COWed in one
 * reloc tree, we can use the resulting new block directly when the same
 * block needs to be COWed again through other reloc trees. This way,
 * relocated tree blocks are shared between reloc trees, so they are
 * also shared between subvols.
6249  */
6250 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
6251 				      struct btrfs_root *root,
6252 				      struct btrfs_path *path,
6253 				      struct btrfs_key *first_key,
6254 				      struct btrfs_ref_path *ref_path,
6255 				      struct btrfs_block_group_cache *group,
6256 				      struct inode *reloc_inode)
6257 {
6258 	struct btrfs_root *reloc_root;
6259 	struct extent_buffer *eb = NULL;
6260 	struct btrfs_key *keys;
6261 	u64 *nodes;
6262 	int level;
6263 	int shared_level;
6264 	int lowest_level = 0;
6265 	int ret;
6266 
6267 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
6268 		lowest_level = ref_path->owner_objectid;
6269 
6270 	if (!root->ref_cows) {
6271 		path->lowest_level = lowest_level;
6272 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
6273 		BUG_ON(ret < 0);
6274 		path->lowest_level = 0;
6275 		btrfs_release_path(root, path);
6276 		return 0;
6277 	}
6278 
6279 	mutex_lock(&root->fs_info->tree_reloc_mutex);
6280 	ret = init_reloc_tree(trans, root);
6281 	BUG_ON(ret);
6282 	reloc_root = root->reloc_root;
6283 
6284 	shared_level = ref_path->shared_level;
6285 	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
6286 
6287 	keys = ref_path->node_keys;
6288 	nodes = ref_path->new_nodes;
6289 	memset(&keys[shared_level + 1], 0,
6290 	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
6291 	memset(&nodes[shared_level + 1], 0,
6292 	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
6293 
6294 	if (nodes[lowest_level] == 0) {
6295 		path->lowest_level = lowest_level;
6296 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6297 					0, 1);
6298 		BUG_ON(ret);
6299 		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
6300 			eb = path->nodes[level];
6301 			if (!eb || eb == reloc_root->node)
6302 				break;
6303 			nodes[level] = eb->start;
6304 			if (level == 0)
6305 				btrfs_item_key_to_cpu(eb, &keys[level], 0);
6306 			else
6307 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
6308 		}
6309 		if (nodes[0] &&
6310 		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6311 			eb = path->nodes[0];
6312 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
6313 						      group, reloc_inode);
6314 			BUG_ON(ret);
6315 		}
6316 		btrfs_release_path(reloc_root, path);
6317 	} else {
6318 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
6319 				       lowest_level);
6320 		BUG_ON(ret);
6321 	}
6322 
6323 	/*
6324 	 * replace tree blocks in the fs tree with tree blocks in
6325 	 * the reloc tree.
6326 	 */
6327 	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
6328 	BUG_ON(ret < 0);
6329 
6330 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6331 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6332 					0, 0);
6333 		BUG_ON(ret);
6334 		extent_buffer_get(path->nodes[0]);
6335 		eb = path->nodes[0];
6336 		btrfs_release_path(reloc_root, path);
6337 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
6338 		BUG_ON(ret);
6339 		free_extent_buffer(eb);
6340 	}
6341 
6342 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
6343 	path->lowest_level = 0;
6344 	return 0;
6345 }
6346 
6347 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
6348 					struct btrfs_root *root,
6349 					struct btrfs_path *path,
6350 					struct btrfs_key *first_key,
6351 					struct btrfs_ref_path *ref_path)
6352 {
6353 	int ret;
6354 
6355 	ret = relocate_one_path(trans, root, path, first_key,
6356 				ref_path, NULL, NULL);
6357 	BUG_ON(ret);
6358 
6359 	return 0;
6360 }
6361 
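/*
 * remove the stale extent item at bytenr 0 instead of trying to
 * relocate it.
 */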
6362 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
6363 				    struct btrfs_root *extent_root,
6364 				    struct btrfs_path *path,
6365 				    struct btrfs_key *extent_key)
6366 {
6367 	int ret;
6368 
6369 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
6370 	if (ret)
6371 		goto out;
6372 	ret = btrfs_del_item(trans, extent_root, path);
6373 out:
6374 	btrfs_release_path(extent_root, path);
6375 	return ret;
6376 }
6377 
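/*
 * read the fs root that a reference path is rooted at.  cow-only roots
 * are looked up with key offset 0, everything else with (u64)-1.
 */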
6378 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
6379 						struct btrfs_ref_path *ref_path)
6380 {
6381 	struct btrfs_key root_key;
6382 
6383 	root_key.objectid = ref_path->root_objectid;
6384 	root_key.type = BTRFS_ROOT_ITEM_KEY;
6385 	if (is_cowonly_root(ref_path->root_objectid))
6386 		root_key.offset = 0;
6387 	else
6388 		root_key.offset = (u64)-1;
6389 
6390 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
6391 }
6392 
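/*
 * Relocate all users of one extent.  On pass 0 the data itself is copied
 * into @reloc_inode; pass 1 updates references through the reloc trees
 * while keeping metadata shared; later passes fall back to rewriting the
 * remaining file extent pointers directly.
 */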
6393 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
6394 					struct btrfs_path *path,
6395 					struct btrfs_key *extent_key,
6396 					struct btrfs_block_group_cache *group,
6397 					struct inode *reloc_inode, int pass)
6398 {
6399 	struct btrfs_trans_handle *trans;
6400 	struct btrfs_root *found_root;
6401 	struct btrfs_ref_path *ref_path = NULL;
6402 	struct disk_extent *new_extents = NULL;
6403 	int nr_extents = 0;
6404 	int loops;
6405 	int ret;
6406 	int level;
6407 	struct btrfs_key first_key;
6408 	u64 prev_block = 0;
6409 
6411 	trans = btrfs_start_transaction(extent_root, 1);
6412 	BUG_ON(!trans);
6413 
6414 	if (extent_key->objectid == 0) {
6415 		ret = del_extent_zero(trans, extent_root, path, extent_key);
6416 		goto out;
6417 	}
6418 
6419 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
6420 	if (!ref_path) {
6421 		ret = -ENOMEM;
6422 		goto out;
6423 	}
6424 
6425 	for (loops = 0; ; loops++) {
6426 		if (loops == 0) {
6427 			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
6428 						   extent_key->objectid);
6429 		} else {
6430 			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
6431 		}
6432 		if (ret < 0)
6433 			goto out;
6434 		if (ret > 0)
6435 			break;
6436 
6437 		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6438 		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
6439 			continue;
6440 
6441 		found_root = read_ref_root(extent_root->fs_info, ref_path);
6442 		BUG_ON(!found_root);
6443 		/*
		 * for reference counted trees, only process reference paths
		 * rooted at the latest committed root.
6446 		 */
6447 		if (found_root->ref_cows &&
6448 		    ref_path->root_generation != found_root->root_key.offset)
6449 			continue;
6450 
6451 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6452 			if (pass == 0) {
6453 				/*
6454 				 * copy data extents to new locations
6455 				 */
6456 				u64 group_start = group->key.objectid;
6457 				ret = relocate_data_extent(reloc_inode,
6458 							   extent_key,
6459 							   group_start);
6460 				if (ret < 0)
6461 					goto out;
6462 				break;
6463 			}
6464 			level = 0;
6465 		} else {
6466 			level = ref_path->owner_objectid;
6467 		}
6468 
6469 		if (prev_block != ref_path->nodes[level]) {
6470 			struct extent_buffer *eb;
6471 			u64 block_start = ref_path->nodes[level];
6472 			u64 block_size = btrfs_level_size(found_root, level);
6473 
			eb = read_tree_block(found_root, block_start,
					     block_size, 0);
			if (!eb) {
				ret = -EIO;
				goto out;
			}
			btrfs_tree_lock(eb);
6477 			BUG_ON(level != btrfs_header_level(eb));
6478 
6479 			if (level == 0)
6480 				btrfs_item_key_to_cpu(eb, &first_key, 0);
6481 			else
6482 				btrfs_node_key_to_cpu(eb, &first_key, 0);
6483 
6484 			btrfs_tree_unlock(eb);
6485 			free_extent_buffer(eb);
6486 			prev_block = block_start;
6487 		}
6488 
6489 		mutex_lock(&extent_root->fs_info->trans_mutex);
6490 		btrfs_record_root_in_trans(found_root);
6491 		mutex_unlock(&extent_root->fs_info->trans_mutex);
6492 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6493 			/*
6494 			 * try to update data extent references while
6495 			 * keeping metadata shared between snapshots.
6496 			 */
6497 			if (pass == 1) {
6498 				ret = relocate_one_path(trans, found_root,
6499 						path, &first_key, ref_path,
6500 						group, reloc_inode);
6501 				if (ret < 0)
6502 					goto out;
6503 				continue;
6504 			}
6505 			/*
			 * use the fallback method to process the remaining
			 * references.
6508 			 */
6509 			if (!new_extents) {
6510 				u64 group_start = group->key.objectid;
				new_extents = kmalloc(sizeof(*new_extents),
						      GFP_NOFS);
				if (!new_extents) {
					ret = -ENOMEM;
					goto out;
				}
				nr_extents = 1;
6514 				ret = get_new_locations(reloc_inode,
6515 							extent_key,
6516 							group_start, 1,
6517 							&new_extents,
6518 							&nr_extents);
6519 				if (ret)
6520 					goto out;
6521 			}
6522 			ret = replace_one_extent(trans, found_root,
6523 						path, extent_key,
6524 						&first_key, ref_path,
6525 						new_extents, nr_extents);
6526 		} else {
6527 			ret = relocate_tree_block(trans, found_root, path,
6528 						  &first_key, ref_path);
6529 		}
6530 		if (ret < 0)
6531 			goto out;
6532 	}
6533 	ret = 0;
6534 out:
6535 	btrfs_end_transaction(trans, extent_root);
6536 	kfree(new_extents);
6537 	kfree(ref_path);
6538 	return ret;
6539 }
6540 #endif
6541 
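/*
 * Pick the chunk profile the remaining rw devices can actually hold when
 * chunks are rewritten during relocation: with a single device, striping
 * is dropped and mirroring becomes DUP; with multiple devices, DUP
 * becomes RAID1 and plain single-device chunks become RAID0.
 */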
6542 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6543 {
6544 	u64 num_devices;
6545 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6546 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6547 
6548 	num_devices = root->fs_info->fs_devices->rw_devices;
6549 	if (num_devices == 1) {
6550 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6551 		stripped = flags & ~stripped;
6552 
6553 		/* turn raid0 into single device chunks */
6554 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
6555 			return stripped;
6556 
6557 		/* turn mirroring into duplication */
6558 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6559 			     BTRFS_BLOCK_GROUP_RAID10))
6560 			return stripped | BTRFS_BLOCK_GROUP_DUP;
6561 		return flags;
6562 	} else {
6563 		/* they already had raid on here, just return */
6564 		if (flags & stripped)
6565 			return flags;
6566 
6567 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6568 		stripped = flags & ~stripped;
6569 
6570 		/* switch duplicated blocks with raid1 */
6571 		if (flags & BTRFS_BLOCK_GROUP_DUP)
6572 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
6573 
6574 		/* turn single device chunks into raid0 */
6575 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
6576 	}
6577 	return flags;
6578 }
6579 
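/*
 * Make sure there is room outside @shrink_block_group to move its data
 * into: if the group still holds used or reserved bytes, allocate a new
 * chunk with the updated profile.
 */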
6580 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
6581 		     struct btrfs_block_group_cache *shrink_block_group,
6582 		     int force)
6583 {
6584 	struct btrfs_trans_handle *trans;
6585 	u64 new_alloc_flags;
6586 	u64 calc;
6587 
6588 	spin_lock(&shrink_block_group->lock);
6589 	if (btrfs_block_group_used(&shrink_block_group->item) +
6590 	    shrink_block_group->reserved > 0) {
6591 		spin_unlock(&shrink_block_group->lock);
6592 
6593 		trans = btrfs_start_transaction(root, 1);
6594 		spin_lock(&shrink_block_group->lock);
6595 
6596 		new_alloc_flags = update_block_group_flags(root,
6597 						   shrink_block_group->flags);
6598 		if (new_alloc_flags != shrink_block_group->flags) {
6599 			calc =
6600 			     btrfs_block_group_used(&shrink_block_group->item);
6601 		} else {
6602 			calc = shrink_block_group->key.offset;
6603 		}
6604 		spin_unlock(&shrink_block_group->lock);
6605 
6606 		do_chunk_alloc(trans, root->fs_info->extent_root,
6607 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
6608 
6609 		btrfs_end_transaction(trans, root);
6610 	} else
6611 		spin_unlock(&shrink_block_group->lock);
6612 	return 0;
6613 }
6614 
int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
					 struct btrfs_block_group_cache *group)
{
6620 	__alloc_chunk_for_shrink(root, group, 1);
6621 	set_block_group_readonly(group);
6622 	return 0;
6623 }
6624 
6625 #if 0
6626 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6627 				 struct btrfs_root *root,
6628 				 u64 objectid, u64 size)
6629 {
6630 	struct btrfs_path *path;
6631 	struct btrfs_inode_item *item;
6632 	struct extent_buffer *leaf;
6633 	int ret;
6634 
6635 	path = btrfs_alloc_path();
6636 	if (!path)
6637 		return -ENOMEM;
6638 
6639 	path->leave_spinning = 1;
6640 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6641 	if (ret)
6642 		goto out;
6643 
6644 	leaf = path->nodes[0];
6645 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6646 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6647 	btrfs_set_inode_generation(leaf, item, 1);
6648 	btrfs_set_inode_size(leaf, item, size);
6649 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6650 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6651 	btrfs_mark_buffer_dirty(leaf);
6652 	btrfs_release_path(root, path);
6653 out:
6654 	btrfs_free_path(path);
6655 	return ret;
6656 }
6657 
6658 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6659 					struct btrfs_block_group_cache *group)
6660 {
6661 	struct inode *inode = NULL;
6662 	struct btrfs_trans_handle *trans;
6663 	struct btrfs_root *root;
6664 	struct btrfs_key root_key;
6665 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
6666 	int err = 0;
6667 
6668 	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6669 	root_key.type = BTRFS_ROOT_ITEM_KEY;
6670 	root_key.offset = (u64)-1;
6671 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6672 	if (IS_ERR(root))
6673 		return ERR_CAST(root);
6674 
6675 	trans = btrfs_start_transaction(root, 1);
6676 	BUG_ON(!trans);
6677 
6678 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
6679 	if (err)
6680 		goto out;
6681 
6682 	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
6683 	BUG_ON(err);
6684 
6685 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6686 				       group->key.offset, 0, group->key.offset,
6687 				       0, 0, 0);
6688 	BUG_ON(err);
6689 
	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	if (inode && (inode->i_state & I_NEW)) {
6692 		BTRFS_I(inode)->root = root;
6693 		BTRFS_I(inode)->location.objectid = objectid;
6694 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6695 		BTRFS_I(inode)->location.offset = 0;
6696 		btrfs_read_locked_inode(inode);
6697 		unlock_new_inode(inode);
6698 		BUG_ON(is_bad_inode(inode));
6699 	} else {
6700 		BUG_ON(1);
6701 	}
6702 	BTRFS_I(inode)->index_cnt = group->key.objectid;
6703 
6704 	err = btrfs_orphan_add(trans, inode);
6705 out:
6706 	btrfs_end_transaction(trans, root);
6707 	if (err) {
6708 		if (inode)
6709 			iput(inode);
6710 		inode = ERR_PTR(err);
6711 	}
6712 	return inode;
6713 }
6714 
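/*
 * The reloc inode is laid out so that file offset + index_cnt equals the
 * old disk bytenr of the data.  Look up the checksums for the old
 * location and rebase them onto the ordered extent covering the new one.
 */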
6715 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6716 {
6718 	struct btrfs_ordered_sum *sums;
6719 	struct btrfs_sector_sum *sector_sum;
6720 	struct btrfs_ordered_extent *ordered;
6721 	struct btrfs_root *root = BTRFS_I(inode)->root;
6722 	struct list_head list;
6723 	size_t offset;
6724 	int ret;
6725 	u64 disk_bytenr;
6726 
6727 	INIT_LIST_HEAD(&list);
6728 
6729 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6730 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6731 
6732 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6733 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6734 				       disk_bytenr + len - 1, &list);
6735 
6736 	while (!list_empty(&list)) {
6737 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6738 		list_del_init(&sums->list);
6739 
6740 		sector_sum = sums->sums;
6741 		sums->bytenr = ordered->start;
6742 
6743 		offset = 0;
6744 		while (offset < sums->len) {
6745 			sector_sum->bytenr += ordered->start - disk_bytenr;
6746 			sector_sum++;
6747 			offset += root->sectorsize;
6748 		}
6749 
6750 		btrfs_add_ordered_sum(inode, ordered, sums);
6751 	}
6752 	btrfs_put_ordered_extent(ordered);
6753 	return 0;
6754 }
6755 
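/*
 * Relocate all extents in the block group at @group_start.  This runs in
 * passes: pass 0 copies data into the reloc inode, later passes update
 * the references.  If a pass only produces skipped extents after pass 2,
 * start over with a fresh reloc inode.
 */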
6756 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6757 {
6758 	struct btrfs_trans_handle *trans;
6759 	struct btrfs_path *path;
6760 	struct btrfs_fs_info *info = root->fs_info;
6761 	struct extent_buffer *leaf;
6762 	struct inode *reloc_inode;
6763 	struct btrfs_block_group_cache *block_group;
6764 	struct btrfs_key key;
6765 	u64 skipped;
6766 	u64 cur_byte;
6767 	u64 total_found;
6768 	u32 nritems;
6769 	int ret;
6770 	int progress;
6771 	int pass = 0;
6772 
6773 	root = root->fs_info->extent_root;
6774 
6775 	block_group = btrfs_lookup_block_group(info, group_start);
6776 	BUG_ON(!block_group);
6777 
6778 	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6779 	       (unsigned long long)block_group->key.objectid,
6780 	       (unsigned long long)block_group->flags);
6781 
6782 	path = btrfs_alloc_path();
6783 	BUG_ON(!path);
6784 
6785 	reloc_inode = create_reloc_inode(info, block_group);
6786 	BUG_ON(IS_ERR(reloc_inode));
6787 
6788 	__alloc_chunk_for_shrink(root, block_group, 1);
6789 	set_block_group_readonly(block_group);
6790 
6791 	btrfs_start_delalloc_inodes(info->tree_root);
6792 	btrfs_wait_ordered_extents(info->tree_root, 0);
6793 again:
6794 	skipped = 0;
6795 	total_found = 0;
6796 	progress = 0;
6797 	key.objectid = block_group->key.objectid;
6798 	key.offset = 0;
6799 	key.type = 0;
6800 	cur_byte = key.objectid;
6801 
6802 	trans = btrfs_start_transaction(info->tree_root, 1);
6803 	btrfs_commit_transaction(trans, info->tree_root);
6804 
6805 	mutex_lock(&root->fs_info->cleaner_mutex);
6806 	btrfs_clean_old_snapshots(info->tree_root);
6807 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6808 	mutex_unlock(&root->fs_info->cleaner_mutex);
6809 
6810 	trans = btrfs_start_transaction(info->tree_root, 1);
6811 	btrfs_commit_transaction(trans, info->tree_root);
6812 
6813 	while (1) {
6814 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6815 		if (ret < 0)
6816 			goto out;
6817 next:
6818 		leaf = path->nodes[0];
6819 		nritems = btrfs_header_nritems(leaf);
6820 		if (path->slots[0] >= nritems) {
6821 			ret = btrfs_next_leaf(root, path);
6822 			if (ret < 0)
6823 				goto out;
6824 			if (ret == 1) {
6825 				ret = 0;
6826 				break;
6827 			}
6828 			leaf = path->nodes[0];
6829 			nritems = btrfs_header_nritems(leaf);
6830 		}
6831 
6832 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6833 
6834 		if (key.objectid >= block_group->key.objectid +
6835 		    block_group->key.offset)
6836 			break;
6837 
6838 		if (progress && need_resched()) {
6839 			btrfs_release_path(root, path);
6840 			cond_resched();
6841 			progress = 0;
6842 			continue;
6843 		}
6844 		progress = 1;
6845 
6846 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
6847 		    key.objectid + key.offset <= cur_byte) {
6848 			path->slots[0]++;
6849 			goto next;
6850 		}
6851 
6852 		total_found++;
6853 		cur_byte = key.objectid + key.offset;
6854 		btrfs_release_path(root, path);
6855 
6856 		__alloc_chunk_for_shrink(root, block_group, 0);
6857 		ret = relocate_one_extent(root, path, &key, block_group,
6858 					  reloc_inode, pass);
6859 		BUG_ON(ret < 0);
6860 		if (ret > 0)
6861 			skipped++;
6862 
6863 		key.objectid = cur_byte;
6864 		key.type = 0;
6865 		key.offset = 0;
6866 	}
6867 
6868 	btrfs_release_path(root, path);
6869 
6870 	if (pass == 0) {
6871 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
6872 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
6873 	}
6874 
6875 	if (total_found > 0) {
6876 		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
6877 		       (unsigned long long)total_found, pass);
6878 		pass++;
6879 		if (total_found == skipped && pass > 2) {
			iput(reloc_inode);
			reloc_inode = create_reloc_inode(info, block_group);
			BUG_ON(IS_ERR(reloc_inode));
			pass = 0;
6883 		}
6884 		goto again;
6885 	}
6886 
6887 	/* delete reloc_inode */
6888 	iput(reloc_inode);
6889 
6890 	/* unpin extents in this range */
6891 	trans = btrfs_start_transaction(info->tree_root, 1);
6892 	btrfs_commit_transaction(trans, info->tree_root);
6893 
6894 	spin_lock(&block_group->lock);
6895 	WARN_ON(block_group->pinned > 0);
6896 	WARN_ON(block_group->reserved > 0);
6897 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6898 	spin_unlock(&block_group->lock);
6899 	btrfs_put_block_group(block_group);
6900 	ret = 0;
6901 out:
6902 	btrfs_free_path(path);
6903 	return ret;
6904 }
6905 #endif
6906 
6907 static int find_first_block_group(struct btrfs_root *root,
6908 		struct btrfs_path *path, struct btrfs_key *key)
6909 {
6910 	int ret = 0;
6911 	struct btrfs_key found_key;
6912 	struct extent_buffer *leaf;
6913 	int slot;
6914 
6915 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
6916 	if (ret < 0)
6917 		goto out;
6918 
6919 	while (1) {
6920 		slot = path->slots[0];
6921 		leaf = path->nodes[0];
6922 		if (slot >= btrfs_header_nritems(leaf)) {
6923 			ret = btrfs_next_leaf(root, path);
6924 			if (ret == 0)
6925 				continue;
6926 			if (ret < 0)
6927 				goto out;
6928 			break;
6929 		}
6930 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6931 
6932 		if (found_key.objectid >= key->objectid &&
6933 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6934 			ret = 0;
6935 			goto out;
6936 		}
6937 		path->slots[0]++;
6938 	}
6939 	ret = -ENOENT;
6940 out:
6941 	return ret;
6942 }
6943 
6944 int btrfs_free_block_groups(struct btrfs_fs_info *info)
6945 {
6946 	struct btrfs_block_group_cache *block_group;
6947 	struct btrfs_space_info *space_info;
6948 	struct rb_node *n;
6949 
6950 	spin_lock(&info->block_group_cache_lock);
6951 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
6952 		block_group = rb_entry(n, struct btrfs_block_group_cache,
6953 				       cache_node);
6954 		rb_erase(&block_group->cache_node,
6955 			 &info->block_group_cache_tree);
6956 		spin_unlock(&info->block_group_cache_lock);
6957 
6958 		btrfs_remove_free_space_cache(block_group);
6959 		down_write(&block_group->space_info->groups_sem);
6960 		list_del(&block_group->list);
6961 		up_write(&block_group->space_info->groups_sem);
6962 
6963 		WARN_ON(atomic_read(&block_group->count) != 1);
6964 		kfree(block_group);
6965 
6966 		spin_lock(&info->block_group_cache_lock);
6967 	}
6968 	spin_unlock(&info->block_group_cache_lock);
6969 
6970 	/* now that all the block groups are freed, go through and
6971 	 * free all the space_info structs.  This is only called during
6972 	 * the final stages of unmount, and so we know nobody is
6973 	 * using them.  We call synchronize_rcu() once before we start,
6974 	 * just to be on the safe side.
6975 	 */
6976 	synchronize_rcu();
6977 
	while (!list_empty(&info->space_info)) {
6979 		space_info = list_entry(info->space_info.next,
6980 					struct btrfs_space_info,
6981 					list);
6982 
6983 		list_del(&space_info->list);
6984 		kfree(space_info);
6985 	}
6986 	return 0;
6987 }
6988 
6989 int btrfs_read_block_groups(struct btrfs_root *root)
6990 {
6991 	struct btrfs_path *path;
6992 	int ret;
6993 	struct btrfs_block_group_cache *cache;
6994 	struct btrfs_fs_info *info = root->fs_info;
6995 	struct btrfs_space_info *space_info;
6996 	struct btrfs_key key;
6997 	struct btrfs_key found_key;
6998 	struct extent_buffer *leaf;
6999 
7000 	root = info->extent_root;
7001 	key.objectid = 0;
7002 	key.offset = 0;
7003 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7004 	path = btrfs_alloc_path();
7005 	if (!path)
7006 		return -ENOMEM;
7007 
7008 	while (1) {
7009 		ret = find_first_block_group(root, path, &key);
7010 		if (ret > 0) {
7011 			ret = 0;
7012 			goto error;
7013 		}
7014 		if (ret != 0)
7015 			goto error;
7016 
7017 		leaf = path->nodes[0];
7018 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7019 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7020 		if (!cache) {
7021 			ret = -ENOMEM;
7022 			break;
7023 		}
7024 
7025 		atomic_set(&cache->count, 1);
7026 		spin_lock_init(&cache->lock);
7027 		spin_lock_init(&cache->tree_lock);
7028 		mutex_init(&cache->cache_mutex);
7029 		INIT_LIST_HEAD(&cache->list);
7030 		INIT_LIST_HEAD(&cache->cluster_list);
7031 		read_extent_buffer(leaf, &cache->item,
7032 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7033 				   sizeof(cache->item));
7034 		memcpy(&cache->key, &found_key, sizeof(found_key));
7035 
7036 		key.objectid = found_key.objectid + found_key.offset;
7037 		btrfs_release_path(root, path);
7038 		cache->flags = btrfs_block_group_flags(&cache->item);
7039 
7040 		ret = update_space_info(info, cache->flags, found_key.offset,
7041 					btrfs_block_group_used(&cache->item),
7042 					&space_info);
7043 		BUG_ON(ret);
7044 		cache->space_info = space_info;
7045 		down_write(&space_info->groups_sem);
7046 		list_add_tail(&cache->list, &space_info->block_groups);
7047 		up_write(&space_info->groups_sem);
7048 
7049 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7050 		BUG_ON(ret);
7051 
7052 		set_avail_alloc_bits(root->fs_info, cache->flags);
7053 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7054 			set_block_group_readonly(cache);
7055 	}
7056 	ret = 0;
7057 error:
7058 	btrfs_free_path(path);
7059 	return ret;
7060 }
7061 
7062 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7063 			   struct btrfs_root *root, u64 bytes_used,
7064 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7065 			   u64 size)
7066 {
7067 	int ret;
7068 	struct btrfs_root *extent_root;
7069 	struct btrfs_block_group_cache *cache;
7070 
7071 	extent_root = root->fs_info->extent_root;
7072 
7073 	root->fs_info->last_trans_log_full_commit = trans->transid;
7074 
7075 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7076 	if (!cache)
7077 		return -ENOMEM;
7078 
7079 	cache->key.objectid = chunk_offset;
7080 	cache->key.offset = size;
7081 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7082 	atomic_set(&cache->count, 1);
7083 	spin_lock_init(&cache->lock);
7084 	spin_lock_init(&cache->tree_lock);
7085 	mutex_init(&cache->cache_mutex);
7086 	INIT_LIST_HEAD(&cache->list);
7087 	INIT_LIST_HEAD(&cache->cluster_list);
7088 
7089 	btrfs_set_block_group_used(&cache->item, bytes_used);
7090 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7091 	cache->flags = type;
7092 	btrfs_set_block_group_flags(&cache->item, type);
7093 
7094 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7095 				&cache->space_info);
7096 	BUG_ON(ret);
7097 	down_write(&cache->space_info->groups_sem);
7098 	list_add_tail(&cache->list, &cache->space_info->block_groups);
7099 	up_write(&cache->space_info->groups_sem);
7100 
7101 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7102 	BUG_ON(ret);
7103 
7104 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7105 				sizeof(cache->item));
7106 	BUG_ON(ret);
7107 
7108 	set_avail_alloc_bits(extent_root->fs_info, type);
7109 
7110 	return 0;
7111 }
7112 
7113 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7114 			     struct btrfs_root *root, u64 group_start)
7115 {
7116 	struct btrfs_path *path;
7117 	struct btrfs_block_group_cache *block_group;
7118 	struct btrfs_free_cluster *cluster;
7119 	struct btrfs_key key;
7120 	int ret;
7121 
7122 	root = root->fs_info->extent_root;
7123 
7124 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7125 	BUG_ON(!block_group);
7126 	BUG_ON(!block_group->ro);
7127 
7128 	memcpy(&key, &block_group->key, sizeof(key));
7129 
7130 	/* make sure this block group isn't part of an allocation cluster */
7131 	cluster = &root->fs_info->data_alloc_cluster;
7132 	spin_lock(&cluster->refill_lock);
7133 	btrfs_return_cluster_to_free_space(block_group, cluster);
7134 	spin_unlock(&cluster->refill_lock);
7135 
7136 	/*
7137 	 * make sure this block group isn't part of a metadata
7138 	 * allocation cluster
7139 	 */
7140 	cluster = &root->fs_info->meta_alloc_cluster;
7141 	spin_lock(&cluster->refill_lock);
7142 	btrfs_return_cluster_to_free_space(block_group, cluster);
7143 	spin_unlock(&cluster->refill_lock);
7144 
7145 	path = btrfs_alloc_path();
7146 	BUG_ON(!path);
7147 
7148 	spin_lock(&root->fs_info->block_group_cache_lock);
7149 	rb_erase(&block_group->cache_node,
7150 		 &root->fs_info->block_group_cache_tree);
7151 	spin_unlock(&root->fs_info->block_group_cache_lock);
7152 	btrfs_remove_free_space_cache(block_group);
7153 	down_write(&block_group->space_info->groups_sem);
7154 	/*
7155 	 * we must use list_del_init so people can check to see if they
7156 	 * are still on the list after taking the semaphore
7157 	 */
7158 	list_del_init(&block_group->list);
7159 	up_write(&block_group->space_info->groups_sem);
7160 
7161 	spin_lock(&block_group->space_info->lock);
7162 	block_group->space_info->total_bytes -= block_group->key.offset;
7163 	block_group->space_info->bytes_readonly -= block_group->key.offset;
7164 	spin_unlock(&block_group->space_info->lock);
7165 	block_group->space_info->full = 0;
7166 
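	/*
	 * drop both references: the one taken by the lookup above and the
	 * one that was held by the block group cache tree.
	 */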
7167 	btrfs_put_block_group(block_group);
7168 	btrfs_put_block_group(block_group);
7169 
7170 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7171 	if (ret > 0)
7172 		ret = -EIO;
7173 	if (ret < 0)
7174 		goto out;
7175 
7176 	ret = btrfs_del_item(trans, root, path);
7177 out:
7178 	btrfs_free_path(path);
7179 	return ret;
7180 }
7181