xref: /linux/fs/btrfs/extent-tree.c (revision fea88a0c02822fbb91a0b8301bf9af04377876a3)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 
37 /*
38  * control flags for do_chunk_alloc's force field
39  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
40  * if we really need one.
41  *
42  * CHUNK_ALLOC_LIMITED means to only try and allocate one
43  * if we have very few chunks already allocated.  This is
44  * used as part of the clustering code to help make sure
45  * we have a good pool of storage to cluster in, without
46  * filling the FS with empty chunks
47  *
48  * CHUNK_ALLOC_FORCE means it must try to allocate one
49  *
50  */
51 enum {
52 	CHUNK_ALLOC_NO_FORCE = 0,
53 	CHUNK_ALLOC_LIMITED = 1,
54 	CHUNK_ALLOC_FORCE = 2,
55 };
56 
57 /*
58  * Control how reservations are dealt with.
59  *
60  * RESERVE_FREE - freeing a reservation.
61  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
62  *   ENOSPC accounting
63  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
64  *   bytes_may_use as the ENOSPC accounting is done elsewhere
65  */
66 enum {
67 	RESERVE_FREE = 0,
68 	RESERVE_ALLOC = 1,
69 	RESERVE_ALLOC_NO_ACCOUNT = 2,
70 };
71 
72 static int update_block_group(struct btrfs_trans_handle *trans,
73 			      struct btrfs_root *root,
74 			      u64 bytenr, u64 num_bytes, int alloc);
75 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
76 				struct btrfs_root *root,
77 				u64 bytenr, u64 num_bytes, u64 parent,
78 				u64 root_objectid, u64 owner_objectid,
79 				u64 owner_offset, int refs_to_drop,
80 				struct btrfs_delayed_extent_op *extra_op);
81 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
82 				    struct extent_buffer *leaf,
83 				    struct btrfs_extent_item *ei);
84 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
85 				      struct btrfs_root *root,
86 				      u64 parent, u64 root_objectid,
87 				      u64 flags, u64 owner, u64 offset,
88 				      struct btrfs_key *ins, int ref_mod);
89 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
90 				     struct btrfs_root *root,
91 				     u64 parent, u64 root_objectid,
92 				     u64 flags, struct btrfs_disk_key *key,
93 				     int level, struct btrfs_key *ins);
94 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
95 			  struct btrfs_root *extent_root, u64 alloc_bytes,
96 			  u64 flags, int force);
97 static int find_next_key(struct btrfs_path *path, int level,
98 			 struct btrfs_key *key);
99 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
100 			    int dump_block_groups);
101 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
102 				       u64 num_bytes, int reserve);
103 
104 static noinline int
105 block_group_cache_done(struct btrfs_block_group_cache *cache)
106 {
107 	smp_mb();
108 	return cache->cached == BTRFS_CACHE_FINISHED;
109 }
110 
111 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
112 {
113 	return (cache->flags & bits) == bits;
114 }
115 
116 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
117 {
118 	atomic_inc(&cache->count);
119 }
120 
121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
122 {
123 	if (atomic_dec_and_test(&cache->count)) {
124 		WARN_ON(cache->pinned > 0);
125 		WARN_ON(cache->reserved > 0);
126 		kfree(cache->free_space_ctl);
127 		kfree(cache);
128 	}
129 }
130 
131 /*
132  * this adds the block group to the fs_info rb tree for the block group
133  * cache
134  */
135 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
136 				struct btrfs_block_group_cache *block_group)
137 {
138 	struct rb_node **p;
139 	struct rb_node *parent = NULL;
140 	struct btrfs_block_group_cache *cache;
141 
142 	spin_lock(&info->block_group_cache_lock);
143 	p = &info->block_group_cache_tree.rb_node;
144 
145 	while (*p) {
146 		parent = *p;
147 		cache = rb_entry(parent, struct btrfs_block_group_cache,
148 				 cache_node);
149 		if (block_group->key.objectid < cache->key.objectid) {
150 			p = &(*p)->rb_left;
151 		} else if (block_group->key.objectid > cache->key.objectid) {
152 			p = &(*p)->rb_right;
153 		} else {
154 			spin_unlock(&info->block_group_cache_lock);
155 			return -EEXIST;
156 		}
157 	}
158 
159 	rb_link_node(&block_group->cache_node, parent, p);
160 	rb_insert_color(&block_group->cache_node,
161 			&info->block_group_cache_tree);
162 	spin_unlock(&info->block_group_cache_lock);
163 
164 	return 0;
165 }
166 
167 /*
168  * This will return the block group at or after bytenr if contains is 0, else
169  * it will return the block group that contains the bytenr
170  */
171 static struct btrfs_block_group_cache *
172 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
173 			      int contains)
174 {
175 	struct btrfs_block_group_cache *cache, *ret = NULL;
176 	struct rb_node *n;
177 	u64 end, start;
178 
179 	spin_lock(&info->block_group_cache_lock);
180 	n = info->block_group_cache_tree.rb_node;
181 
182 	while (n) {
183 		cache = rb_entry(n, struct btrfs_block_group_cache,
184 				 cache_node);
185 		end = cache->key.objectid + cache->key.offset - 1;
186 		start = cache->key.objectid;
187 
188 		if (bytenr < start) {
189 			if (!contains && (!ret || start < ret->key.objectid))
190 				ret = cache;
191 			n = n->rb_left;
192 		} else if (bytenr > start) {
193 			if (contains && bytenr <= end) {
194 				ret = cache;
195 				break;
196 			}
197 			n = n->rb_right;
198 		} else {
199 			ret = cache;
200 			break;
201 		}
202 	}
203 	if (ret)
204 		btrfs_get_block_group(ret);
205 	spin_unlock(&info->block_group_cache_lock);
206 
207 	return ret;
208 }
209 
210 static int add_excluded_extent(struct btrfs_root *root,
211 			       u64 start, u64 num_bytes)
212 {
213 	u64 end = start + num_bytes - 1;
214 	set_extent_bits(&root->fs_info->freed_extents[0],
215 			start, end, EXTENT_UPTODATE, GFP_NOFS);
216 	set_extent_bits(&root->fs_info->freed_extents[1],
217 			start, end, EXTENT_UPTODATE, GFP_NOFS);
218 	return 0;
219 }
220 
221 static void free_excluded_extents(struct btrfs_root *root,
222 				  struct btrfs_block_group_cache *cache)
223 {
224 	u64 start, end;
225 
226 	start = cache->key.objectid;
227 	end = start + cache->key.offset - 1;
228 
229 	clear_extent_bits(&root->fs_info->freed_extents[0],
230 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
231 	clear_extent_bits(&root->fs_info->freed_extents[1],
232 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
233 }
234 
235 static int exclude_super_stripes(struct btrfs_root *root,
236 				 struct btrfs_block_group_cache *cache)
237 {
238 	u64 bytenr;
239 	u64 *logical;
240 	int stripe_len;
241 	int i, nr, ret;
242 
243 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
244 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
245 		cache->bytes_super += stripe_len;
246 		ret = add_excluded_extent(root, cache->key.objectid,
247 					  stripe_len);
248 		BUG_ON(ret); /* -ENOMEM */
249 	}
250 
251 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
252 		bytenr = btrfs_sb_offset(i);
253 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
254 				       cache->key.objectid, bytenr,
255 				       0, &logical, &nr, &stripe_len);
256 		BUG_ON(ret); /* -ENOMEM */
257 
258 		while (nr--) {
259 			cache->bytes_super += stripe_len;
260 			ret = add_excluded_extent(root, logical[nr],
261 						  stripe_len);
262 			BUG_ON(ret); /* -ENOMEM */
263 		}
264 
265 		kfree(logical);
266 	}
267 	return 0;
268 }
269 
270 static struct btrfs_caching_control *
271 get_caching_control(struct btrfs_block_group_cache *cache)
272 {
273 	struct btrfs_caching_control *ctl;
274 
275 	spin_lock(&cache->lock);
276 	if (cache->cached != BTRFS_CACHE_STARTED) {
277 		spin_unlock(&cache->lock);
278 		return NULL;
279 	}
280 
281 	/* We're loading it the fast way, so we don't have a caching_ctl. */
282 	if (!cache->caching_ctl) {
283 		spin_unlock(&cache->lock);
284 		return NULL;
285 	}
286 
287 	ctl = cache->caching_ctl;
288 	atomic_inc(&ctl->count);
289 	spin_unlock(&cache->lock);
290 	return ctl;
291 }
292 
293 static void put_caching_control(struct btrfs_caching_control *ctl)
294 {
295 	if (atomic_dec_and_test(&ctl->count))
296 		kfree(ctl);
297 }
298 
299 /*
300  * this is only called by cache_block_group, since we could have freed extents
301  * we need to check the pinned_extents for any extents that can't be used yet
302  * since their free space will be released as soon as the transaction commits.
303  */
304 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
305 			      struct btrfs_fs_info *info, u64 start, u64 end)
306 {
307 	u64 extent_start, extent_end, size, total_added = 0;
308 	int ret;
309 
310 	while (start < end) {
311 		ret = find_first_extent_bit(info->pinned_extents, start,
312 					    &extent_start, &extent_end,
313 					    EXTENT_DIRTY | EXTENT_UPTODATE);
314 		if (ret)
315 			break;
316 
317 		if (extent_start <= start) {
318 			start = extent_end + 1;
319 		} else if (extent_start > start && extent_start < end) {
320 			size = extent_start - start;
321 			total_added += size;
322 			ret = btrfs_add_free_space(block_group, start,
323 						   size);
324 			BUG_ON(ret); /* -ENOMEM or logic error */
325 			start = extent_end + 1;
326 		} else {
327 			break;
328 		}
329 	}
330 
331 	if (start < end) {
332 		size = end - start;
333 		total_added += size;
334 		ret = btrfs_add_free_space(block_group, start, size);
335 		BUG_ON(ret); /* -ENOMEM or logic error */
336 	}
337 
338 	return total_added;
339 }
340 
341 static noinline void caching_thread(struct btrfs_work *work)
342 {
343 	struct btrfs_block_group_cache *block_group;
344 	struct btrfs_fs_info *fs_info;
345 	struct btrfs_caching_control *caching_ctl;
346 	struct btrfs_root *extent_root;
347 	struct btrfs_path *path;
348 	struct extent_buffer *leaf;
349 	struct btrfs_key key;
350 	u64 total_found = 0;
351 	u64 last = 0;
352 	u32 nritems;
353 	int ret = 0;
354 
355 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
356 	block_group = caching_ctl->block_group;
357 	fs_info = block_group->fs_info;
358 	extent_root = fs_info->extent_root;
359 
360 	path = btrfs_alloc_path();
361 	if (!path)
362 		goto out;
363 
364 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
365 
366 	/*
367 	 * We don't want to deadlock with somebody trying to allocate a new
368 	 * extent for the extent root while also trying to search the extent
369 	 * root to add free space.  So we skip locking and search the commit
370 	 * root, since its read-only
371 	 */
372 	path->skip_locking = 1;
373 	path->search_commit_root = 1;
374 	path->reada = 1;
375 
376 	key.objectid = last;
377 	key.offset = 0;
378 	key.type = BTRFS_EXTENT_ITEM_KEY;
379 again:
380 	mutex_lock(&caching_ctl->mutex);
381 	/* need to make sure the commit_root doesn't disappear */
382 	down_read(&fs_info->extent_commit_sem);
383 
384 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
385 	if (ret < 0)
386 		goto err;
387 
388 	leaf = path->nodes[0];
389 	nritems = btrfs_header_nritems(leaf);
390 
391 	while (1) {
392 		if (btrfs_fs_closing(fs_info) > 1) {
393 			last = (u64)-1;
394 			break;
395 		}
396 
397 		if (path->slots[0] < nritems) {
398 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
399 		} else {
400 			ret = find_next_key(path, 0, &key);
401 			if (ret)
402 				break;
403 
404 			if (need_resched() ||
405 			    btrfs_next_leaf(extent_root, path)) {
406 				caching_ctl->progress = last;
407 				btrfs_release_path(path);
408 				up_read(&fs_info->extent_commit_sem);
409 				mutex_unlock(&caching_ctl->mutex);
410 				cond_resched();
411 				goto again;
412 			}
413 			leaf = path->nodes[0];
414 			nritems = btrfs_header_nritems(leaf);
415 			continue;
416 		}
417 
418 		if (key.objectid < block_group->key.objectid) {
419 			path->slots[0]++;
420 			continue;
421 		}
422 
423 		if (key.objectid >= block_group->key.objectid +
424 		    block_group->key.offset)
425 			break;
426 
427 		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
428 			total_found += add_new_free_space(block_group,
429 							  fs_info, last,
430 							  key.objectid);
431 			last = key.objectid + key.offset;
432 
433 			if (total_found > (1024 * 1024 * 2)) {
434 				total_found = 0;
435 				wake_up(&caching_ctl->wait);
436 			}
437 		}
438 		path->slots[0]++;
439 	}
440 	ret = 0;
441 
442 	total_found += add_new_free_space(block_group, fs_info, last,
443 					  block_group->key.objectid +
444 					  block_group->key.offset);
445 	caching_ctl->progress = (u64)-1;
446 
447 	spin_lock(&block_group->lock);
448 	block_group->caching_ctl = NULL;
449 	block_group->cached = BTRFS_CACHE_FINISHED;
450 	spin_unlock(&block_group->lock);
451 
452 err:
453 	btrfs_free_path(path);
454 	up_read(&fs_info->extent_commit_sem);
455 
456 	free_excluded_extents(extent_root, block_group);
457 
458 	mutex_unlock(&caching_ctl->mutex);
459 out:
460 	wake_up(&caching_ctl->wait);
461 
462 	put_caching_control(caching_ctl);
463 	btrfs_put_block_group(block_group);
464 }
465 
466 static int cache_block_group(struct btrfs_block_group_cache *cache,
467 			     struct btrfs_trans_handle *trans,
468 			     struct btrfs_root *root,
469 			     int load_cache_only)
470 {
471 	DEFINE_WAIT(wait);
472 	struct btrfs_fs_info *fs_info = cache->fs_info;
473 	struct btrfs_caching_control *caching_ctl;
474 	int ret = 0;
475 
476 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
477 	if (!caching_ctl)
478 		return -ENOMEM;
479 
480 	INIT_LIST_HEAD(&caching_ctl->list);
481 	mutex_init(&caching_ctl->mutex);
482 	init_waitqueue_head(&caching_ctl->wait);
483 	caching_ctl->block_group = cache;
484 	caching_ctl->progress = cache->key.objectid;
485 	atomic_set(&caching_ctl->count, 1);
486 	caching_ctl->work.func = caching_thread;
487 
488 	spin_lock(&cache->lock);
489 	/*
490 	 * This should be a rare occasion, but this could happen I think in the
491 	 * case where one thread starts to load the space cache info, and then
492 	 * some other thread starts a transaction commit which tries to do an
493 	 * allocation while the other thread is still loading the space cache
494 	 * info.  The previous loop should have kept us from choosing this block
495 	 * group, but if we've moved to the state where we will wait on caching
496 	 * block groups we need to first check if we're doing a fast load here,
497 	 * so we can wait for it to finish, otherwise we could end up allocating
498 	 * from a block group who's cache gets evicted for one reason or
499 	 * another.
500 	 */
501 	while (cache->cached == BTRFS_CACHE_FAST) {
502 		struct btrfs_caching_control *ctl;
503 
504 		ctl = cache->caching_ctl;
505 		atomic_inc(&ctl->count);
506 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
507 		spin_unlock(&cache->lock);
508 
509 		schedule();
510 
511 		finish_wait(&ctl->wait, &wait);
512 		put_caching_control(ctl);
513 		spin_lock(&cache->lock);
514 	}
515 
516 	if (cache->cached != BTRFS_CACHE_NO) {
517 		spin_unlock(&cache->lock);
518 		kfree(caching_ctl);
519 		return 0;
520 	}
521 	WARN_ON(cache->caching_ctl);
522 	cache->caching_ctl = caching_ctl;
523 	cache->cached = BTRFS_CACHE_FAST;
524 	spin_unlock(&cache->lock);
525 
526 	/*
527 	 * We can't do the read from on-disk cache during a commit since we need
528 	 * to have the normal tree locking.  Also if we are currently trying to
529 	 * allocate blocks for the tree root we can't do the fast caching since
530 	 * we likely hold important locks.
531 	 */
532 	if (trans && (!trans->transaction->in_commit) &&
533 	    (root && root != root->fs_info->tree_root) &&
534 	    btrfs_test_opt(root, SPACE_CACHE)) {
535 		ret = load_free_space_cache(fs_info, cache);
536 
537 		spin_lock(&cache->lock);
538 		if (ret == 1) {
539 			cache->caching_ctl = NULL;
540 			cache->cached = BTRFS_CACHE_FINISHED;
541 			cache->last_byte_to_unpin = (u64)-1;
542 		} else {
543 			if (load_cache_only) {
544 				cache->caching_ctl = NULL;
545 				cache->cached = BTRFS_CACHE_NO;
546 			} else {
547 				cache->cached = BTRFS_CACHE_STARTED;
548 			}
549 		}
550 		spin_unlock(&cache->lock);
551 		wake_up(&caching_ctl->wait);
552 		if (ret == 1) {
553 			put_caching_control(caching_ctl);
554 			free_excluded_extents(fs_info->extent_root, cache);
555 			return 0;
556 		}
557 	} else {
558 		/*
559 		 * We are not going to do the fast caching, set cached to the
560 		 * appropriate value and wakeup any waiters.
561 		 */
562 		spin_lock(&cache->lock);
563 		if (load_cache_only) {
564 			cache->caching_ctl = NULL;
565 			cache->cached = BTRFS_CACHE_NO;
566 		} else {
567 			cache->cached = BTRFS_CACHE_STARTED;
568 		}
569 		spin_unlock(&cache->lock);
570 		wake_up(&caching_ctl->wait);
571 	}
572 
573 	if (load_cache_only) {
574 		put_caching_control(caching_ctl);
575 		return 0;
576 	}
577 
578 	down_write(&fs_info->extent_commit_sem);
579 	atomic_inc(&caching_ctl->count);
580 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
581 	up_write(&fs_info->extent_commit_sem);
582 
583 	btrfs_get_block_group(cache);
584 
585 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
586 
587 	return ret;
588 }
589 
590 /*
591  * return the block group that starts at or after bytenr
592  */
593 static struct btrfs_block_group_cache *
594 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
595 {
596 	struct btrfs_block_group_cache *cache;
597 
598 	cache = block_group_cache_tree_search(info, bytenr, 0);
599 
600 	return cache;
601 }
602 
603 /*
604  * return the block group that contains the given bytenr
605  */
606 struct btrfs_block_group_cache *btrfs_lookup_block_group(
607 						 struct btrfs_fs_info *info,
608 						 u64 bytenr)
609 {
610 	struct btrfs_block_group_cache *cache;
611 
612 	cache = block_group_cache_tree_search(info, bytenr, 1);
613 
614 	return cache;
615 }
616 
617 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 						  u64 flags)
619 {
620 	struct list_head *head = &info->space_info;
621 	struct btrfs_space_info *found;
622 
623 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
624 
625 	rcu_read_lock();
626 	list_for_each_entry_rcu(found, head, list) {
627 		if (found->flags & flags) {
628 			rcu_read_unlock();
629 			return found;
630 		}
631 	}
632 	rcu_read_unlock();
633 	return NULL;
634 }
635 
636 /*
637  * after adding space to the filesystem, we need to clear the full flags
638  * on all the space infos.
639  */
640 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
641 {
642 	struct list_head *head = &info->space_info;
643 	struct btrfs_space_info *found;
644 
645 	rcu_read_lock();
646 	list_for_each_entry_rcu(found, head, list)
647 		found->full = 0;
648 	rcu_read_unlock();
649 }
650 
651 static u64 div_factor(u64 num, int factor)
652 {
653 	if (factor == 10)
654 		return num;
655 	num *= factor;
656 	do_div(num, 10);
657 	return num;
658 }
659 
660 static u64 div_factor_fine(u64 num, int factor)
661 {
662 	if (factor == 100)
663 		return num;
664 	num *= factor;
665 	do_div(num, 100);
666 	return num;
667 }
668 
669 u64 btrfs_find_block_group(struct btrfs_root *root,
670 			   u64 search_start, u64 search_hint, int owner)
671 {
672 	struct btrfs_block_group_cache *cache;
673 	u64 used;
674 	u64 last = max(search_hint, search_start);
675 	u64 group_start = 0;
676 	int full_search = 0;
677 	int factor = 9;
678 	int wrapped = 0;
679 again:
680 	while (1) {
681 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
682 		if (!cache)
683 			break;
684 
685 		spin_lock(&cache->lock);
686 		last = cache->key.objectid + cache->key.offset;
687 		used = btrfs_block_group_used(&cache->item);
688 
689 		if ((full_search || !cache->ro) &&
690 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
691 			if (used + cache->pinned + cache->reserved <
692 			    div_factor(cache->key.offset, factor)) {
693 				group_start = cache->key.objectid;
694 				spin_unlock(&cache->lock);
695 				btrfs_put_block_group(cache);
696 				goto found;
697 			}
698 		}
699 		spin_unlock(&cache->lock);
700 		btrfs_put_block_group(cache);
701 		cond_resched();
702 	}
703 	if (!wrapped) {
704 		last = search_start;
705 		wrapped = 1;
706 		goto again;
707 	}
708 	if (!full_search && factor < 10) {
709 		last = search_start;
710 		full_search = 1;
711 		factor = 10;
712 		goto again;
713 	}
714 found:
715 	return group_start;
716 }
717 
718 /* simple helper to search for an existing extent at a given offset */
719 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
720 {
721 	int ret;
722 	struct btrfs_key key;
723 	struct btrfs_path *path;
724 
725 	path = btrfs_alloc_path();
726 	if (!path)
727 		return -ENOMEM;
728 
729 	key.objectid = start;
730 	key.offset = len;
731 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
732 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
733 				0, 0);
734 	btrfs_free_path(path);
735 	return ret;
736 }
737 
738 /*
739  * helper function to lookup reference count and flags of extent.
740  *
741  * the head node for delayed ref is used to store the sum of all the
742  * reference count modifications queued up in the rbtree. the head
743  * node may also store the extent flags to set. This way you can check
744  * to see what the reference count and extent flags would be if all of
745  * the delayed refs are not processed.
746  */
747 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
748 			     struct btrfs_root *root, u64 bytenr,
749 			     u64 num_bytes, u64 *refs, u64 *flags)
750 {
751 	struct btrfs_delayed_ref_head *head;
752 	struct btrfs_delayed_ref_root *delayed_refs;
753 	struct btrfs_path *path;
754 	struct btrfs_extent_item *ei;
755 	struct extent_buffer *leaf;
756 	struct btrfs_key key;
757 	u32 item_size;
758 	u64 num_refs;
759 	u64 extent_flags;
760 	int ret;
761 
762 	path = btrfs_alloc_path();
763 	if (!path)
764 		return -ENOMEM;
765 
766 	key.objectid = bytenr;
767 	key.type = BTRFS_EXTENT_ITEM_KEY;
768 	key.offset = num_bytes;
769 	if (!trans) {
770 		path->skip_locking = 1;
771 		path->search_commit_root = 1;
772 	}
773 again:
774 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
775 				&key, path, 0, 0);
776 	if (ret < 0)
777 		goto out_free;
778 
779 	if (ret == 0) {
780 		leaf = path->nodes[0];
781 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
782 		if (item_size >= sizeof(*ei)) {
783 			ei = btrfs_item_ptr(leaf, path->slots[0],
784 					    struct btrfs_extent_item);
785 			num_refs = btrfs_extent_refs(leaf, ei);
786 			extent_flags = btrfs_extent_flags(leaf, ei);
787 		} else {
788 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
789 			struct btrfs_extent_item_v0 *ei0;
790 			BUG_ON(item_size != sizeof(*ei0));
791 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
792 					     struct btrfs_extent_item_v0);
793 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
794 			/* FIXME: this isn't correct for data */
795 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
796 #else
797 			BUG();
798 #endif
799 		}
800 		BUG_ON(num_refs == 0);
801 	} else {
802 		num_refs = 0;
803 		extent_flags = 0;
804 		ret = 0;
805 	}
806 
807 	if (!trans)
808 		goto out;
809 
810 	delayed_refs = &trans->transaction->delayed_refs;
811 	spin_lock(&delayed_refs->lock);
812 	head = btrfs_find_delayed_ref_head(trans, bytenr);
813 	if (head) {
814 		if (!mutex_trylock(&head->mutex)) {
815 			atomic_inc(&head->node.refs);
816 			spin_unlock(&delayed_refs->lock);
817 
818 			btrfs_release_path(path);
819 
820 			/*
821 			 * Mutex was contended, block until it's released and try
822 			 * again
823 			 */
824 			mutex_lock(&head->mutex);
825 			mutex_unlock(&head->mutex);
826 			btrfs_put_delayed_ref(&head->node);
827 			goto again;
828 		}
829 		if (head->extent_op && head->extent_op->update_flags)
830 			extent_flags |= head->extent_op->flags_to_set;
831 		else
832 			BUG_ON(num_refs == 0);
833 
834 		num_refs += head->node.ref_mod;
835 		mutex_unlock(&head->mutex);
836 	}
837 	spin_unlock(&delayed_refs->lock);
838 out:
839 	WARN_ON(num_refs == 0);
840 	if (refs)
841 		*refs = num_refs;
842 	if (flags)
843 		*flags = extent_flags;
844 out_free:
845 	btrfs_free_path(path);
846 	return ret;
847 }
848 
849 /*
850  * Back reference rules.  Back refs have three main goals:
851  *
852  * 1) differentiate between all holders of references to an extent so that
853  *    when a reference is dropped we can make sure it was a valid reference
854  *    before freeing the extent.
855  *
856  * 2) Provide enough information to quickly find the holders of an extent
857  *    if we notice a given block is corrupted or bad.
858  *
859  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
860  *    maintenance.  This is actually the same as #2, but with a slightly
861  *    different use case.
862  *
863  * There are two kinds of back refs. The implicit back refs is optimized
864  * for pointers in non-shared tree blocks. For a given pointer in a block,
865  * back refs of this kind provide information about the block's owner tree
866  * and the pointer's key. These information allow us to find the block by
867  * b-tree searching. The full back refs is for pointers in tree blocks not
868  * referenced by their owner trees. The location of tree block is recorded
869  * in the back refs. Actually the full back refs is generic, and can be
870  * used in all cases the implicit back refs is used. The major shortcoming
871  * of the full back refs is its overhead. Every time a tree block gets
872  * COWed, we have to update back refs entry for all pointers in it.
873  *
874  * For a newly allocated tree block, we use implicit back refs for
875  * pointers in it. This means most tree related operations only involve
876  * implicit back refs. For a tree block created in old transaction, the
877  * only way to drop a reference to it is COW it. So we can detect the
878  * event that tree block loses its owner tree's reference and do the
879  * back refs conversion.
880  *
881  * When a tree block is COW'd through a tree, there are four cases:
882  *
883  * The reference count of the block is one and the tree is the block's
884  * owner tree. Nothing to do in this case.
885  *
886  * The reference count of the block is one and the tree is not the
887  * block's owner tree. In this case, full back refs is used for pointers
888  * in the block. Remove these full back refs, add implicit back refs for
889  * every pointers in the new block.
890  *
891  * The reference count of the block is greater than one and the tree is
892  * the block's owner tree. In this case, implicit back refs is used for
893  * pointers in the block. Add full back refs for every pointers in the
894  * block, increase lower level extents' reference counts. The original
895  * implicit back refs are entailed to the new block.
896  *
897  * The reference count of the block is greater than one and the tree is
898  * not the block's owner tree. Add implicit back refs for every pointer in
899  * the new block, increase lower level extents' reference count.
900  *
901  * Back Reference Key composing:
902  *
903  * The key objectid corresponds to the first byte in the extent,
904  * The key type is used to differentiate between types of back refs.
905  * There are different meanings of the key offset for different types
906  * of back refs.
907  *
908  * File extents can be referenced by:
909  *
910  * - multiple snapshots, subvolumes, or different generations in one subvol
911  * - different files inside a single subvolume
912  * - different offsets inside a file (bookend extents in file.c)
913  *
914  * The extent ref structure for the implicit back refs has fields for:
915  *
916  * - Objectid of the subvolume root
917  * - objectid of the file holding the reference
918  * - original offset in the file
919  * - how many bookend extents
920  *
921  * The key offset for the implicit back refs is hash of the first
922  * three fields.
923  *
924  * The extent ref structure for the full back refs has field for:
925  *
926  * - number of pointers in the tree leaf
927  *
928  * The key offset for the implicit back refs is the first byte of
929  * the tree leaf
930  *
931  * When a file extent is allocated, The implicit back refs is used.
932  * the fields are filled in:
933  *
934  *     (root_key.objectid, inode objectid, offset in file, 1)
935  *
936  * When a file extent is removed file truncation, we find the
937  * corresponding implicit back refs and check the following fields:
938  *
939  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
940  *
941  * Btree extents can be referenced by:
942  *
943  * - Different subvolumes
944  *
945  * Both the implicit back refs and the full back refs for tree blocks
946  * only consist of key. The key offset for the implicit back refs is
947  * objectid of block's owner tree. The key offset for the full back refs
948  * is the first byte of parent block.
949  *
950  * When implicit back refs is used, information about the lowest key and
951  * level of the tree block are required. These information are stored in
952  * tree block info structure.
953  */
954 
955 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
956 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
957 				  struct btrfs_root *root,
958 				  struct btrfs_path *path,
959 				  u64 owner, u32 extra_size)
960 {
961 	struct btrfs_extent_item *item;
962 	struct btrfs_extent_item_v0 *ei0;
963 	struct btrfs_extent_ref_v0 *ref0;
964 	struct btrfs_tree_block_info *bi;
965 	struct extent_buffer *leaf;
966 	struct btrfs_key key;
967 	struct btrfs_key found_key;
968 	u32 new_size = sizeof(*item);
969 	u64 refs;
970 	int ret;
971 
972 	leaf = path->nodes[0];
973 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
974 
975 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
976 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
977 			     struct btrfs_extent_item_v0);
978 	refs = btrfs_extent_refs_v0(leaf, ei0);
979 
980 	if (owner == (u64)-1) {
981 		while (1) {
982 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
983 				ret = btrfs_next_leaf(root, path);
984 				if (ret < 0)
985 					return ret;
986 				BUG_ON(ret > 0); /* Corruption */
987 				leaf = path->nodes[0];
988 			}
989 			btrfs_item_key_to_cpu(leaf, &found_key,
990 					      path->slots[0]);
991 			BUG_ON(key.objectid != found_key.objectid);
992 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
993 				path->slots[0]++;
994 				continue;
995 			}
996 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
997 					      struct btrfs_extent_ref_v0);
998 			owner = btrfs_ref_objectid_v0(leaf, ref0);
999 			break;
1000 		}
1001 	}
1002 	btrfs_release_path(path);
1003 
1004 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1005 		new_size += sizeof(*bi);
1006 
1007 	new_size -= sizeof(*ei0);
1008 	ret = btrfs_search_slot(trans, root, &key, path,
1009 				new_size + extra_size, 1);
1010 	if (ret < 0)
1011 		return ret;
1012 	BUG_ON(ret); /* Corruption */
1013 
1014 	btrfs_extend_item(trans, root, path, new_size);
1015 
1016 	leaf = path->nodes[0];
1017 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1018 	btrfs_set_extent_refs(leaf, item, refs);
1019 	/* FIXME: get real generation */
1020 	btrfs_set_extent_generation(leaf, item, 0);
1021 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1022 		btrfs_set_extent_flags(leaf, item,
1023 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1024 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1025 		bi = (struct btrfs_tree_block_info *)(item + 1);
1026 		/* FIXME: get first key of the block */
1027 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1028 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1029 	} else {
1030 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1031 	}
1032 	btrfs_mark_buffer_dirty(leaf);
1033 	return 0;
1034 }
1035 #endif
1036 
1037 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1038 {
1039 	u32 high_crc = ~(u32)0;
1040 	u32 low_crc = ~(u32)0;
1041 	__le64 lenum;
1042 
1043 	lenum = cpu_to_le64(root_objectid);
1044 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1045 	lenum = cpu_to_le64(owner);
1046 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1047 	lenum = cpu_to_le64(offset);
1048 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1049 
1050 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1051 }
1052 
1053 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1054 				     struct btrfs_extent_data_ref *ref)
1055 {
1056 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1057 				    btrfs_extent_data_ref_objectid(leaf, ref),
1058 				    btrfs_extent_data_ref_offset(leaf, ref));
1059 }
1060 
1061 static int match_extent_data_ref(struct extent_buffer *leaf,
1062 				 struct btrfs_extent_data_ref *ref,
1063 				 u64 root_objectid, u64 owner, u64 offset)
1064 {
1065 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1066 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1067 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1068 		return 0;
1069 	return 1;
1070 }
1071 
1072 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1073 					   struct btrfs_root *root,
1074 					   struct btrfs_path *path,
1075 					   u64 bytenr, u64 parent,
1076 					   u64 root_objectid,
1077 					   u64 owner, u64 offset)
1078 {
1079 	struct btrfs_key key;
1080 	struct btrfs_extent_data_ref *ref;
1081 	struct extent_buffer *leaf;
1082 	u32 nritems;
1083 	int ret;
1084 	int recow;
1085 	int err = -ENOENT;
1086 
1087 	key.objectid = bytenr;
1088 	if (parent) {
1089 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1090 		key.offset = parent;
1091 	} else {
1092 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1093 		key.offset = hash_extent_data_ref(root_objectid,
1094 						  owner, offset);
1095 	}
1096 again:
1097 	recow = 0;
1098 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1099 	if (ret < 0) {
1100 		err = ret;
1101 		goto fail;
1102 	}
1103 
1104 	if (parent) {
1105 		if (!ret)
1106 			return 0;
1107 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1108 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1109 		btrfs_release_path(path);
1110 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111 		if (ret < 0) {
1112 			err = ret;
1113 			goto fail;
1114 		}
1115 		if (!ret)
1116 			return 0;
1117 #endif
1118 		goto fail;
1119 	}
1120 
1121 	leaf = path->nodes[0];
1122 	nritems = btrfs_header_nritems(leaf);
1123 	while (1) {
1124 		if (path->slots[0] >= nritems) {
1125 			ret = btrfs_next_leaf(root, path);
1126 			if (ret < 0)
1127 				err = ret;
1128 			if (ret)
1129 				goto fail;
1130 
1131 			leaf = path->nodes[0];
1132 			nritems = btrfs_header_nritems(leaf);
1133 			recow = 1;
1134 		}
1135 
1136 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1137 		if (key.objectid != bytenr ||
1138 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1139 			goto fail;
1140 
1141 		ref = btrfs_item_ptr(leaf, path->slots[0],
1142 				     struct btrfs_extent_data_ref);
1143 
1144 		if (match_extent_data_ref(leaf, ref, root_objectid,
1145 					  owner, offset)) {
1146 			if (recow) {
1147 				btrfs_release_path(path);
1148 				goto again;
1149 			}
1150 			err = 0;
1151 			break;
1152 		}
1153 		path->slots[0]++;
1154 	}
1155 fail:
1156 	return err;
1157 }
1158 
1159 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1160 					   struct btrfs_root *root,
1161 					   struct btrfs_path *path,
1162 					   u64 bytenr, u64 parent,
1163 					   u64 root_objectid, u64 owner,
1164 					   u64 offset, int refs_to_add)
1165 {
1166 	struct btrfs_key key;
1167 	struct extent_buffer *leaf;
1168 	u32 size;
1169 	u32 num_refs;
1170 	int ret;
1171 
1172 	key.objectid = bytenr;
1173 	if (parent) {
1174 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1175 		key.offset = parent;
1176 		size = sizeof(struct btrfs_shared_data_ref);
1177 	} else {
1178 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1179 		key.offset = hash_extent_data_ref(root_objectid,
1180 						  owner, offset);
1181 		size = sizeof(struct btrfs_extent_data_ref);
1182 	}
1183 
1184 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1185 	if (ret && ret != -EEXIST)
1186 		goto fail;
1187 
1188 	leaf = path->nodes[0];
1189 	if (parent) {
1190 		struct btrfs_shared_data_ref *ref;
1191 		ref = btrfs_item_ptr(leaf, path->slots[0],
1192 				     struct btrfs_shared_data_ref);
1193 		if (ret == 0) {
1194 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1195 		} else {
1196 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1197 			num_refs += refs_to_add;
1198 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1199 		}
1200 	} else {
1201 		struct btrfs_extent_data_ref *ref;
1202 		while (ret == -EEXIST) {
1203 			ref = btrfs_item_ptr(leaf, path->slots[0],
1204 					     struct btrfs_extent_data_ref);
1205 			if (match_extent_data_ref(leaf, ref, root_objectid,
1206 						  owner, offset))
1207 				break;
1208 			btrfs_release_path(path);
1209 			key.offset++;
1210 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1211 						      size);
1212 			if (ret && ret != -EEXIST)
1213 				goto fail;
1214 
1215 			leaf = path->nodes[0];
1216 		}
1217 		ref = btrfs_item_ptr(leaf, path->slots[0],
1218 				     struct btrfs_extent_data_ref);
1219 		if (ret == 0) {
1220 			btrfs_set_extent_data_ref_root(leaf, ref,
1221 						       root_objectid);
1222 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1223 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1224 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1225 		} else {
1226 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1227 			num_refs += refs_to_add;
1228 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1229 		}
1230 	}
1231 	btrfs_mark_buffer_dirty(leaf);
1232 	ret = 0;
1233 fail:
1234 	btrfs_release_path(path);
1235 	return ret;
1236 }
1237 
1238 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1239 					   struct btrfs_root *root,
1240 					   struct btrfs_path *path,
1241 					   int refs_to_drop)
1242 {
1243 	struct btrfs_key key;
1244 	struct btrfs_extent_data_ref *ref1 = NULL;
1245 	struct btrfs_shared_data_ref *ref2 = NULL;
1246 	struct extent_buffer *leaf;
1247 	u32 num_refs = 0;
1248 	int ret = 0;
1249 
1250 	leaf = path->nodes[0];
1251 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1252 
1253 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1254 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1255 				      struct btrfs_extent_data_ref);
1256 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1257 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1258 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1259 				      struct btrfs_shared_data_ref);
1260 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1261 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1262 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1263 		struct btrfs_extent_ref_v0 *ref0;
1264 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1265 				      struct btrfs_extent_ref_v0);
1266 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1267 #endif
1268 	} else {
1269 		BUG();
1270 	}
1271 
1272 	BUG_ON(num_refs < refs_to_drop);
1273 	num_refs -= refs_to_drop;
1274 
1275 	if (num_refs == 0) {
1276 		ret = btrfs_del_item(trans, root, path);
1277 	} else {
1278 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1279 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1280 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1281 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1282 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1283 		else {
1284 			struct btrfs_extent_ref_v0 *ref0;
1285 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1286 					struct btrfs_extent_ref_v0);
1287 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1288 		}
1289 #endif
1290 		btrfs_mark_buffer_dirty(leaf);
1291 	}
1292 	return ret;
1293 }
1294 
1295 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1296 					  struct btrfs_path *path,
1297 					  struct btrfs_extent_inline_ref *iref)
1298 {
1299 	struct btrfs_key key;
1300 	struct extent_buffer *leaf;
1301 	struct btrfs_extent_data_ref *ref1;
1302 	struct btrfs_shared_data_ref *ref2;
1303 	u32 num_refs = 0;
1304 
1305 	leaf = path->nodes[0];
1306 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1307 	if (iref) {
1308 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1309 		    BTRFS_EXTENT_DATA_REF_KEY) {
1310 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1311 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1312 		} else {
1313 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1314 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1315 		}
1316 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1317 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1318 				      struct btrfs_extent_data_ref);
1319 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1320 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1321 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1322 				      struct btrfs_shared_data_ref);
1323 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1324 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1325 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1326 		struct btrfs_extent_ref_v0 *ref0;
1327 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1328 				      struct btrfs_extent_ref_v0);
1329 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1330 #endif
1331 	} else {
1332 		WARN_ON(1);
1333 	}
1334 	return num_refs;
1335 }
1336 
1337 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1338 					  struct btrfs_root *root,
1339 					  struct btrfs_path *path,
1340 					  u64 bytenr, u64 parent,
1341 					  u64 root_objectid)
1342 {
1343 	struct btrfs_key key;
1344 	int ret;
1345 
1346 	key.objectid = bytenr;
1347 	if (parent) {
1348 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1349 		key.offset = parent;
1350 	} else {
1351 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1352 		key.offset = root_objectid;
1353 	}
1354 
1355 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1356 	if (ret > 0)
1357 		ret = -ENOENT;
1358 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1359 	if (ret == -ENOENT && parent) {
1360 		btrfs_release_path(path);
1361 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1362 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1363 		if (ret > 0)
1364 			ret = -ENOENT;
1365 	}
1366 #endif
1367 	return ret;
1368 }
1369 
1370 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1371 					  struct btrfs_root *root,
1372 					  struct btrfs_path *path,
1373 					  u64 bytenr, u64 parent,
1374 					  u64 root_objectid)
1375 {
1376 	struct btrfs_key key;
1377 	int ret;
1378 
1379 	key.objectid = bytenr;
1380 	if (parent) {
1381 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1382 		key.offset = parent;
1383 	} else {
1384 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1385 		key.offset = root_objectid;
1386 	}
1387 
1388 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1389 	btrfs_release_path(path);
1390 	return ret;
1391 }
1392 
1393 static inline int extent_ref_type(u64 parent, u64 owner)
1394 {
1395 	int type;
1396 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1397 		if (parent > 0)
1398 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1399 		else
1400 			type = BTRFS_TREE_BLOCK_REF_KEY;
1401 	} else {
1402 		if (parent > 0)
1403 			type = BTRFS_SHARED_DATA_REF_KEY;
1404 		else
1405 			type = BTRFS_EXTENT_DATA_REF_KEY;
1406 	}
1407 	return type;
1408 }
1409 
1410 static int find_next_key(struct btrfs_path *path, int level,
1411 			 struct btrfs_key *key)
1412 
1413 {
1414 	for (; level < BTRFS_MAX_LEVEL; level++) {
1415 		if (!path->nodes[level])
1416 			break;
1417 		if (path->slots[level] + 1 >=
1418 		    btrfs_header_nritems(path->nodes[level]))
1419 			continue;
1420 		if (level == 0)
1421 			btrfs_item_key_to_cpu(path->nodes[level], key,
1422 					      path->slots[level] + 1);
1423 		else
1424 			btrfs_node_key_to_cpu(path->nodes[level], key,
1425 					      path->slots[level] + 1);
1426 		return 0;
1427 	}
1428 	return 1;
1429 }
1430 
1431 /*
1432  * look for inline back ref. if back ref is found, *ref_ret is set
1433  * to the address of inline back ref, and 0 is returned.
1434  *
1435  * if back ref isn't found, *ref_ret is set to the address where it
1436  * should be inserted, and -ENOENT is returned.
1437  *
1438  * if insert is true and there are too many inline back refs, the path
1439  * points to the extent item, and -EAGAIN is returned.
1440  *
1441  * NOTE: inline back refs are ordered in the same way that back ref
1442  *	 items in the tree are ordered.
1443  */
1444 static noinline_for_stack
1445 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1446 				 struct btrfs_root *root,
1447 				 struct btrfs_path *path,
1448 				 struct btrfs_extent_inline_ref **ref_ret,
1449 				 u64 bytenr, u64 num_bytes,
1450 				 u64 parent, u64 root_objectid,
1451 				 u64 owner, u64 offset, int insert)
1452 {
1453 	struct btrfs_key key;
1454 	struct extent_buffer *leaf;
1455 	struct btrfs_extent_item *ei;
1456 	struct btrfs_extent_inline_ref *iref;
1457 	u64 flags;
1458 	u64 item_size;
1459 	unsigned long ptr;
1460 	unsigned long end;
1461 	int extra_size;
1462 	int type;
1463 	int want;
1464 	int ret;
1465 	int err = 0;
1466 
1467 	key.objectid = bytenr;
1468 	key.type = BTRFS_EXTENT_ITEM_KEY;
1469 	key.offset = num_bytes;
1470 
1471 	want = extent_ref_type(parent, owner);
1472 	if (insert) {
1473 		extra_size = btrfs_extent_inline_ref_size(want);
1474 		path->keep_locks = 1;
1475 	} else
1476 		extra_size = -1;
1477 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1478 	if (ret < 0) {
1479 		err = ret;
1480 		goto out;
1481 	}
1482 	if (ret && !insert) {
1483 		err = -ENOENT;
1484 		goto out;
1485 	}
1486 	BUG_ON(ret); /* Corruption */
1487 
1488 	leaf = path->nodes[0];
1489 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1490 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1491 	if (item_size < sizeof(*ei)) {
1492 		if (!insert) {
1493 			err = -ENOENT;
1494 			goto out;
1495 		}
1496 		ret = convert_extent_item_v0(trans, root, path, owner,
1497 					     extra_size);
1498 		if (ret < 0) {
1499 			err = ret;
1500 			goto out;
1501 		}
1502 		leaf = path->nodes[0];
1503 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1504 	}
1505 #endif
1506 	BUG_ON(item_size < sizeof(*ei));
1507 
1508 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1509 	flags = btrfs_extent_flags(leaf, ei);
1510 
1511 	ptr = (unsigned long)(ei + 1);
1512 	end = (unsigned long)ei + item_size;
1513 
1514 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1515 		ptr += sizeof(struct btrfs_tree_block_info);
1516 		BUG_ON(ptr > end);
1517 	} else {
1518 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1519 	}
1520 
1521 	err = -ENOENT;
1522 	while (1) {
1523 		if (ptr >= end) {
1524 			WARN_ON(ptr > end);
1525 			break;
1526 		}
1527 		iref = (struct btrfs_extent_inline_ref *)ptr;
1528 		type = btrfs_extent_inline_ref_type(leaf, iref);
1529 		if (want < type)
1530 			break;
1531 		if (want > type) {
1532 			ptr += btrfs_extent_inline_ref_size(type);
1533 			continue;
1534 		}
1535 
1536 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1537 			struct btrfs_extent_data_ref *dref;
1538 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1539 			if (match_extent_data_ref(leaf, dref, root_objectid,
1540 						  owner, offset)) {
1541 				err = 0;
1542 				break;
1543 			}
1544 			if (hash_extent_data_ref_item(leaf, dref) <
1545 			    hash_extent_data_ref(root_objectid, owner, offset))
1546 				break;
1547 		} else {
1548 			u64 ref_offset;
1549 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1550 			if (parent > 0) {
1551 				if (parent == ref_offset) {
1552 					err = 0;
1553 					break;
1554 				}
1555 				if (ref_offset < parent)
1556 					break;
1557 			} else {
1558 				if (root_objectid == ref_offset) {
1559 					err = 0;
1560 					break;
1561 				}
1562 				if (ref_offset < root_objectid)
1563 					break;
1564 			}
1565 		}
1566 		ptr += btrfs_extent_inline_ref_size(type);
1567 	}
1568 	if (err == -ENOENT && insert) {
1569 		if (item_size + extra_size >=
1570 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1571 			err = -EAGAIN;
1572 			goto out;
1573 		}
1574 		/*
1575 		 * To add new inline back ref, we have to make sure
1576 		 * there is no corresponding back ref item.
1577 		 * For simplicity, we just do not add new inline back
1578 		 * ref if there is any kind of item for this block
1579 		 */
1580 		if (find_next_key(path, 0, &key) == 0 &&
1581 		    key.objectid == bytenr &&
1582 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1583 			err = -EAGAIN;
1584 			goto out;
1585 		}
1586 	}
1587 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1588 out:
1589 	if (insert) {
1590 		path->keep_locks = 0;
1591 		btrfs_unlock_up_safe(path, 1);
1592 	}
1593 	return err;
1594 }
1595 
1596 /*
1597  * helper to add new inline back ref
1598  */
1599 static noinline_for_stack
1600 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1601 				 struct btrfs_root *root,
1602 				 struct btrfs_path *path,
1603 				 struct btrfs_extent_inline_ref *iref,
1604 				 u64 parent, u64 root_objectid,
1605 				 u64 owner, u64 offset, int refs_to_add,
1606 				 struct btrfs_delayed_extent_op *extent_op)
1607 {
1608 	struct extent_buffer *leaf;
1609 	struct btrfs_extent_item *ei;
1610 	unsigned long ptr;
1611 	unsigned long end;
1612 	unsigned long item_offset;
1613 	u64 refs;
1614 	int size;
1615 	int type;
1616 
1617 	leaf = path->nodes[0];
1618 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1619 	item_offset = (unsigned long)iref - (unsigned long)ei;
1620 
1621 	type = extent_ref_type(parent, owner);
1622 	size = btrfs_extent_inline_ref_size(type);
1623 
1624 	btrfs_extend_item(trans, root, path, size);
1625 
1626 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1627 	refs = btrfs_extent_refs(leaf, ei);
1628 	refs += refs_to_add;
1629 	btrfs_set_extent_refs(leaf, ei, refs);
1630 	if (extent_op)
1631 		__run_delayed_extent_op(extent_op, leaf, ei);
1632 
1633 	ptr = (unsigned long)ei + item_offset;
1634 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1635 	if (ptr < end - size)
1636 		memmove_extent_buffer(leaf, ptr + size, ptr,
1637 				      end - size - ptr);
1638 
1639 	iref = (struct btrfs_extent_inline_ref *)ptr;
1640 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1641 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1642 		struct btrfs_extent_data_ref *dref;
1643 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1644 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1645 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1646 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1647 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1648 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1649 		struct btrfs_shared_data_ref *sref;
1650 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1651 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1652 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1653 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1654 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1655 	} else {
1656 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1657 	}
1658 	btrfs_mark_buffer_dirty(leaf);
1659 }
1660 
1661 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1662 				 struct btrfs_root *root,
1663 				 struct btrfs_path *path,
1664 				 struct btrfs_extent_inline_ref **ref_ret,
1665 				 u64 bytenr, u64 num_bytes, u64 parent,
1666 				 u64 root_objectid, u64 owner, u64 offset)
1667 {
1668 	int ret;
1669 
1670 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1671 					   bytenr, num_bytes, parent,
1672 					   root_objectid, owner, offset, 0);
1673 	if (ret != -ENOENT)
1674 		return ret;
1675 
1676 	btrfs_release_path(path);
1677 	*ref_ret = NULL;
1678 
1679 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1680 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1681 					    root_objectid);
1682 	} else {
1683 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1684 					     root_objectid, owner, offset);
1685 	}
1686 	return ret;
1687 }
1688 
1689 /*
1690  * helper to update/remove inline back ref
1691  */
1692 static noinline_for_stack
1693 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1694 				  struct btrfs_root *root,
1695 				  struct btrfs_path *path,
1696 				  struct btrfs_extent_inline_ref *iref,
1697 				  int refs_to_mod,
1698 				  struct btrfs_delayed_extent_op *extent_op)
1699 {
1700 	struct extent_buffer *leaf;
1701 	struct btrfs_extent_item *ei;
1702 	struct btrfs_extent_data_ref *dref = NULL;
1703 	struct btrfs_shared_data_ref *sref = NULL;
1704 	unsigned long ptr;
1705 	unsigned long end;
1706 	u32 item_size;
1707 	int size;
1708 	int type;
1709 	u64 refs;
1710 
1711 	leaf = path->nodes[0];
1712 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1713 	refs = btrfs_extent_refs(leaf, ei);
1714 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1715 	refs += refs_to_mod;
1716 	btrfs_set_extent_refs(leaf, ei, refs);
1717 	if (extent_op)
1718 		__run_delayed_extent_op(extent_op, leaf, ei);
1719 
1720 	type = btrfs_extent_inline_ref_type(leaf, iref);
1721 
1722 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1723 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1724 		refs = btrfs_extent_data_ref_count(leaf, dref);
1725 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1726 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1727 		refs = btrfs_shared_data_ref_count(leaf, sref);
1728 	} else {
1729 		refs = 1;
1730 		BUG_ON(refs_to_mod != -1);
1731 	}
1732 
1733 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1734 	refs += refs_to_mod;
1735 
1736 	if (refs > 0) {
1737 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1738 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1739 		else
1740 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1741 	} else {
1742 		size =  btrfs_extent_inline_ref_size(type);
1743 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1744 		ptr = (unsigned long)iref;
1745 		end = (unsigned long)ei + item_size;
1746 		if (ptr + size < end)
1747 			memmove_extent_buffer(leaf, ptr, ptr + size,
1748 					      end - ptr - size);
1749 		item_size -= size;
1750 		btrfs_truncate_item(trans, root, path, item_size, 1);
1751 	}
1752 	btrfs_mark_buffer_dirty(leaf);
1753 }
1754 
1755 static noinline_for_stack
1756 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1757 				 struct btrfs_root *root,
1758 				 struct btrfs_path *path,
1759 				 u64 bytenr, u64 num_bytes, u64 parent,
1760 				 u64 root_objectid, u64 owner,
1761 				 u64 offset, int refs_to_add,
1762 				 struct btrfs_delayed_extent_op *extent_op)
1763 {
1764 	struct btrfs_extent_inline_ref *iref;
1765 	int ret;
1766 
1767 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1768 					   bytenr, num_bytes, parent,
1769 					   root_objectid, owner, offset, 1);
1770 	if (ret == 0) {
1771 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1772 		update_inline_extent_backref(trans, root, path, iref,
1773 					     refs_to_add, extent_op);
1774 	} else if (ret == -ENOENT) {
1775 		setup_inline_extent_backref(trans, root, path, iref, parent,
1776 					    root_objectid, owner, offset,
1777 					    refs_to_add, extent_op);
1778 		ret = 0;
1779 	}
1780 	return ret;
1781 }
1782 
1783 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1784 				 struct btrfs_root *root,
1785 				 struct btrfs_path *path,
1786 				 u64 bytenr, u64 parent, u64 root_objectid,
1787 				 u64 owner, u64 offset, int refs_to_add)
1788 {
1789 	int ret;
1790 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1791 		BUG_ON(refs_to_add != 1);
1792 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1793 					    parent, root_objectid);
1794 	} else {
1795 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1796 					     parent, root_objectid,
1797 					     owner, offset, refs_to_add);
1798 	}
1799 	return ret;
1800 }
1801 
1802 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1803 				 struct btrfs_root *root,
1804 				 struct btrfs_path *path,
1805 				 struct btrfs_extent_inline_ref *iref,
1806 				 int refs_to_drop, int is_data)
1807 {
1808 	int ret = 0;
1809 
1810 	BUG_ON(!is_data && refs_to_drop != 1);
1811 	if (iref) {
1812 		update_inline_extent_backref(trans, root, path, iref,
1813 					     -refs_to_drop, NULL);
1814 	} else if (is_data) {
1815 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1816 	} else {
1817 		ret = btrfs_del_item(trans, root, path);
1818 	}
1819 	return ret;
1820 }
1821 
1822 static int btrfs_issue_discard(struct block_device *bdev,
1823 				u64 start, u64 len)
1824 {
1825 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1826 }
1827 
1828 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1829 				u64 num_bytes, u64 *actual_bytes)
1830 {
1831 	int ret;
1832 	u64 discarded_bytes = 0;
1833 	struct btrfs_bio *bbio = NULL;
1834 
1835 
1836 	/* Tell the block device(s) that the sectors can be discarded */
1837 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1838 			      bytenr, &num_bytes, &bbio, 0);
1839 	/* Error condition is -ENOMEM */
1840 	if (!ret) {
1841 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1842 		int i;
1843 
1844 
1845 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1846 			if (!stripe->dev->can_discard)
1847 				continue;
1848 
1849 			ret = btrfs_issue_discard(stripe->dev->bdev,
1850 						  stripe->physical,
1851 						  stripe->length);
1852 			if (!ret)
1853 				discarded_bytes += stripe->length;
1854 			else if (ret != -EOPNOTSUPP)
1855 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1856 
1857 			/*
1858 			 * Just in case we get back EOPNOTSUPP for some reason,
1859 			 * just ignore the return value so we don't screw up
1860 			 * people calling discard_extent.
1861 			 */
1862 			ret = 0;
1863 		}
1864 		kfree(bbio);
1865 	}
1866 
1867 	if (actual_bytes)
1868 		*actual_bytes = discarded_bytes;
1869 
1870 
1871 	return ret;
1872 }
1873 
1874 /* Can return -ENOMEM */
1875 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1876 			 struct btrfs_root *root,
1877 			 u64 bytenr, u64 num_bytes, u64 parent,
1878 			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1879 {
1880 	int ret;
1881 	struct btrfs_fs_info *fs_info = root->fs_info;
1882 
1883 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1884 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1885 
1886 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1887 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1888 					num_bytes,
1889 					parent, root_objectid, (int)owner,
1890 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1891 	} else {
1892 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1893 					num_bytes,
1894 					parent, root_objectid, owner, offset,
1895 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1896 	}
1897 	return ret;
1898 }
1899 
1900 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1901 				  struct btrfs_root *root,
1902 				  u64 bytenr, u64 num_bytes,
1903 				  u64 parent, u64 root_objectid,
1904 				  u64 owner, u64 offset, int refs_to_add,
1905 				  struct btrfs_delayed_extent_op *extent_op)
1906 {
1907 	struct btrfs_path *path;
1908 	struct extent_buffer *leaf;
1909 	struct btrfs_extent_item *item;
1910 	u64 refs;
1911 	int ret;
1912 	int err = 0;
1913 
1914 	path = btrfs_alloc_path();
1915 	if (!path)
1916 		return -ENOMEM;
1917 
1918 	path->reada = 1;
1919 	path->leave_spinning = 1;
1920 	/* this will setup the path even if it fails to insert the back ref */
1921 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1922 					   path, bytenr, num_bytes, parent,
1923 					   root_objectid, owner, offset,
1924 					   refs_to_add, extent_op);
1925 	if (ret == 0)
1926 		goto out;
1927 
1928 	if (ret != -EAGAIN) {
1929 		err = ret;
1930 		goto out;
1931 	}
1932 
1933 	leaf = path->nodes[0];
1934 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1935 	refs = btrfs_extent_refs(leaf, item);
1936 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1937 	if (extent_op)
1938 		__run_delayed_extent_op(extent_op, leaf, item);
1939 
1940 	btrfs_mark_buffer_dirty(leaf);
1941 	btrfs_release_path(path);
1942 
1943 	path->reada = 1;
1944 	path->leave_spinning = 1;
1945 
1946 	/* now insert the actual backref */
1947 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1948 				    path, bytenr, parent, root_objectid,
1949 				    owner, offset, refs_to_add);
1950 	if (ret)
1951 		btrfs_abort_transaction(trans, root, ret);
1952 out:
1953 	btrfs_free_path(path);
1954 	return err;
1955 }
1956 
1957 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1958 				struct btrfs_root *root,
1959 				struct btrfs_delayed_ref_node *node,
1960 				struct btrfs_delayed_extent_op *extent_op,
1961 				int insert_reserved)
1962 {
1963 	int ret = 0;
1964 	struct btrfs_delayed_data_ref *ref;
1965 	struct btrfs_key ins;
1966 	u64 parent = 0;
1967 	u64 ref_root = 0;
1968 	u64 flags = 0;
1969 
1970 	ins.objectid = node->bytenr;
1971 	ins.offset = node->num_bytes;
1972 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1973 
1974 	ref = btrfs_delayed_node_to_data_ref(node);
1975 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1976 		parent = ref->parent;
1977 	else
1978 		ref_root = ref->root;
1979 
1980 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1981 		if (extent_op) {
1982 			BUG_ON(extent_op->update_key);
1983 			flags |= extent_op->flags_to_set;
1984 		}
1985 		ret = alloc_reserved_file_extent(trans, root,
1986 						 parent, ref_root, flags,
1987 						 ref->objectid, ref->offset,
1988 						 &ins, node->ref_mod);
1989 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1990 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1991 					     node->num_bytes, parent,
1992 					     ref_root, ref->objectid,
1993 					     ref->offset, node->ref_mod,
1994 					     extent_op);
1995 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1996 		ret = __btrfs_free_extent(trans, root, node->bytenr,
1997 					  node->num_bytes, parent,
1998 					  ref_root, ref->objectid,
1999 					  ref->offset, node->ref_mod,
2000 					  extent_op);
2001 	} else {
2002 		BUG();
2003 	}
2004 	return ret;
2005 }
2006 
2007 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2008 				    struct extent_buffer *leaf,
2009 				    struct btrfs_extent_item *ei)
2010 {
2011 	u64 flags = btrfs_extent_flags(leaf, ei);
2012 	if (extent_op->update_flags) {
2013 		flags |= extent_op->flags_to_set;
2014 		btrfs_set_extent_flags(leaf, ei, flags);
2015 	}
2016 
2017 	if (extent_op->update_key) {
2018 		struct btrfs_tree_block_info *bi;
2019 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2020 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2021 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2022 	}
2023 }
2024 
2025 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2026 				 struct btrfs_root *root,
2027 				 struct btrfs_delayed_ref_node *node,
2028 				 struct btrfs_delayed_extent_op *extent_op)
2029 {
2030 	struct btrfs_key key;
2031 	struct btrfs_path *path;
2032 	struct btrfs_extent_item *ei;
2033 	struct extent_buffer *leaf;
2034 	u32 item_size;
2035 	int ret;
2036 	int err = 0;
2037 
2038 	if (trans->aborted)
2039 		return 0;
2040 
2041 	path = btrfs_alloc_path();
2042 	if (!path)
2043 		return -ENOMEM;
2044 
2045 	key.objectid = node->bytenr;
2046 	key.type = BTRFS_EXTENT_ITEM_KEY;
2047 	key.offset = node->num_bytes;
2048 
2049 	path->reada = 1;
2050 	path->leave_spinning = 1;
2051 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2052 				path, 0, 1);
2053 	if (ret < 0) {
2054 		err = ret;
2055 		goto out;
2056 	}
2057 	if (ret > 0) {
2058 		err = -EIO;
2059 		goto out;
2060 	}
2061 
2062 	leaf = path->nodes[0];
2063 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2064 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2065 	if (item_size < sizeof(*ei)) {
2066 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2067 					     path, (u64)-1, 0);
2068 		if (ret < 0) {
2069 			err = ret;
2070 			goto out;
2071 		}
2072 		leaf = path->nodes[0];
2073 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2074 	}
2075 #endif
2076 	BUG_ON(item_size < sizeof(*ei));
2077 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2078 	__run_delayed_extent_op(extent_op, leaf, ei);
2079 
2080 	btrfs_mark_buffer_dirty(leaf);
2081 out:
2082 	btrfs_free_path(path);
2083 	return err;
2084 }
2085 
2086 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2087 				struct btrfs_root *root,
2088 				struct btrfs_delayed_ref_node *node,
2089 				struct btrfs_delayed_extent_op *extent_op,
2090 				int insert_reserved)
2091 {
2092 	int ret = 0;
2093 	struct btrfs_delayed_tree_ref *ref;
2094 	struct btrfs_key ins;
2095 	u64 parent = 0;
2096 	u64 ref_root = 0;
2097 
2098 	ins.objectid = node->bytenr;
2099 	ins.offset = node->num_bytes;
2100 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2101 
2102 	ref = btrfs_delayed_node_to_tree_ref(node);
2103 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2104 		parent = ref->parent;
2105 	else
2106 		ref_root = ref->root;
2107 
2108 	BUG_ON(node->ref_mod != 1);
2109 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2110 		BUG_ON(!extent_op || !extent_op->update_flags ||
2111 		       !extent_op->update_key);
2112 		ret = alloc_reserved_tree_block(trans, root,
2113 						parent, ref_root,
2114 						extent_op->flags_to_set,
2115 						&extent_op->key,
2116 						ref->level, &ins);
2117 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2118 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2119 					     node->num_bytes, parent, ref_root,
2120 					     ref->level, 0, 1, extent_op);
2121 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2122 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2123 					  node->num_bytes, parent, ref_root,
2124 					  ref->level, 0, 1, extent_op);
2125 	} else {
2126 		BUG();
2127 	}
2128 	return ret;
2129 }
2130 
2131 /* helper function to actually process a single delayed ref entry */
2132 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2133 			       struct btrfs_root *root,
2134 			       struct btrfs_delayed_ref_node *node,
2135 			       struct btrfs_delayed_extent_op *extent_op,
2136 			       int insert_reserved)
2137 {
2138 	int ret = 0;
2139 
2140 	if (trans->aborted)
2141 		return 0;
2142 
2143 	if (btrfs_delayed_ref_is_head(node)) {
2144 		struct btrfs_delayed_ref_head *head;
2145 		/*
2146 		 * we've hit the end of the chain and we were supposed
2147 		 * to insert this extent into the tree.  But, it got
2148 		 * deleted before we ever needed to insert it, so all
2149 		 * we have to do is clean up the accounting
2150 		 */
2151 		BUG_ON(extent_op);
2152 		head = btrfs_delayed_node_to_head(node);
2153 		if (insert_reserved) {
2154 			btrfs_pin_extent(root, node->bytenr,
2155 					 node->num_bytes, 1);
2156 			if (head->is_data) {
2157 				ret = btrfs_del_csums(trans, root,
2158 						      node->bytenr,
2159 						      node->num_bytes);
2160 			}
2161 		}
2162 		mutex_unlock(&head->mutex);
2163 		return ret;
2164 	}
2165 
2166 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2167 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2168 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2169 					   insert_reserved);
2170 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2171 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2172 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2173 					   insert_reserved);
2174 	else
2175 		BUG();
2176 	return ret;
2177 }
2178 
2179 static noinline struct btrfs_delayed_ref_node *
2180 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2181 {
2182 	struct rb_node *node;
2183 	struct btrfs_delayed_ref_node *ref;
2184 	int action = BTRFS_ADD_DELAYED_REF;
2185 again:
2186 	/*
2187 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2188 	 * this prevents ref count from going down to zero when
2189 	 * there still are pending delayed ref.
2190 	 */
2191 	node = rb_prev(&head->node.rb_node);
2192 	while (1) {
2193 		if (!node)
2194 			break;
2195 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2196 				rb_node);
2197 		if (ref->bytenr != head->node.bytenr)
2198 			break;
2199 		if (ref->action == action)
2200 			return ref;
2201 		node = rb_prev(node);
2202 	}
2203 	if (action == BTRFS_ADD_DELAYED_REF) {
2204 		action = BTRFS_DROP_DELAYED_REF;
2205 		goto again;
2206 	}
2207 	return NULL;
2208 }
2209 
2210 /*
2211  * Returns 0 on success or if called with an already aborted transaction.
2212  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2213  */
2214 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2215 				       struct btrfs_root *root,
2216 				       struct list_head *cluster)
2217 {
2218 	struct btrfs_delayed_ref_root *delayed_refs;
2219 	struct btrfs_delayed_ref_node *ref;
2220 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2221 	struct btrfs_delayed_extent_op *extent_op;
2222 	int ret;
2223 	int count = 0;
2224 	int must_insert_reserved = 0;
2225 
2226 	delayed_refs = &trans->transaction->delayed_refs;
2227 	while (1) {
2228 		if (!locked_ref) {
2229 			/* pick a new head ref from the cluster list */
2230 			if (list_empty(cluster))
2231 				break;
2232 
2233 			locked_ref = list_entry(cluster->next,
2234 				     struct btrfs_delayed_ref_head, cluster);
2235 
2236 			/* grab the lock that says we are going to process
2237 			 * all the refs for this head */
2238 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2239 
2240 			/*
2241 			 * we may have dropped the spin lock to get the head
2242 			 * mutex lock, and that might have given someone else
2243 			 * time to free the head.  If that's true, it has been
2244 			 * removed from our list and we can move on.
2245 			 */
2246 			if (ret == -EAGAIN) {
2247 				locked_ref = NULL;
2248 				count++;
2249 				continue;
2250 			}
2251 		}
2252 
2253 		/*
2254 		 * locked_ref is the head node, so we have to go one
2255 		 * node back for any delayed ref updates
2256 		 */
2257 		ref = select_delayed_ref(locked_ref);
2258 
2259 		if (ref && ref->seq &&
2260 		    btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2261 			/*
2262 			 * there are still refs with lower seq numbers in the
2263 			 * process of being added. Don't run this ref yet.
2264 			 */
2265 			list_del_init(&locked_ref->cluster);
2266 			mutex_unlock(&locked_ref->mutex);
2267 			locked_ref = NULL;
2268 			delayed_refs->num_heads_ready++;
2269 			spin_unlock(&delayed_refs->lock);
2270 			cond_resched();
2271 			spin_lock(&delayed_refs->lock);
2272 			continue;
2273 		}
2274 
2275 		/*
2276 		 * record the must insert reserved flag before we
2277 		 * drop the spin lock.
2278 		 */
2279 		must_insert_reserved = locked_ref->must_insert_reserved;
2280 		locked_ref->must_insert_reserved = 0;
2281 
2282 		extent_op = locked_ref->extent_op;
2283 		locked_ref->extent_op = NULL;
2284 
2285 		if (!ref) {
2286 			/* All delayed refs have been processed, Go ahead
2287 			 * and send the head node to run_one_delayed_ref,
2288 			 * so that any accounting fixes can happen
2289 			 */
2290 			ref = &locked_ref->node;
2291 
2292 			if (extent_op && must_insert_reserved) {
2293 				kfree(extent_op);
2294 				extent_op = NULL;
2295 			}
2296 
2297 			if (extent_op) {
2298 				spin_unlock(&delayed_refs->lock);
2299 
2300 				ret = run_delayed_extent_op(trans, root,
2301 							    ref, extent_op);
2302 				kfree(extent_op);
2303 
2304 				if (ret) {
2305 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2306 					return ret;
2307 				}
2308 
2309 				goto next;
2310 			}
2311 
2312 			list_del_init(&locked_ref->cluster);
2313 			locked_ref = NULL;
2314 		}
2315 
2316 		ref->in_tree = 0;
2317 		rb_erase(&ref->rb_node, &delayed_refs->root);
2318 		delayed_refs->num_entries--;
2319 		/*
2320 		 * we modified num_entries, but as we're currently running
2321 		 * delayed refs, skip
2322 		 *     wake_up(&delayed_refs->seq_wait);
2323 		 * here.
2324 		 */
2325 		spin_unlock(&delayed_refs->lock);
2326 
2327 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2328 					  must_insert_reserved);
2329 
2330 		btrfs_put_delayed_ref(ref);
2331 		kfree(extent_op);
2332 		count++;
2333 
2334 		if (ret) {
2335 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2336 			return ret;
2337 		}
2338 
2339 next:
2340 		do_chunk_alloc(trans, root->fs_info->extent_root,
2341 			       2 * 1024 * 1024,
2342 			       btrfs_get_alloc_profile(root, 0),
2343 			       CHUNK_ALLOC_NO_FORCE);
2344 		cond_resched();
2345 		spin_lock(&delayed_refs->lock);
2346 	}
2347 	return count;
2348 }
2349 
2350 
2351 static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2352 			unsigned long num_refs)
2353 {
2354 	struct list_head *first_seq = delayed_refs->seq_head.next;
2355 
2356 	spin_unlock(&delayed_refs->lock);
2357 	pr_debug("waiting for more refs (num %ld, first %p)\n",
2358 		 num_refs, first_seq);
2359 	wait_event(delayed_refs->seq_wait,
2360 		   num_refs != delayed_refs->num_entries ||
2361 		   delayed_refs->seq_head.next != first_seq);
2362 	pr_debug("done waiting for more refs (num %ld, first %p)\n",
2363 		 delayed_refs->num_entries, delayed_refs->seq_head.next);
2364 	spin_lock(&delayed_refs->lock);
2365 }
2366 
2367 /*
2368  * this starts processing the delayed reference count updates and
2369  * extent insertions we have queued up so far.  count can be
2370  * 0, which means to process everything in the tree at the start
2371  * of the run (but not newly added entries), or it can be some target
2372  * number you'd like to process.
2373  *
2374  * Returns 0 on success or if called with an aborted transaction
2375  * Returns <0 on error and aborts the transaction
2376  */
2377 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2378 			   struct btrfs_root *root, unsigned long count)
2379 {
2380 	struct rb_node *node;
2381 	struct btrfs_delayed_ref_root *delayed_refs;
2382 	struct btrfs_delayed_ref_node *ref;
2383 	struct list_head cluster;
2384 	int ret;
2385 	u64 delayed_start;
2386 	int run_all = count == (unsigned long)-1;
2387 	int run_most = 0;
2388 	unsigned long num_refs = 0;
2389 	int consider_waiting;
2390 
2391 	/* We'll clean this up in btrfs_cleanup_transaction */
2392 	if (trans->aborted)
2393 		return 0;
2394 
2395 	if (root == root->fs_info->extent_root)
2396 		root = root->fs_info->tree_root;
2397 
2398 	do_chunk_alloc(trans, root->fs_info->extent_root,
2399 		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2400 		       CHUNK_ALLOC_NO_FORCE);
2401 
2402 	delayed_refs = &trans->transaction->delayed_refs;
2403 	INIT_LIST_HEAD(&cluster);
2404 again:
2405 	consider_waiting = 0;
2406 	spin_lock(&delayed_refs->lock);
2407 	if (count == 0) {
2408 		count = delayed_refs->num_entries * 2;
2409 		run_most = 1;
2410 	}
2411 	while (1) {
2412 		if (!(run_all || run_most) &&
2413 		    delayed_refs->num_heads_ready < 64)
2414 			break;
2415 
2416 		/*
2417 		 * go find something we can process in the rbtree.  We start at
2418 		 * the beginning of the tree, and then build a cluster
2419 		 * of refs to process starting at the first one we are able to
2420 		 * lock
2421 		 */
2422 		delayed_start = delayed_refs->run_delayed_start;
2423 		ret = btrfs_find_ref_cluster(trans, &cluster,
2424 					     delayed_refs->run_delayed_start);
2425 		if (ret)
2426 			break;
2427 
2428 		if (delayed_start >= delayed_refs->run_delayed_start) {
2429 			if (consider_waiting == 0) {
2430 				/*
2431 				 * btrfs_find_ref_cluster looped. let's do one
2432 				 * more cycle. if we don't run any delayed ref
2433 				 * during that cycle (because we can't because
2434 				 * all of them are blocked) and if the number of
2435 				 * refs doesn't change, we avoid busy waiting.
2436 				 */
2437 				consider_waiting = 1;
2438 				num_refs = delayed_refs->num_entries;
2439 			} else {
2440 				wait_for_more_refs(delayed_refs, num_refs);
2441 				/*
2442 				 * after waiting, things have changed. we
2443 				 * dropped the lock and someone else might have
2444 				 * run some refs, built new clusters and so on.
2445 				 * therefore, we restart staleness detection.
2446 				 */
2447 				consider_waiting = 0;
2448 			}
2449 		}
2450 
2451 		ret = run_clustered_refs(trans, root, &cluster);
2452 		if (ret < 0) {
2453 			spin_unlock(&delayed_refs->lock);
2454 			btrfs_abort_transaction(trans, root, ret);
2455 			return ret;
2456 		}
2457 
2458 		count -= min_t(unsigned long, ret, count);
2459 
2460 		if (count == 0)
2461 			break;
2462 
2463 		if (ret || delayed_refs->run_delayed_start == 0) {
2464 			/* refs were run, let's reset staleness detection */
2465 			consider_waiting = 0;
2466 		}
2467 	}
2468 
2469 	if (run_all) {
2470 		node = rb_first(&delayed_refs->root);
2471 		if (!node)
2472 			goto out;
2473 		count = (unsigned long)-1;
2474 
2475 		while (node) {
2476 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2477 				       rb_node);
2478 			if (btrfs_delayed_ref_is_head(ref)) {
2479 				struct btrfs_delayed_ref_head *head;
2480 
2481 				head = btrfs_delayed_node_to_head(ref);
2482 				atomic_inc(&ref->refs);
2483 
2484 				spin_unlock(&delayed_refs->lock);
2485 				/*
2486 				 * Mutex was contended, block until it's
2487 				 * released and try again
2488 				 */
2489 				mutex_lock(&head->mutex);
2490 				mutex_unlock(&head->mutex);
2491 
2492 				btrfs_put_delayed_ref(ref);
2493 				cond_resched();
2494 				goto again;
2495 			}
2496 			node = rb_next(node);
2497 		}
2498 		spin_unlock(&delayed_refs->lock);
2499 		schedule_timeout(1);
2500 		goto again;
2501 	}
2502 out:
2503 	spin_unlock(&delayed_refs->lock);
2504 	return 0;
2505 }
2506 
2507 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2508 				struct btrfs_root *root,
2509 				u64 bytenr, u64 num_bytes, u64 flags,
2510 				int is_data)
2511 {
2512 	struct btrfs_delayed_extent_op *extent_op;
2513 	int ret;
2514 
2515 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2516 	if (!extent_op)
2517 		return -ENOMEM;
2518 
2519 	extent_op->flags_to_set = flags;
2520 	extent_op->update_flags = 1;
2521 	extent_op->update_key = 0;
2522 	extent_op->is_data = is_data ? 1 : 0;
2523 
2524 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2525 					  num_bytes, extent_op);
2526 	if (ret)
2527 		kfree(extent_op);
2528 	return ret;
2529 }
2530 
2531 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2532 				      struct btrfs_root *root,
2533 				      struct btrfs_path *path,
2534 				      u64 objectid, u64 offset, u64 bytenr)
2535 {
2536 	struct btrfs_delayed_ref_head *head;
2537 	struct btrfs_delayed_ref_node *ref;
2538 	struct btrfs_delayed_data_ref *data_ref;
2539 	struct btrfs_delayed_ref_root *delayed_refs;
2540 	struct rb_node *node;
2541 	int ret = 0;
2542 
2543 	ret = -ENOENT;
2544 	delayed_refs = &trans->transaction->delayed_refs;
2545 	spin_lock(&delayed_refs->lock);
2546 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2547 	if (!head)
2548 		goto out;
2549 
2550 	if (!mutex_trylock(&head->mutex)) {
2551 		atomic_inc(&head->node.refs);
2552 		spin_unlock(&delayed_refs->lock);
2553 
2554 		btrfs_release_path(path);
2555 
2556 		/*
2557 		 * Mutex was contended, block until it's released and let
2558 		 * caller try again
2559 		 */
2560 		mutex_lock(&head->mutex);
2561 		mutex_unlock(&head->mutex);
2562 		btrfs_put_delayed_ref(&head->node);
2563 		return -EAGAIN;
2564 	}
2565 
2566 	node = rb_prev(&head->node.rb_node);
2567 	if (!node)
2568 		goto out_unlock;
2569 
2570 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2571 
2572 	if (ref->bytenr != bytenr)
2573 		goto out_unlock;
2574 
2575 	ret = 1;
2576 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2577 		goto out_unlock;
2578 
2579 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2580 
2581 	node = rb_prev(node);
2582 	if (node) {
2583 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2584 		if (ref->bytenr == bytenr)
2585 			goto out_unlock;
2586 	}
2587 
2588 	if (data_ref->root != root->root_key.objectid ||
2589 	    data_ref->objectid != objectid || data_ref->offset != offset)
2590 		goto out_unlock;
2591 
2592 	ret = 0;
2593 out_unlock:
2594 	mutex_unlock(&head->mutex);
2595 out:
2596 	spin_unlock(&delayed_refs->lock);
2597 	return ret;
2598 }
2599 
2600 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2601 					struct btrfs_root *root,
2602 					struct btrfs_path *path,
2603 					u64 objectid, u64 offset, u64 bytenr)
2604 {
2605 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2606 	struct extent_buffer *leaf;
2607 	struct btrfs_extent_data_ref *ref;
2608 	struct btrfs_extent_inline_ref *iref;
2609 	struct btrfs_extent_item *ei;
2610 	struct btrfs_key key;
2611 	u32 item_size;
2612 	int ret;
2613 
2614 	key.objectid = bytenr;
2615 	key.offset = (u64)-1;
2616 	key.type = BTRFS_EXTENT_ITEM_KEY;
2617 
2618 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2619 	if (ret < 0)
2620 		goto out;
2621 	BUG_ON(ret == 0); /* Corruption */
2622 
2623 	ret = -ENOENT;
2624 	if (path->slots[0] == 0)
2625 		goto out;
2626 
2627 	path->slots[0]--;
2628 	leaf = path->nodes[0];
2629 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2630 
2631 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2632 		goto out;
2633 
2634 	ret = 1;
2635 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2636 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2637 	if (item_size < sizeof(*ei)) {
2638 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2639 		goto out;
2640 	}
2641 #endif
2642 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2643 
2644 	if (item_size != sizeof(*ei) +
2645 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2646 		goto out;
2647 
2648 	if (btrfs_extent_generation(leaf, ei) <=
2649 	    btrfs_root_last_snapshot(&root->root_item))
2650 		goto out;
2651 
2652 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2653 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2654 	    BTRFS_EXTENT_DATA_REF_KEY)
2655 		goto out;
2656 
2657 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2658 	if (btrfs_extent_refs(leaf, ei) !=
2659 	    btrfs_extent_data_ref_count(leaf, ref) ||
2660 	    btrfs_extent_data_ref_root(leaf, ref) !=
2661 	    root->root_key.objectid ||
2662 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2663 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2664 		goto out;
2665 
2666 	ret = 0;
2667 out:
2668 	return ret;
2669 }
2670 
2671 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2672 			  struct btrfs_root *root,
2673 			  u64 objectid, u64 offset, u64 bytenr)
2674 {
2675 	struct btrfs_path *path;
2676 	int ret;
2677 	int ret2;
2678 
2679 	path = btrfs_alloc_path();
2680 	if (!path)
2681 		return -ENOENT;
2682 
2683 	do {
2684 		ret = check_committed_ref(trans, root, path, objectid,
2685 					  offset, bytenr);
2686 		if (ret && ret != -ENOENT)
2687 			goto out;
2688 
2689 		ret2 = check_delayed_ref(trans, root, path, objectid,
2690 					 offset, bytenr);
2691 	} while (ret2 == -EAGAIN);
2692 
2693 	if (ret2 && ret2 != -ENOENT) {
2694 		ret = ret2;
2695 		goto out;
2696 	}
2697 
2698 	if (ret != -ENOENT || ret2 != -ENOENT)
2699 		ret = 0;
2700 out:
2701 	btrfs_free_path(path);
2702 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2703 		WARN_ON(ret > 0);
2704 	return ret;
2705 }
2706 
2707 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2708 			   struct btrfs_root *root,
2709 			   struct extent_buffer *buf,
2710 			   int full_backref, int inc, int for_cow)
2711 {
2712 	u64 bytenr;
2713 	u64 num_bytes;
2714 	u64 parent;
2715 	u64 ref_root;
2716 	u32 nritems;
2717 	struct btrfs_key key;
2718 	struct btrfs_file_extent_item *fi;
2719 	int i;
2720 	int level;
2721 	int ret = 0;
2722 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2723 			    u64, u64, u64, u64, u64, u64, int);
2724 
2725 	ref_root = btrfs_header_owner(buf);
2726 	nritems = btrfs_header_nritems(buf);
2727 	level = btrfs_header_level(buf);
2728 
2729 	if (!root->ref_cows && level == 0)
2730 		return 0;
2731 
2732 	if (inc)
2733 		process_func = btrfs_inc_extent_ref;
2734 	else
2735 		process_func = btrfs_free_extent;
2736 
2737 	if (full_backref)
2738 		parent = buf->start;
2739 	else
2740 		parent = 0;
2741 
2742 	for (i = 0; i < nritems; i++) {
2743 		if (level == 0) {
2744 			btrfs_item_key_to_cpu(buf, &key, i);
2745 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2746 				continue;
2747 			fi = btrfs_item_ptr(buf, i,
2748 					    struct btrfs_file_extent_item);
2749 			if (btrfs_file_extent_type(buf, fi) ==
2750 			    BTRFS_FILE_EXTENT_INLINE)
2751 				continue;
2752 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2753 			if (bytenr == 0)
2754 				continue;
2755 
2756 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2757 			key.offset -= btrfs_file_extent_offset(buf, fi);
2758 			ret = process_func(trans, root, bytenr, num_bytes,
2759 					   parent, ref_root, key.objectid,
2760 					   key.offset, for_cow);
2761 			if (ret)
2762 				goto fail;
2763 		} else {
2764 			bytenr = btrfs_node_blockptr(buf, i);
2765 			num_bytes = btrfs_level_size(root, level - 1);
2766 			ret = process_func(trans, root, bytenr, num_bytes,
2767 					   parent, ref_root, level - 1, 0,
2768 					   for_cow);
2769 			if (ret)
2770 				goto fail;
2771 		}
2772 	}
2773 	return 0;
2774 fail:
2775 	return ret;
2776 }
2777 
2778 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2779 		  struct extent_buffer *buf, int full_backref, int for_cow)
2780 {
2781 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2782 }
2783 
2784 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2785 		  struct extent_buffer *buf, int full_backref, int for_cow)
2786 {
2787 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2788 }
2789 
2790 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2791 				 struct btrfs_root *root,
2792 				 struct btrfs_path *path,
2793 				 struct btrfs_block_group_cache *cache)
2794 {
2795 	int ret;
2796 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2797 	unsigned long bi;
2798 	struct extent_buffer *leaf;
2799 
2800 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2801 	if (ret < 0)
2802 		goto fail;
2803 	BUG_ON(ret); /* Corruption */
2804 
2805 	leaf = path->nodes[0];
2806 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2807 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2808 	btrfs_mark_buffer_dirty(leaf);
2809 	btrfs_release_path(path);
2810 fail:
2811 	if (ret) {
2812 		btrfs_abort_transaction(trans, root, ret);
2813 		return ret;
2814 	}
2815 	return 0;
2816 
2817 }
2818 
2819 static struct btrfs_block_group_cache *
2820 next_block_group(struct btrfs_root *root,
2821 		 struct btrfs_block_group_cache *cache)
2822 {
2823 	struct rb_node *node;
2824 	spin_lock(&root->fs_info->block_group_cache_lock);
2825 	node = rb_next(&cache->cache_node);
2826 	btrfs_put_block_group(cache);
2827 	if (node) {
2828 		cache = rb_entry(node, struct btrfs_block_group_cache,
2829 				 cache_node);
2830 		btrfs_get_block_group(cache);
2831 	} else
2832 		cache = NULL;
2833 	spin_unlock(&root->fs_info->block_group_cache_lock);
2834 	return cache;
2835 }
2836 
2837 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2838 			    struct btrfs_trans_handle *trans,
2839 			    struct btrfs_path *path)
2840 {
2841 	struct btrfs_root *root = block_group->fs_info->tree_root;
2842 	struct inode *inode = NULL;
2843 	u64 alloc_hint = 0;
2844 	int dcs = BTRFS_DC_ERROR;
2845 	int num_pages = 0;
2846 	int retries = 0;
2847 	int ret = 0;
2848 
2849 	/*
2850 	 * If this block group is smaller than 100 megs don't bother caching the
2851 	 * block group.
2852 	 */
2853 	if (block_group->key.offset < (100 * 1024 * 1024)) {
2854 		spin_lock(&block_group->lock);
2855 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2856 		spin_unlock(&block_group->lock);
2857 		return 0;
2858 	}
2859 
2860 again:
2861 	inode = lookup_free_space_inode(root, block_group, path);
2862 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2863 		ret = PTR_ERR(inode);
2864 		btrfs_release_path(path);
2865 		goto out;
2866 	}
2867 
2868 	if (IS_ERR(inode)) {
2869 		BUG_ON(retries);
2870 		retries++;
2871 
2872 		if (block_group->ro)
2873 			goto out_free;
2874 
2875 		ret = create_free_space_inode(root, trans, block_group, path);
2876 		if (ret)
2877 			goto out_free;
2878 		goto again;
2879 	}
2880 
2881 	/* We've already setup this transaction, go ahead and exit */
2882 	if (block_group->cache_generation == trans->transid &&
2883 	    i_size_read(inode)) {
2884 		dcs = BTRFS_DC_SETUP;
2885 		goto out_put;
2886 	}
2887 
2888 	/*
2889 	 * We want to set the generation to 0, that way if anything goes wrong
2890 	 * from here on out we know not to trust this cache when we load up next
2891 	 * time.
2892 	 */
2893 	BTRFS_I(inode)->generation = 0;
2894 	ret = btrfs_update_inode(trans, root, inode);
2895 	WARN_ON(ret);
2896 
2897 	if (i_size_read(inode) > 0) {
2898 		ret = btrfs_truncate_free_space_cache(root, trans, path,
2899 						      inode);
2900 		if (ret)
2901 			goto out_put;
2902 	}
2903 
2904 	spin_lock(&block_group->lock);
2905 	if (block_group->cached != BTRFS_CACHE_FINISHED) {
2906 		/* We're not cached, don't bother trying to write stuff out */
2907 		dcs = BTRFS_DC_WRITTEN;
2908 		spin_unlock(&block_group->lock);
2909 		goto out_put;
2910 	}
2911 	spin_unlock(&block_group->lock);
2912 
2913 	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2914 	if (!num_pages)
2915 		num_pages = 1;
2916 
2917 	/*
2918 	 * Just to make absolutely sure we have enough space, we're going to
2919 	 * preallocate 12 pages worth of space for each block group.  In
2920 	 * practice we ought to use at most 8, but we need extra space so we can
2921 	 * add our header and have a terminator between the extents and the
2922 	 * bitmaps.
2923 	 */
2924 	num_pages *= 16;
2925 	num_pages *= PAGE_CACHE_SIZE;
2926 
2927 	ret = btrfs_check_data_free_space(inode, num_pages);
2928 	if (ret)
2929 		goto out_put;
2930 
2931 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2932 					      num_pages, num_pages,
2933 					      &alloc_hint);
2934 	if (!ret)
2935 		dcs = BTRFS_DC_SETUP;
2936 	btrfs_free_reserved_data_space(inode, num_pages);
2937 
2938 out_put:
2939 	iput(inode);
2940 out_free:
2941 	btrfs_release_path(path);
2942 out:
2943 	spin_lock(&block_group->lock);
2944 	if (!ret && dcs == BTRFS_DC_SETUP)
2945 		block_group->cache_generation = trans->transid;
2946 	block_group->disk_cache_state = dcs;
2947 	spin_unlock(&block_group->lock);
2948 
2949 	return ret;
2950 }
2951 
2952 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2953 				   struct btrfs_root *root)
2954 {
2955 	struct btrfs_block_group_cache *cache;
2956 	int err = 0;
2957 	struct btrfs_path *path;
2958 	u64 last = 0;
2959 
2960 	path = btrfs_alloc_path();
2961 	if (!path)
2962 		return -ENOMEM;
2963 
2964 again:
2965 	while (1) {
2966 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2967 		while (cache) {
2968 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2969 				break;
2970 			cache = next_block_group(root, cache);
2971 		}
2972 		if (!cache) {
2973 			if (last == 0)
2974 				break;
2975 			last = 0;
2976 			continue;
2977 		}
2978 		err = cache_save_setup(cache, trans, path);
2979 		last = cache->key.objectid + cache->key.offset;
2980 		btrfs_put_block_group(cache);
2981 	}
2982 
2983 	while (1) {
2984 		if (last == 0) {
2985 			err = btrfs_run_delayed_refs(trans, root,
2986 						     (unsigned long)-1);
2987 			if (err) /* File system offline */
2988 				goto out;
2989 		}
2990 
2991 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2992 		while (cache) {
2993 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2994 				btrfs_put_block_group(cache);
2995 				goto again;
2996 			}
2997 
2998 			if (cache->dirty)
2999 				break;
3000 			cache = next_block_group(root, cache);
3001 		}
3002 		if (!cache) {
3003 			if (last == 0)
3004 				break;
3005 			last = 0;
3006 			continue;
3007 		}
3008 
3009 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
3010 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3011 		cache->dirty = 0;
3012 		last = cache->key.objectid + cache->key.offset;
3013 
3014 		err = write_one_cache_group(trans, root, path, cache);
3015 		if (err) /* File system offline */
3016 			goto out;
3017 
3018 		btrfs_put_block_group(cache);
3019 	}
3020 
3021 	while (1) {
3022 		/*
3023 		 * I don't think this is needed since we're just marking our
3024 		 * preallocated extent as written, but just in case it can't
3025 		 * hurt.
3026 		 */
3027 		if (last == 0) {
3028 			err = btrfs_run_delayed_refs(trans, root,
3029 						     (unsigned long)-1);
3030 			if (err) /* File system offline */
3031 				goto out;
3032 		}
3033 
3034 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3035 		while (cache) {
3036 			/*
3037 			 * Really this shouldn't happen, but it could if we
3038 			 * couldn't write the entire preallocated extent and
3039 			 * splitting the extent resulted in a new block.
3040 			 */
3041 			if (cache->dirty) {
3042 				btrfs_put_block_group(cache);
3043 				goto again;
3044 			}
3045 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3046 				break;
3047 			cache = next_block_group(root, cache);
3048 		}
3049 		if (!cache) {
3050 			if (last == 0)
3051 				break;
3052 			last = 0;
3053 			continue;
3054 		}
3055 
3056 		err = btrfs_write_out_cache(root, trans, cache, path);
3057 
3058 		/*
3059 		 * If we didn't have an error then the cache state is still
3060 		 * NEED_WRITE, so we can set it to WRITTEN.
3061 		 */
3062 		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3063 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
3064 		last = cache->key.objectid + cache->key.offset;
3065 		btrfs_put_block_group(cache);
3066 	}
3067 out:
3068 
3069 	btrfs_free_path(path);
3070 	return err;
3071 }
3072 
3073 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3074 {
3075 	struct btrfs_block_group_cache *block_group;
3076 	int readonly = 0;
3077 
3078 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3079 	if (!block_group || block_group->ro)
3080 		readonly = 1;
3081 	if (block_group)
3082 		btrfs_put_block_group(block_group);
3083 	return readonly;
3084 }
3085 
3086 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3087 			     u64 total_bytes, u64 bytes_used,
3088 			     struct btrfs_space_info **space_info)
3089 {
3090 	struct btrfs_space_info *found;
3091 	int i;
3092 	int factor;
3093 
3094 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3095 		     BTRFS_BLOCK_GROUP_RAID10))
3096 		factor = 2;
3097 	else
3098 		factor = 1;
3099 
3100 	found = __find_space_info(info, flags);
3101 	if (found) {
3102 		spin_lock(&found->lock);
3103 		found->total_bytes += total_bytes;
3104 		found->disk_total += total_bytes * factor;
3105 		found->bytes_used += bytes_used;
3106 		found->disk_used += bytes_used * factor;
3107 		found->full = 0;
3108 		spin_unlock(&found->lock);
3109 		*space_info = found;
3110 		return 0;
3111 	}
3112 	found = kzalloc(sizeof(*found), GFP_NOFS);
3113 	if (!found)
3114 		return -ENOMEM;
3115 
3116 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3117 		INIT_LIST_HEAD(&found->block_groups[i]);
3118 	init_rwsem(&found->groups_sem);
3119 	spin_lock_init(&found->lock);
3120 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3121 	found->total_bytes = total_bytes;
3122 	found->disk_total = total_bytes * factor;
3123 	found->bytes_used = bytes_used;
3124 	found->disk_used = bytes_used * factor;
3125 	found->bytes_pinned = 0;
3126 	found->bytes_reserved = 0;
3127 	found->bytes_readonly = 0;
3128 	found->bytes_may_use = 0;
3129 	found->full = 0;
3130 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3131 	found->chunk_alloc = 0;
3132 	found->flush = 0;
3133 	init_waitqueue_head(&found->wait);
3134 	*space_info = found;
3135 	list_add_rcu(&found->list, &info->space_info);
3136 	return 0;
3137 }
3138 
3139 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3140 {
3141 	u64 extra_flags = chunk_to_extended(flags) &
3142 				BTRFS_EXTENDED_PROFILE_MASK;
3143 
3144 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3145 		fs_info->avail_data_alloc_bits |= extra_flags;
3146 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
3147 		fs_info->avail_metadata_alloc_bits |= extra_flags;
3148 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3149 		fs_info->avail_system_alloc_bits |= extra_flags;
3150 }
3151 
3152 /*
3153  * returns target flags in extended format or 0 if restripe for this
3154  * chunk_type is not in progress
3155  */
3156 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3157 {
3158 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3159 	u64 target = 0;
3160 
3161 	BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) &&
3162 	       !spin_is_locked(&fs_info->balance_lock));
3163 
3164 	if (!bctl)
3165 		return 0;
3166 
3167 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
3168 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3169 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3170 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3171 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3172 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3173 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3174 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3175 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3176 	}
3177 
3178 	return target;
3179 }
3180 
3181 /*
3182  * @flags: available profiles in extended format (see ctree.h)
3183  *
3184  * Returns reduced profile in chunk format.  If profile changing is in
3185  * progress (either running or paused) picks the target profile (if it's
3186  * already available), otherwise falls back to plain reducing.
3187  */
3188 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3189 {
3190 	/*
3191 	 * we add in the count of missing devices because we want
3192 	 * to make sure that any RAID levels on a degraded FS
3193 	 * continue to be honored.
3194 	 */
3195 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3196 		root->fs_info->fs_devices->missing_devices;
3197 	u64 target;
3198 
3199 	/*
3200 	 * see if restripe for this chunk_type is in progress, if so
3201 	 * try to reduce to the target profile
3202 	 */
3203 	spin_lock(&root->fs_info->balance_lock);
3204 	target = get_restripe_target(root->fs_info, flags);
3205 	if (target) {
3206 		/* pick target profile only if it's already available */
3207 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3208 			spin_unlock(&root->fs_info->balance_lock);
3209 			return extended_to_chunk(target);
3210 		}
3211 	}
3212 	spin_unlock(&root->fs_info->balance_lock);
3213 
3214 	if (num_devices == 1)
3215 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3216 	if (num_devices < 4)
3217 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3218 
3219 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3220 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3221 		      BTRFS_BLOCK_GROUP_RAID10))) {
3222 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
3223 	}
3224 
3225 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3226 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3227 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3228 	}
3229 
3230 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3231 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3232 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
3233 	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
3234 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3235 	}
3236 
3237 	return extended_to_chunk(flags);
3238 }
3239 
3240 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3241 {
3242 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3243 		flags |= root->fs_info->avail_data_alloc_bits;
3244 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3245 		flags |= root->fs_info->avail_system_alloc_bits;
3246 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3247 		flags |= root->fs_info->avail_metadata_alloc_bits;
3248 
3249 	return btrfs_reduce_alloc_profile(root, flags);
3250 }
3251 
3252 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3253 {
3254 	u64 flags;
3255 
3256 	if (data)
3257 		flags = BTRFS_BLOCK_GROUP_DATA;
3258 	else if (root == root->fs_info->chunk_root)
3259 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3260 	else
3261 		flags = BTRFS_BLOCK_GROUP_METADATA;
3262 
3263 	return get_alloc_profile(root, flags);
3264 }
3265 
3266 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3267 {
3268 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3269 						       BTRFS_BLOCK_GROUP_DATA);
3270 }
3271 
3272 /*
3273  * This will check the space that the inode allocates from to make sure we have
3274  * enough space for bytes.
3275  */
3276 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3277 {
3278 	struct btrfs_space_info *data_sinfo;
3279 	struct btrfs_root *root = BTRFS_I(inode)->root;
3280 	u64 used;
3281 	int ret = 0, committed = 0, alloc_chunk = 1;
3282 
3283 	/* make sure bytes are sectorsize aligned */
3284 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3285 
3286 	if (root == root->fs_info->tree_root ||
3287 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3288 		alloc_chunk = 0;
3289 		committed = 1;
3290 	}
3291 
3292 	data_sinfo = BTRFS_I(inode)->space_info;
3293 	if (!data_sinfo)
3294 		goto alloc;
3295 
3296 again:
3297 	/* make sure we have enough space to handle the data first */
3298 	spin_lock(&data_sinfo->lock);
3299 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3300 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3301 		data_sinfo->bytes_may_use;
3302 
3303 	if (used + bytes > data_sinfo->total_bytes) {
3304 		struct btrfs_trans_handle *trans;
3305 
3306 		/*
3307 		 * if we don't have enough free bytes in this space then we need
3308 		 * to alloc a new chunk.
3309 		 */
3310 		if (!data_sinfo->full && alloc_chunk) {
3311 			u64 alloc_target;
3312 
3313 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3314 			spin_unlock(&data_sinfo->lock);
3315 alloc:
3316 			alloc_target = btrfs_get_alloc_profile(root, 1);
3317 			trans = btrfs_join_transaction(root);
3318 			if (IS_ERR(trans))
3319 				return PTR_ERR(trans);
3320 
3321 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3322 					     bytes + 2 * 1024 * 1024,
3323 					     alloc_target,
3324 					     CHUNK_ALLOC_NO_FORCE);
3325 			btrfs_end_transaction(trans, root);
3326 			if (ret < 0) {
3327 				if (ret != -ENOSPC)
3328 					return ret;
3329 				else
3330 					goto commit_trans;
3331 			}
3332 
3333 			if (!data_sinfo) {
3334 				btrfs_set_inode_space_info(root, inode);
3335 				data_sinfo = BTRFS_I(inode)->space_info;
3336 			}
3337 			goto again;
3338 		}
3339 
3340 		/*
3341 		 * If we have less pinned bytes than we want to allocate then
3342 		 * don't bother committing the transaction, it won't help us.
3343 		 */
3344 		if (data_sinfo->bytes_pinned < bytes)
3345 			committed = 1;
3346 		spin_unlock(&data_sinfo->lock);
3347 
3348 		/* commit the current transaction and try again */
3349 commit_trans:
3350 		if (!committed &&
3351 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3352 			committed = 1;
3353 			trans = btrfs_join_transaction(root);
3354 			if (IS_ERR(trans))
3355 				return PTR_ERR(trans);
3356 			ret = btrfs_commit_transaction(trans, root);
3357 			if (ret)
3358 				return ret;
3359 			goto again;
3360 		}
3361 
3362 		return -ENOSPC;
3363 	}
3364 	data_sinfo->bytes_may_use += bytes;
3365 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3366 				      data_sinfo->flags, bytes, 1);
3367 	spin_unlock(&data_sinfo->lock);
3368 
3369 	return 0;
3370 }
3371 
3372 /*
3373  * Called if we need to clear a data reservation for this inode.
3374  */
3375 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3376 {
3377 	struct btrfs_root *root = BTRFS_I(inode)->root;
3378 	struct btrfs_space_info *data_sinfo;
3379 
3380 	/* make sure bytes are sectorsize aligned */
3381 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3382 
3383 	data_sinfo = BTRFS_I(inode)->space_info;
3384 	spin_lock(&data_sinfo->lock);
3385 	data_sinfo->bytes_may_use -= bytes;
3386 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3387 				      data_sinfo->flags, bytes, 0);
3388 	spin_unlock(&data_sinfo->lock);
3389 }
3390 
3391 static void force_metadata_allocation(struct btrfs_fs_info *info)
3392 {
3393 	struct list_head *head = &info->space_info;
3394 	struct btrfs_space_info *found;
3395 
3396 	rcu_read_lock();
3397 	list_for_each_entry_rcu(found, head, list) {
3398 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3399 			found->force_alloc = CHUNK_ALLOC_FORCE;
3400 	}
3401 	rcu_read_unlock();
3402 }
3403 
3404 static int should_alloc_chunk(struct btrfs_root *root,
3405 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
3406 			      int force)
3407 {
3408 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3409 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3410 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3411 	u64 thresh;
3412 
3413 	if (force == CHUNK_ALLOC_FORCE)
3414 		return 1;
3415 
3416 	/*
3417 	 * We need to take into account the global rsv because for all intents
3418 	 * and purposes it's used space.  Don't worry about locking the
3419 	 * global_rsv, it doesn't change except when the transaction commits.
3420 	 */
3421 	num_allocated += global_rsv->size;
3422 
3423 	/*
3424 	 * in limited mode, we want to have some free space up to
3425 	 * about 1% of the FS size.
3426 	 */
3427 	if (force == CHUNK_ALLOC_LIMITED) {
3428 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3429 		thresh = max_t(u64, 64 * 1024 * 1024,
3430 			       div_factor_fine(thresh, 1));
3431 
3432 		if (num_bytes - num_allocated < thresh)
3433 			return 1;
3434 	}
3435 	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3436 
3437 	/* 256MB or 2% of the FS */
3438 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3439 	/* system chunks need a much small threshold */
3440 	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3441 		thresh = 32 * 1024 * 1024;
3442 
3443 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3444 		return 0;
3445 	return 1;
3446 }
3447 
3448 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3449 {
3450 	u64 num_dev;
3451 
3452 	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3453 	    type & BTRFS_BLOCK_GROUP_RAID0)
3454 		num_dev = root->fs_info->fs_devices->rw_devices;
3455 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
3456 		num_dev = 2;
3457 	else
3458 		num_dev = 1;	/* DUP or single */
3459 
3460 	/* metadata for updaing devices and chunk tree */
3461 	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3462 }
3463 
3464 static void check_system_chunk(struct btrfs_trans_handle *trans,
3465 			       struct btrfs_root *root, u64 type)
3466 {
3467 	struct btrfs_space_info *info;
3468 	u64 left;
3469 	u64 thresh;
3470 
3471 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3472 	spin_lock(&info->lock);
3473 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3474 		info->bytes_reserved - info->bytes_readonly;
3475 	spin_unlock(&info->lock);
3476 
3477 	thresh = get_system_chunk_thresh(root, type);
3478 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3479 		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3480 		       left, thresh, type);
3481 		dump_space_info(info, 0, 0);
3482 	}
3483 
3484 	if (left < thresh) {
3485 		u64 flags;
3486 
3487 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3488 		btrfs_alloc_chunk(trans, root, flags);
3489 	}
3490 }
3491 
3492 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3493 			  struct btrfs_root *extent_root, u64 alloc_bytes,
3494 			  u64 flags, int force)
3495 {
3496 	struct btrfs_space_info *space_info;
3497 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3498 	int wait_for_alloc = 0;
3499 	int ret = 0;
3500 
3501 	space_info = __find_space_info(extent_root->fs_info, flags);
3502 	if (!space_info) {
3503 		ret = update_space_info(extent_root->fs_info, flags,
3504 					0, 0, &space_info);
3505 		BUG_ON(ret); /* -ENOMEM */
3506 	}
3507 	BUG_ON(!space_info); /* Logic error */
3508 
3509 again:
3510 	spin_lock(&space_info->lock);
3511 	if (force < space_info->force_alloc)
3512 		force = space_info->force_alloc;
3513 	if (space_info->full) {
3514 		spin_unlock(&space_info->lock);
3515 		return 0;
3516 	}
3517 
3518 	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3519 		spin_unlock(&space_info->lock);
3520 		return 0;
3521 	} else if (space_info->chunk_alloc) {
3522 		wait_for_alloc = 1;
3523 	} else {
3524 		space_info->chunk_alloc = 1;
3525 	}
3526 
3527 	spin_unlock(&space_info->lock);
3528 
3529 	mutex_lock(&fs_info->chunk_mutex);
3530 
3531 	/*
3532 	 * The chunk_mutex is held throughout the entirety of a chunk
3533 	 * allocation, so once we've acquired the chunk_mutex we know that the
3534 	 * other guy is done and we need to recheck and see if we should
3535 	 * allocate.
3536 	 */
3537 	if (wait_for_alloc) {
3538 		mutex_unlock(&fs_info->chunk_mutex);
3539 		wait_for_alloc = 0;
3540 		goto again;
3541 	}
3542 
3543 	/*
3544 	 * If we have mixed data/metadata chunks we want to make sure we keep
3545 	 * allocating mixed chunks instead of individual chunks.
3546 	 */
3547 	if (btrfs_mixed_space_info(space_info))
3548 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3549 
3550 	/*
3551 	 * if we're doing a data chunk, go ahead and make sure that
3552 	 * we keep a reasonable number of metadata chunks allocated in the
3553 	 * FS as well.
3554 	 */
3555 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3556 		fs_info->data_chunk_allocations++;
3557 		if (!(fs_info->data_chunk_allocations %
3558 		      fs_info->metadata_ratio))
3559 			force_metadata_allocation(fs_info);
3560 	}
3561 
3562 	/*
3563 	 * Check if we have enough space in SYSTEM chunk because we may need
3564 	 * to update devices.
3565 	 */
3566 	check_system_chunk(trans, extent_root, flags);
3567 
3568 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3569 	if (ret < 0 && ret != -ENOSPC)
3570 		goto out;
3571 
3572 	spin_lock(&space_info->lock);
3573 	if (ret)
3574 		space_info->full = 1;
3575 	else
3576 		ret = 1;
3577 
3578 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3579 	space_info->chunk_alloc = 0;
3580 	spin_unlock(&space_info->lock);
3581 out:
3582 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
3583 	return ret;
3584 }
3585 
3586 /*
3587  * shrink metadata reservation for delalloc
3588  */
3589 static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3590 			   bool wait_ordered)
3591 {
3592 	struct btrfs_block_rsv *block_rsv;
3593 	struct btrfs_space_info *space_info;
3594 	struct btrfs_trans_handle *trans;
3595 	u64 reserved;
3596 	u64 max_reclaim;
3597 	u64 reclaimed = 0;
3598 	long time_left;
3599 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3600 	int loops = 0;
3601 	unsigned long progress;
3602 
3603 	trans = (struct btrfs_trans_handle *)current->journal_info;
3604 	block_rsv = &root->fs_info->delalloc_block_rsv;
3605 	space_info = block_rsv->space_info;
3606 
3607 	smp_mb();
3608 	reserved = space_info->bytes_may_use;
3609 	progress = space_info->reservation_progress;
3610 
3611 	if (reserved == 0)
3612 		return 0;
3613 
3614 	smp_mb();
3615 	if (root->fs_info->delalloc_bytes == 0) {
3616 		if (trans)
3617 			return 0;
3618 		btrfs_wait_ordered_extents(root, 0, 0);
3619 		return 0;
3620 	}
3621 
3622 	max_reclaim = min(reserved, to_reclaim);
3623 	nr_pages = max_t(unsigned long, nr_pages,
3624 			 max_reclaim >> PAGE_CACHE_SHIFT);
3625 	while (loops < 1024) {
3626 		/* have the flusher threads jump in and do some IO */
3627 		smp_mb();
3628 		nr_pages = min_t(unsigned long, nr_pages,
3629 		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3630 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3631 						WB_REASON_FS_FREE_SPACE);
3632 
3633 		spin_lock(&space_info->lock);
3634 		if (reserved > space_info->bytes_may_use)
3635 			reclaimed += reserved - space_info->bytes_may_use;
3636 		reserved = space_info->bytes_may_use;
3637 		spin_unlock(&space_info->lock);
3638 
3639 		loops++;
3640 
3641 		if (reserved == 0 || reclaimed >= max_reclaim)
3642 			break;
3643 
3644 		if (trans && trans->transaction->blocked)
3645 			return -EAGAIN;
3646 
3647 		if (wait_ordered && !trans) {
3648 			btrfs_wait_ordered_extents(root, 0, 0);
3649 		} else {
3650 			time_left = schedule_timeout_interruptible(1);
3651 
3652 			/* We were interrupted, exit */
3653 			if (time_left)
3654 				break;
3655 		}
3656 
3657 		/* we've kicked the IO a few times, if anything has been freed,
3658 		 * exit.  There is no sense in looping here for a long time
3659 		 * when we really need to commit the transaction, or there are
3660 		 * just too many writers without enough free space
3661 		 */
3662 
3663 		if (loops > 3) {
3664 			smp_mb();
3665 			if (progress != space_info->reservation_progress)
3666 				break;
3667 		}
3668 
3669 	}
3670 
3671 	return reclaimed >= to_reclaim;
3672 }
3673 
3674 /**
3675  * maybe_commit_transaction - possibly commit the transaction if its ok to
3676  * @root - the root we're allocating for
3677  * @bytes - the number of bytes we want to reserve
3678  * @force - force the commit
3679  *
3680  * This will check to make sure that committing the transaction will actually
3681  * get us somewhere and then commit the transaction if it does.  Otherwise it
3682  * will return -ENOSPC.
3683  */
3684 static int may_commit_transaction(struct btrfs_root *root,
3685 				  struct btrfs_space_info *space_info,
3686 				  u64 bytes, int force)
3687 {
3688 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3689 	struct btrfs_trans_handle *trans;
3690 
3691 	trans = (struct btrfs_trans_handle *)current->journal_info;
3692 	if (trans)
3693 		return -EAGAIN;
3694 
3695 	if (force)
3696 		goto commit;
3697 
3698 	/* See if there is enough pinned space to make this reservation */
3699 	spin_lock(&space_info->lock);
3700 	if (space_info->bytes_pinned >= bytes) {
3701 		spin_unlock(&space_info->lock);
3702 		goto commit;
3703 	}
3704 	spin_unlock(&space_info->lock);
3705 
3706 	/*
3707 	 * See if there is some space in the delayed insertion reservation for
3708 	 * this reservation.
3709 	 */
3710 	if (space_info != delayed_rsv->space_info)
3711 		return -ENOSPC;
3712 
3713 	spin_lock(&space_info->lock);
3714 	spin_lock(&delayed_rsv->lock);
3715 	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3716 		spin_unlock(&delayed_rsv->lock);
3717 		spin_unlock(&space_info->lock);
3718 		return -ENOSPC;
3719 	}
3720 	spin_unlock(&delayed_rsv->lock);
3721 	spin_unlock(&space_info->lock);
3722 
3723 commit:
3724 	trans = btrfs_join_transaction(root);
3725 	if (IS_ERR(trans))
3726 		return -ENOSPC;
3727 
3728 	return btrfs_commit_transaction(trans, root);
3729 }
3730 
3731 /**
3732  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3733  * @root - the root we're allocating for
3734  * @block_rsv - the block_rsv we're allocating for
3735  * @orig_bytes - the number of bytes we want
3736  * @flush - wether or not we can flush to make our reservation
3737  *
3738  * This will reserve orgi_bytes number of bytes from the space info associated
3739  * with the block_rsv.  If there is not enough space it will make an attempt to
3740  * flush out space to make room.  It will do this by flushing delalloc if
3741  * possible or committing the transaction.  If flush is 0 then no attempts to
3742  * regain reservations will be made and this will fail if there is not enough
3743  * space already.
3744  */
3745 static int reserve_metadata_bytes(struct btrfs_root *root,
3746 				  struct btrfs_block_rsv *block_rsv,
3747 				  u64 orig_bytes, int flush)
3748 {
3749 	struct btrfs_space_info *space_info = block_rsv->space_info;
3750 	u64 used;
3751 	u64 num_bytes = orig_bytes;
3752 	int retries = 0;
3753 	int ret = 0;
3754 	bool committed = false;
3755 	bool flushing = false;
3756 	bool wait_ordered = false;
3757 
3758 again:
3759 	ret = 0;
3760 	spin_lock(&space_info->lock);
3761 	/*
3762 	 * We only want to wait if somebody other than us is flushing and we are
3763 	 * actually alloed to flush.
3764 	 */
3765 	while (flush && !flushing && space_info->flush) {
3766 		spin_unlock(&space_info->lock);
3767 		/*
3768 		 * If we have a trans handle we can't wait because the flusher
3769 		 * may have to commit the transaction, which would mean we would
3770 		 * deadlock since we are waiting for the flusher to finish, but
3771 		 * hold the current transaction open.
3772 		 */
3773 		if (current->journal_info)
3774 			return -EAGAIN;
3775 		ret = wait_event_interruptible(space_info->wait,
3776 					       !space_info->flush);
3777 		/* Must have been interrupted, return */
3778 		if (ret) {
3779 			printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__);
3780 			return -EINTR;
3781 		}
3782 
3783 		spin_lock(&space_info->lock);
3784 	}
3785 
3786 	ret = -ENOSPC;
3787 	used = space_info->bytes_used + space_info->bytes_reserved +
3788 		space_info->bytes_pinned + space_info->bytes_readonly +
3789 		space_info->bytes_may_use;
3790 
3791 	/*
3792 	 * The idea here is that we've not already over-reserved the block group
3793 	 * then we can go ahead and save our reservation first and then start
3794 	 * flushing if we need to.  Otherwise if we've already overcommitted
3795 	 * lets start flushing stuff first and then come back and try to make
3796 	 * our reservation.
3797 	 */
3798 	if (used <= space_info->total_bytes) {
3799 		if (used + orig_bytes <= space_info->total_bytes) {
3800 			space_info->bytes_may_use += orig_bytes;
3801 			trace_btrfs_space_reservation(root->fs_info,
3802 				"space_info", space_info->flags, orig_bytes, 1);
3803 			ret = 0;
3804 		} else {
3805 			/*
3806 			 * Ok set num_bytes to orig_bytes since we aren't
3807 			 * overocmmitted, this way we only try and reclaim what
3808 			 * we need.
3809 			 */
3810 			num_bytes = orig_bytes;
3811 		}
3812 	} else {
3813 		/*
3814 		 * Ok we're over committed, set num_bytes to the overcommitted
3815 		 * amount plus the amount of bytes that we need for this
3816 		 * reservation.
3817 		 */
3818 		wait_ordered = true;
3819 		num_bytes = used - space_info->total_bytes +
3820 			(orig_bytes * (retries + 1));
3821 	}
3822 
3823 	if (ret) {
3824 		u64 profile = btrfs_get_alloc_profile(root, 0);
3825 		u64 avail;
3826 
3827 		/*
3828 		 * If we have a lot of space that's pinned, don't bother doing
3829 		 * the overcommit dance yet and just commit the transaction.
3830 		 */
3831 		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3832 		do_div(avail, 10);
3833 		if (space_info->bytes_pinned >= avail && flush && !committed) {
3834 			space_info->flush = 1;
3835 			flushing = true;
3836 			spin_unlock(&space_info->lock);
3837 			ret = may_commit_transaction(root, space_info,
3838 						     orig_bytes, 1);
3839 			if (ret)
3840 				goto out;
3841 			committed = true;
3842 			goto again;
3843 		}
3844 
3845 		spin_lock(&root->fs_info->free_chunk_lock);
3846 		avail = root->fs_info->free_chunk_space;
3847 
3848 		/*
3849 		 * If we have dup, raid1 or raid10 then only half of the free
3850 		 * space is actually useable.
3851 		 */
3852 		if (profile & (BTRFS_BLOCK_GROUP_DUP |
3853 			       BTRFS_BLOCK_GROUP_RAID1 |
3854 			       BTRFS_BLOCK_GROUP_RAID10))
3855 			avail >>= 1;
3856 
3857 		/*
3858 		 * If we aren't flushing don't let us overcommit too much, say
3859 		 * 1/8th of the space.  If we can flush, let it overcommit up to
3860 		 * 1/2 of the space.
3861 		 */
3862 		if (flush)
3863 			avail >>= 3;
3864 		else
3865 			avail >>= 1;
3866 		 spin_unlock(&root->fs_info->free_chunk_lock);
3867 
3868 		if (used + num_bytes < space_info->total_bytes + avail) {
3869 			space_info->bytes_may_use += orig_bytes;
3870 			trace_btrfs_space_reservation(root->fs_info,
3871 				"space_info", space_info->flags, orig_bytes, 1);
3872 			ret = 0;
3873 		} else {
3874 			wait_ordered = true;
3875 		}
3876 	}
3877 
3878 	/*
3879 	 * Couldn't make our reservation, save our place so while we're trying
3880 	 * to reclaim space we can actually use it instead of somebody else
3881 	 * stealing it from us.
3882 	 */
3883 	if (ret && flush) {
3884 		flushing = true;
3885 		space_info->flush = 1;
3886 	}
3887 
3888 	spin_unlock(&space_info->lock);
3889 
3890 	if (!ret || !flush)
3891 		goto out;
3892 
3893 	/*
3894 	 * We do synchronous shrinking since we don't actually unreserve
3895 	 * metadata until after the IO is completed.
3896 	 */
3897 	ret = shrink_delalloc(root, num_bytes, wait_ordered);
3898 	if (ret < 0)
3899 		goto out;
3900 
3901 	ret = 0;
3902 
3903 	/*
3904 	 * So if we were overcommitted it's possible that somebody else flushed
3905 	 * out enough space and we simply didn't have enough space to reclaim,
3906 	 * so go back around and try again.
3907 	 */
3908 	if (retries < 2) {
3909 		wait_ordered = true;
3910 		retries++;
3911 		goto again;
3912 	}
3913 
3914 	ret = -ENOSPC;
3915 	if (committed)
3916 		goto out;
3917 
3918 	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3919 	if (!ret) {
3920 		committed = true;
3921 		goto again;
3922 	}
3923 
3924 out:
3925 	if (flushing) {
3926 		spin_lock(&space_info->lock);
3927 		space_info->flush = 0;
3928 		wake_up_all(&space_info->wait);
3929 		spin_unlock(&space_info->lock);
3930 	}
3931 	return ret;
3932 }
3933 
3934 static struct btrfs_block_rsv *get_block_rsv(
3935 					const struct btrfs_trans_handle *trans,
3936 					const struct btrfs_root *root)
3937 {
3938 	struct btrfs_block_rsv *block_rsv = NULL;
3939 
3940 	if (root->ref_cows || root == root->fs_info->csum_root)
3941 		block_rsv = trans->block_rsv;
3942 
3943 	if (!block_rsv)
3944 		block_rsv = root->block_rsv;
3945 
3946 	if (!block_rsv)
3947 		block_rsv = &root->fs_info->empty_block_rsv;
3948 
3949 	return block_rsv;
3950 }
3951 
3952 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3953 			       u64 num_bytes)
3954 {
3955 	int ret = -ENOSPC;
3956 	spin_lock(&block_rsv->lock);
3957 	if (block_rsv->reserved >= num_bytes) {
3958 		block_rsv->reserved -= num_bytes;
3959 		if (block_rsv->reserved < block_rsv->size)
3960 			block_rsv->full = 0;
3961 		ret = 0;
3962 	}
3963 	spin_unlock(&block_rsv->lock);
3964 	return ret;
3965 }
3966 
3967 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3968 				u64 num_bytes, int update_size)
3969 {
3970 	spin_lock(&block_rsv->lock);
3971 	block_rsv->reserved += num_bytes;
3972 	if (update_size)
3973 		block_rsv->size += num_bytes;
3974 	else if (block_rsv->reserved >= block_rsv->size)
3975 		block_rsv->full = 1;
3976 	spin_unlock(&block_rsv->lock);
3977 }
3978 
3979 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3980 				    struct btrfs_block_rsv *block_rsv,
3981 				    struct btrfs_block_rsv *dest, u64 num_bytes)
3982 {
3983 	struct btrfs_space_info *space_info = block_rsv->space_info;
3984 
3985 	spin_lock(&block_rsv->lock);
3986 	if (num_bytes == (u64)-1)
3987 		num_bytes = block_rsv->size;
3988 	block_rsv->size -= num_bytes;
3989 	if (block_rsv->reserved >= block_rsv->size) {
3990 		num_bytes = block_rsv->reserved - block_rsv->size;
3991 		block_rsv->reserved = block_rsv->size;
3992 		block_rsv->full = 1;
3993 	} else {
3994 		num_bytes = 0;
3995 	}
3996 	spin_unlock(&block_rsv->lock);
3997 
3998 	if (num_bytes > 0) {
3999 		if (dest) {
4000 			spin_lock(&dest->lock);
4001 			if (!dest->full) {
4002 				u64 bytes_to_add;
4003 
4004 				bytes_to_add = dest->size - dest->reserved;
4005 				bytes_to_add = min(num_bytes, bytes_to_add);
4006 				dest->reserved += bytes_to_add;
4007 				if (dest->reserved >= dest->size)
4008 					dest->full = 1;
4009 				num_bytes -= bytes_to_add;
4010 			}
4011 			spin_unlock(&dest->lock);
4012 		}
4013 		if (num_bytes) {
4014 			spin_lock(&space_info->lock);
4015 			space_info->bytes_may_use -= num_bytes;
4016 			trace_btrfs_space_reservation(fs_info, "space_info",
4017 					space_info->flags, num_bytes, 0);
4018 			space_info->reservation_progress++;
4019 			spin_unlock(&space_info->lock);
4020 		}
4021 	}
4022 }
4023 
4024 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4025 				   struct btrfs_block_rsv *dst, u64 num_bytes)
4026 {
4027 	int ret;
4028 
4029 	ret = block_rsv_use_bytes(src, num_bytes);
4030 	if (ret)
4031 		return ret;
4032 
4033 	block_rsv_add_bytes(dst, num_bytes, 1);
4034 	return 0;
4035 }
4036 
4037 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
4038 {
4039 	memset(rsv, 0, sizeof(*rsv));
4040 	spin_lock_init(&rsv->lock);
4041 }
4042 
4043 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4044 {
4045 	struct btrfs_block_rsv *block_rsv;
4046 	struct btrfs_fs_info *fs_info = root->fs_info;
4047 
4048 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4049 	if (!block_rsv)
4050 		return NULL;
4051 
4052 	btrfs_init_block_rsv(block_rsv);
4053 	block_rsv->space_info = __find_space_info(fs_info,
4054 						  BTRFS_BLOCK_GROUP_METADATA);
4055 	return block_rsv;
4056 }
4057 
4058 void btrfs_free_block_rsv(struct btrfs_root *root,
4059 			  struct btrfs_block_rsv *rsv)
4060 {
4061 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4062 	kfree(rsv);
4063 }
4064 
4065 static inline int __block_rsv_add(struct btrfs_root *root,
4066 				  struct btrfs_block_rsv *block_rsv,
4067 				  u64 num_bytes, int flush)
4068 {
4069 	int ret;
4070 
4071 	if (num_bytes == 0)
4072 		return 0;
4073 
4074 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4075 	if (!ret) {
4076 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
4077 		return 0;
4078 	}
4079 
4080 	return ret;
4081 }
4082 
4083 int btrfs_block_rsv_add(struct btrfs_root *root,
4084 			struct btrfs_block_rsv *block_rsv,
4085 			u64 num_bytes)
4086 {
4087 	return __block_rsv_add(root, block_rsv, num_bytes, 1);
4088 }
4089 
4090 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4091 				struct btrfs_block_rsv *block_rsv,
4092 				u64 num_bytes)
4093 {
4094 	return __block_rsv_add(root, block_rsv, num_bytes, 0);
4095 }
4096 
4097 int btrfs_block_rsv_check(struct btrfs_root *root,
4098 			  struct btrfs_block_rsv *block_rsv, int min_factor)
4099 {
4100 	u64 num_bytes = 0;
4101 	int ret = -ENOSPC;
4102 
4103 	if (!block_rsv)
4104 		return 0;
4105 
4106 	spin_lock(&block_rsv->lock);
4107 	num_bytes = div_factor(block_rsv->size, min_factor);
4108 	if (block_rsv->reserved >= num_bytes)
4109 		ret = 0;
4110 	spin_unlock(&block_rsv->lock);
4111 
4112 	return ret;
4113 }
4114 
4115 static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4116 					   struct btrfs_block_rsv *block_rsv,
4117 					   u64 min_reserved, int flush)
4118 {
4119 	u64 num_bytes = 0;
4120 	int ret = -ENOSPC;
4121 
4122 	if (!block_rsv)
4123 		return 0;
4124 
4125 	spin_lock(&block_rsv->lock);
4126 	num_bytes = min_reserved;
4127 	if (block_rsv->reserved >= num_bytes)
4128 		ret = 0;
4129 	else
4130 		num_bytes -= block_rsv->reserved;
4131 	spin_unlock(&block_rsv->lock);
4132 
4133 	if (!ret)
4134 		return 0;
4135 
4136 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4137 	if (!ret) {
4138 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
4139 		return 0;
4140 	}
4141 
4142 	return ret;
4143 }
4144 
4145 int btrfs_block_rsv_refill(struct btrfs_root *root,
4146 			   struct btrfs_block_rsv *block_rsv,
4147 			   u64 min_reserved)
4148 {
4149 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4150 }
4151 
4152 int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4153 				   struct btrfs_block_rsv *block_rsv,
4154 				   u64 min_reserved)
4155 {
4156 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4157 }
4158 
4159 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4160 			    struct btrfs_block_rsv *dst_rsv,
4161 			    u64 num_bytes)
4162 {
4163 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4164 }
4165 
4166 void btrfs_block_rsv_release(struct btrfs_root *root,
4167 			     struct btrfs_block_rsv *block_rsv,
4168 			     u64 num_bytes)
4169 {
4170 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4171 	if (global_rsv->full || global_rsv == block_rsv ||
4172 	    block_rsv->space_info != global_rsv->space_info)
4173 		global_rsv = NULL;
4174 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4175 				num_bytes);
4176 }
4177 
4178 /*
4179  * helper to calculate size of global block reservation.
4180  * the desired value is sum of space used by extent tree,
4181  * checksum tree and root tree
4182  */
4183 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4184 {
4185 	struct btrfs_space_info *sinfo;
4186 	u64 num_bytes;
4187 	u64 meta_used;
4188 	u64 data_used;
4189 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4190 
4191 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4192 	spin_lock(&sinfo->lock);
4193 	data_used = sinfo->bytes_used;
4194 	spin_unlock(&sinfo->lock);
4195 
4196 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4197 	spin_lock(&sinfo->lock);
4198 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4199 		data_used = 0;
4200 	meta_used = sinfo->bytes_used;
4201 	spin_unlock(&sinfo->lock);
4202 
4203 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4204 		    csum_size * 2;
4205 	num_bytes += div64_u64(data_used + meta_used, 50);
4206 
4207 	if (num_bytes * 3 > meta_used)
4208 		num_bytes = div64_u64(meta_used, 3) * 2;
4209 
4210 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4211 }
4212 
4213 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4214 {
4215 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4216 	struct btrfs_space_info *sinfo = block_rsv->space_info;
4217 	u64 num_bytes;
4218 
4219 	num_bytes = calc_global_metadata_size(fs_info);
4220 
4221 	spin_lock(&block_rsv->lock);
4222 	spin_lock(&sinfo->lock);
4223 
4224 	block_rsv->size = num_bytes;
4225 
4226 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4227 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
4228 		    sinfo->bytes_may_use;
4229 
4230 	if (sinfo->total_bytes > num_bytes) {
4231 		num_bytes = sinfo->total_bytes - num_bytes;
4232 		block_rsv->reserved += num_bytes;
4233 		sinfo->bytes_may_use += num_bytes;
4234 		trace_btrfs_space_reservation(fs_info, "space_info",
4235 				      sinfo->flags, num_bytes, 1);
4236 	}
4237 
4238 	if (block_rsv->reserved >= block_rsv->size) {
4239 		num_bytes = block_rsv->reserved - block_rsv->size;
4240 		sinfo->bytes_may_use -= num_bytes;
4241 		trace_btrfs_space_reservation(fs_info, "space_info",
4242 				      sinfo->flags, num_bytes, 0);
4243 		sinfo->reservation_progress++;
4244 		block_rsv->reserved = block_rsv->size;
4245 		block_rsv->full = 1;
4246 	}
4247 
4248 	spin_unlock(&sinfo->lock);
4249 	spin_unlock(&block_rsv->lock);
4250 }
4251 
4252 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4253 {
4254 	struct btrfs_space_info *space_info;
4255 
4256 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4257 	fs_info->chunk_block_rsv.space_info = space_info;
4258 
4259 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4260 	fs_info->global_block_rsv.space_info = space_info;
4261 	fs_info->delalloc_block_rsv.space_info = space_info;
4262 	fs_info->trans_block_rsv.space_info = space_info;
4263 	fs_info->empty_block_rsv.space_info = space_info;
4264 	fs_info->delayed_block_rsv.space_info = space_info;
4265 
4266 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4267 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4268 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4269 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4270 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4271 
4272 	update_global_block_rsv(fs_info);
4273 }
4274 
4275 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4276 {
4277 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4278 				(u64)-1);
4279 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4280 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4281 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4282 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4283 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4284 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4285 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4286 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4287 }
4288 
4289 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4290 				  struct btrfs_root *root)
4291 {
4292 	if (!trans->bytes_reserved)
4293 		return;
4294 
4295 	trace_btrfs_space_reservation(root->fs_info, "transaction",
4296 				      trans->transid, trans->bytes_reserved, 0);
4297 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4298 	trans->bytes_reserved = 0;
4299 }
4300 
4301 /* Can only return 0 or -ENOSPC */
4302 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4303 				  struct inode *inode)
4304 {
4305 	struct btrfs_root *root = BTRFS_I(inode)->root;
4306 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4307 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4308 
4309 	/*
4310 	 * We need to hold space in order to delete our orphan item once we've
4311 	 * added it, so this takes the reservation so we can release it later
4312 	 * when we are truly done with the orphan item.
4313 	 */
4314 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4315 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4316 				      btrfs_ino(inode), num_bytes, 1);
4317 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4318 }
4319 
4320 void btrfs_orphan_release_metadata(struct inode *inode)
4321 {
4322 	struct btrfs_root *root = BTRFS_I(inode)->root;
4323 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4324 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4325 				      btrfs_ino(inode), num_bytes, 0);
4326 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4327 }
4328 
4329 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4330 				struct btrfs_pending_snapshot *pending)
4331 {
4332 	struct btrfs_root *root = pending->root;
4333 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4334 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4335 	/*
4336 	 * two for root back/forward refs, two for directory entries
4337 	 * and one for root of the snapshot.
4338 	 */
4339 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4340 	dst_rsv->space_info = src_rsv->space_info;
4341 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4342 }
4343 
4344 /**
4345  * drop_outstanding_extent - drop an outstanding extent
4346  * @inode: the inode we're dropping the extent for
4347  *
4348  * This is called when we are freeing up an outstanding extent, either called
4349  * after an error or after an extent is written.  This will return the number of
4350  * reserved extents that need to be freed.  This must be called with
4351  * BTRFS_I(inode)->lock held.
4352  */
4353 static unsigned drop_outstanding_extent(struct inode *inode)
4354 {
4355 	unsigned drop_inode_space = 0;
4356 	unsigned dropped_extents = 0;
4357 
4358 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4359 	BTRFS_I(inode)->outstanding_extents--;
4360 
4361 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4362 	    BTRFS_I(inode)->delalloc_meta_reserved) {
4363 		drop_inode_space = 1;
4364 		BTRFS_I(inode)->delalloc_meta_reserved = 0;
4365 	}
4366 
4367 	/*
4368 	 * If we have more or the same amount of outsanding extents than we have
4369 	 * reserved then we need to leave the reserved extents count alone.
4370 	 */
4371 	if (BTRFS_I(inode)->outstanding_extents >=
4372 	    BTRFS_I(inode)->reserved_extents)
4373 		return drop_inode_space;
4374 
4375 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4376 		BTRFS_I(inode)->outstanding_extents;
4377 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4378 	return dropped_extents + drop_inode_space;
4379 }
4380 
4381 /**
4382  * calc_csum_metadata_size - return the amount of metada space that must be
4383  *	reserved/free'd for the given bytes.
4384  * @inode: the inode we're manipulating
4385  * @num_bytes: the number of bytes in question
4386  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4387  *
4388  * This adjusts the number of csum_bytes in the inode and then returns the
4389  * correct amount of metadata that must either be reserved or freed.  We
4390  * calculate how many checksums we can fit into one leaf and then divide the
4391  * number of bytes that will need to be checksumed by this value to figure out
4392  * how many checksums will be required.  If we are adding bytes then the number
4393  * may go up and we will return the number of additional bytes that must be
4394  * reserved.  If it is going down we will return the number of bytes that must
4395  * be freed.
4396  *
4397  * This must be called with BTRFS_I(inode)->lock held.
4398  */
4399 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4400 				   int reserve)
4401 {
4402 	struct btrfs_root *root = BTRFS_I(inode)->root;
4403 	u64 csum_size;
4404 	int num_csums_per_leaf;
4405 	int num_csums;
4406 	int old_csums;
4407 
4408 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4409 	    BTRFS_I(inode)->csum_bytes == 0)
4410 		return 0;
4411 
4412 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4413 	if (reserve)
4414 		BTRFS_I(inode)->csum_bytes += num_bytes;
4415 	else
4416 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4417 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4418 	num_csums_per_leaf = (int)div64_u64(csum_size,
4419 					    sizeof(struct btrfs_csum_item) +
4420 					    sizeof(struct btrfs_disk_key));
4421 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4422 	num_csums = num_csums + num_csums_per_leaf - 1;
4423 	num_csums = num_csums / num_csums_per_leaf;
4424 
4425 	old_csums = old_csums + num_csums_per_leaf - 1;
4426 	old_csums = old_csums / num_csums_per_leaf;
4427 
4428 	/* No change, no need to reserve more */
4429 	if (old_csums == num_csums)
4430 		return 0;
4431 
4432 	if (reserve)
4433 		return btrfs_calc_trans_metadata_size(root,
4434 						      num_csums - old_csums);
4435 
4436 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4437 }
4438 
4439 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4440 {
4441 	struct btrfs_root *root = BTRFS_I(inode)->root;
4442 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4443 	u64 to_reserve = 0;
4444 	u64 csum_bytes;
4445 	unsigned nr_extents = 0;
4446 	int extra_reserve = 0;
4447 	int flush = 1;
4448 	int ret;
4449 
4450 	/* Need to be holding the i_mutex here if we aren't free space cache */
4451 	if (btrfs_is_free_space_inode(root, inode))
4452 		flush = 0;
4453 
4454 	if (flush && btrfs_transaction_in_commit(root->fs_info))
4455 		schedule_timeout(1);
4456 
4457 	mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4458 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4459 
4460 	spin_lock(&BTRFS_I(inode)->lock);
4461 	BTRFS_I(inode)->outstanding_extents++;
4462 
4463 	if (BTRFS_I(inode)->outstanding_extents >
4464 	    BTRFS_I(inode)->reserved_extents)
4465 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4466 			BTRFS_I(inode)->reserved_extents;
4467 
4468 	/*
4469 	 * Add an item to reserve for updating the inode when we complete the
4470 	 * delalloc io.
4471 	 */
4472 	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4473 		nr_extents++;
4474 		extra_reserve = 1;
4475 	}
4476 
4477 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4478 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4479 	csum_bytes = BTRFS_I(inode)->csum_bytes;
4480 	spin_unlock(&BTRFS_I(inode)->lock);
4481 
4482 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4483 	if (ret) {
4484 		u64 to_free = 0;
4485 		unsigned dropped;
4486 
4487 		spin_lock(&BTRFS_I(inode)->lock);
4488 		dropped = drop_outstanding_extent(inode);
4489 		/*
4490 		 * If the inodes csum_bytes is the same as the original
4491 		 * csum_bytes then we know we haven't raced with any free()ers
4492 		 * so we can just reduce our inodes csum bytes and carry on.
4493 		 * Otherwise we have to do the normal free thing to account for
4494 		 * the case that the free side didn't free up its reserve
4495 		 * because of this outstanding reservation.
4496 		 */
4497 		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4498 			calc_csum_metadata_size(inode, num_bytes, 0);
4499 		else
4500 			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4501 		spin_unlock(&BTRFS_I(inode)->lock);
4502 		if (dropped)
4503 			to_free += btrfs_calc_trans_metadata_size(root, dropped);
4504 
4505 		if (to_free) {
4506 			btrfs_block_rsv_release(root, block_rsv, to_free);
4507 			trace_btrfs_space_reservation(root->fs_info,
4508 						      "delalloc",
4509 						      btrfs_ino(inode),
4510 						      to_free, 0);
4511 		}
4512 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4513 		return ret;
4514 	}
4515 
4516 	spin_lock(&BTRFS_I(inode)->lock);
4517 	if (extra_reserve) {
4518 		BTRFS_I(inode)->delalloc_meta_reserved = 1;
4519 		nr_extents--;
4520 	}
4521 	BTRFS_I(inode)->reserved_extents += nr_extents;
4522 	spin_unlock(&BTRFS_I(inode)->lock);
4523 	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4524 
4525 	if (to_reserve)
4526 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
4527 					      btrfs_ino(inode), to_reserve, 1);
4528 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4529 
4530 	return 0;
4531 }
4532 
4533 /**
4534  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4535  * @inode: the inode to release the reservation for
4536  * @num_bytes: the number of bytes we're releasing
4537  *
4538  * This will release the metadata reservation for an inode.  This can be called
4539  * once we complete IO for a given set of bytes to release their metadata
4540  * reservations.
4541  */
4542 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4543 {
4544 	struct btrfs_root *root = BTRFS_I(inode)->root;
4545 	u64 to_free = 0;
4546 	unsigned dropped;
4547 
4548 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4549 	spin_lock(&BTRFS_I(inode)->lock);
4550 	dropped = drop_outstanding_extent(inode);
4551 
4552 	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4553 	spin_unlock(&BTRFS_I(inode)->lock);
4554 	if (dropped > 0)
4555 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4556 
4557 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
4558 				      btrfs_ino(inode), to_free, 0);
4559 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4560 				to_free);
4561 }
4562 
4563 /**
4564  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4565  * @inode: inode we're writing to
4566  * @num_bytes: the number of bytes we want to allocate
4567  *
4568  * This will do the following things
4569  *
4570  * o reserve space in the data space info for num_bytes
4571  * o reserve space in the metadata space info based on number of outstanding
4572  *   extents and how much csums will be needed
4573  * o add to the inodes ->delalloc_bytes
4574  * o add it to the fs_info's delalloc inodes list.
4575  *
4576  * This will return 0 for success and -ENOSPC if there is no space left.
4577  */
4578 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4579 {
4580 	int ret;
4581 
4582 	ret = btrfs_check_data_free_space(inode, num_bytes);
4583 	if (ret)
4584 		return ret;
4585 
4586 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4587 	if (ret) {
4588 		btrfs_free_reserved_data_space(inode, num_bytes);
4589 		return ret;
4590 	}
4591 
4592 	return 0;
4593 }
4594 
4595 /**
4596  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4597  * @inode: inode we're releasing space for
4598  * @num_bytes: the number of bytes we want to free up
4599  *
4600  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4601  * called in the case that we don't need the metadata AND data reservations
4602  * anymore.  So if there is an error or we insert an inline extent.
4603  *
4604  * This function will release the metadata space that was not used and will
4605  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4606  * list if there are no delalloc bytes left.
4607  */
4608 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4609 {
4610 	btrfs_delalloc_release_metadata(inode, num_bytes);
4611 	btrfs_free_reserved_data_space(inode, num_bytes);
4612 }
4613 
4614 static int update_block_group(struct btrfs_trans_handle *trans,
4615 			      struct btrfs_root *root,
4616 			      u64 bytenr, u64 num_bytes, int alloc)
4617 {
4618 	struct btrfs_block_group_cache *cache = NULL;
4619 	struct btrfs_fs_info *info = root->fs_info;
4620 	u64 total = num_bytes;
4621 	u64 old_val;
4622 	u64 byte_in_group;
4623 	int factor;
4624 
4625 	/* block accounting for super block */
4626 	spin_lock(&info->delalloc_lock);
4627 	old_val = btrfs_super_bytes_used(info->super_copy);
4628 	if (alloc)
4629 		old_val += num_bytes;
4630 	else
4631 		old_val -= num_bytes;
4632 	btrfs_set_super_bytes_used(info->super_copy, old_val);
4633 	spin_unlock(&info->delalloc_lock);
4634 
4635 	while (total) {
4636 		cache = btrfs_lookup_block_group(info, bytenr);
4637 		if (!cache)
4638 			return -ENOENT;
4639 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4640 				    BTRFS_BLOCK_GROUP_RAID1 |
4641 				    BTRFS_BLOCK_GROUP_RAID10))
4642 			factor = 2;
4643 		else
4644 			factor = 1;
4645 		/*
4646 		 * If this block group has free space cache written out, we
4647 		 * need to make sure to load it if we are removing space.  This
4648 		 * is because we need the unpinning stage to actually add the
4649 		 * space back to the block group, otherwise we will leak space.
4650 		 */
4651 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
4652 			cache_block_group(cache, trans, NULL, 1);
4653 
4654 		byte_in_group = bytenr - cache->key.objectid;
4655 		WARN_ON(byte_in_group > cache->key.offset);
4656 
4657 		spin_lock(&cache->space_info->lock);
4658 		spin_lock(&cache->lock);
4659 
4660 		if (btrfs_test_opt(root, SPACE_CACHE) &&
4661 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
4662 			cache->disk_cache_state = BTRFS_DC_CLEAR;
4663 
4664 		cache->dirty = 1;
4665 		old_val = btrfs_block_group_used(&cache->item);
4666 		num_bytes = min(total, cache->key.offset - byte_in_group);
4667 		if (alloc) {
4668 			old_val += num_bytes;
4669 			btrfs_set_block_group_used(&cache->item, old_val);
4670 			cache->reserved -= num_bytes;
4671 			cache->space_info->bytes_reserved -= num_bytes;
4672 			cache->space_info->bytes_used += num_bytes;
4673 			cache->space_info->disk_used += num_bytes * factor;
4674 			spin_unlock(&cache->lock);
4675 			spin_unlock(&cache->space_info->lock);
4676 		} else {
4677 			old_val -= num_bytes;
4678 			btrfs_set_block_group_used(&cache->item, old_val);
4679 			cache->pinned += num_bytes;
4680 			cache->space_info->bytes_pinned += num_bytes;
4681 			cache->space_info->bytes_used -= num_bytes;
4682 			cache->space_info->disk_used -= num_bytes * factor;
4683 			spin_unlock(&cache->lock);
4684 			spin_unlock(&cache->space_info->lock);
4685 
4686 			set_extent_dirty(info->pinned_extents,
4687 					 bytenr, bytenr + num_bytes - 1,
4688 					 GFP_NOFS | __GFP_NOFAIL);
4689 		}
4690 		btrfs_put_block_group(cache);
4691 		total -= num_bytes;
4692 		bytenr += num_bytes;
4693 	}
4694 	return 0;
4695 }
4696 
4697 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4698 {
4699 	struct btrfs_block_group_cache *cache;
4700 	u64 bytenr;
4701 
4702 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4703 	if (!cache)
4704 		return 0;
4705 
4706 	bytenr = cache->key.objectid;
4707 	btrfs_put_block_group(cache);
4708 
4709 	return bytenr;
4710 }
4711 
4712 static int pin_down_extent(struct btrfs_root *root,
4713 			   struct btrfs_block_group_cache *cache,
4714 			   u64 bytenr, u64 num_bytes, int reserved)
4715 {
4716 	spin_lock(&cache->space_info->lock);
4717 	spin_lock(&cache->lock);
4718 	cache->pinned += num_bytes;
4719 	cache->space_info->bytes_pinned += num_bytes;
4720 	if (reserved) {
4721 		cache->reserved -= num_bytes;
4722 		cache->space_info->bytes_reserved -= num_bytes;
4723 	}
4724 	spin_unlock(&cache->lock);
4725 	spin_unlock(&cache->space_info->lock);
4726 
4727 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4728 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4729 	return 0;
4730 }
4731 
4732 /*
4733  * this function must be called within transaction
4734  */
4735 int btrfs_pin_extent(struct btrfs_root *root,
4736 		     u64 bytenr, u64 num_bytes, int reserved)
4737 {
4738 	struct btrfs_block_group_cache *cache;
4739 
4740 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4741 	BUG_ON(!cache); /* Logic error */
4742 
4743 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4744 
4745 	btrfs_put_block_group(cache);
4746 	return 0;
4747 }
4748 
4749 /*
4750  * this function must be called within transaction
4751  */
4752 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4753 				    struct btrfs_root *root,
4754 				    u64 bytenr, u64 num_bytes)
4755 {
4756 	struct btrfs_block_group_cache *cache;
4757 
4758 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4759 	BUG_ON(!cache); /* Logic error */
4760 
4761 	/*
4762 	 * pull in the free space cache (if any) so that our pin
4763 	 * removes the free space from the cache.  We have load_only set
4764 	 * to one because the slow code to read in the free extents does check
4765 	 * the pinned extents.
4766 	 */
4767 	cache_block_group(cache, trans, root, 1);
4768 
4769 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
4770 
4771 	/* remove us from the free space cache (if we're there at all) */
4772 	btrfs_remove_free_space(cache, bytenr, num_bytes);
4773 	btrfs_put_block_group(cache);
4774 	return 0;
4775 }
4776 
4777 /**
4778  * btrfs_update_reserved_bytes - update the block_group and space info counters
4779  * @cache:	The cache we are manipulating
4780  * @num_bytes:	The number of bytes in question
4781  * @reserve:	One of the reservation enums
4782  *
4783  * This is called by the allocator when it reserves space, or by somebody who is
4784  * freeing space that was never actually used on disk.  For example if you
4785  * reserve some space for a new leaf in transaction A and before transaction A
4786  * commits you free that leaf, you call this with reserve set to 0 in order to
4787  * clear the reservation.
4788  *
4789  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4790  * ENOSPC accounting.  For data we handle the reservation through clearing the
4791  * delalloc bits in the io_tree.  We have to do this since we could end up
4792  * allocating less disk space for the amount of data we have reserved in the
4793  * case of compression.
4794  *
4795  * If this is a reservation and the block group has become read only we cannot
4796  * make the reservation and return -EAGAIN, otherwise this function always
4797  * succeeds.
4798  */
4799 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4800 				       u64 num_bytes, int reserve)
4801 {
4802 	struct btrfs_space_info *space_info = cache->space_info;
4803 	int ret = 0;
4804 
4805 	spin_lock(&space_info->lock);
4806 	spin_lock(&cache->lock);
4807 	if (reserve != RESERVE_FREE) {
4808 		if (cache->ro) {
4809 			ret = -EAGAIN;
4810 		} else {
4811 			cache->reserved += num_bytes;
4812 			space_info->bytes_reserved += num_bytes;
4813 			if (reserve == RESERVE_ALLOC) {
4814 				trace_btrfs_space_reservation(cache->fs_info,
4815 						"space_info", space_info->flags,
4816 						num_bytes, 0);
4817 				space_info->bytes_may_use -= num_bytes;
4818 			}
4819 		}
4820 	} else {
4821 		if (cache->ro)
4822 			space_info->bytes_readonly += num_bytes;
4823 		cache->reserved -= num_bytes;
4824 		space_info->bytes_reserved -= num_bytes;
4825 		space_info->reservation_progress++;
4826 	}
4827 	spin_unlock(&cache->lock);
4828 	spin_unlock(&space_info->lock);
4829 	return ret;
4830 }
4831 
4832 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4833 				struct btrfs_root *root)
4834 {
4835 	struct btrfs_fs_info *fs_info = root->fs_info;
4836 	struct btrfs_caching_control *next;
4837 	struct btrfs_caching_control *caching_ctl;
4838 	struct btrfs_block_group_cache *cache;
4839 
4840 	down_write(&fs_info->extent_commit_sem);
4841 
4842 	list_for_each_entry_safe(caching_ctl, next,
4843 				 &fs_info->caching_block_groups, list) {
4844 		cache = caching_ctl->block_group;
4845 		if (block_group_cache_done(cache)) {
4846 			cache->last_byte_to_unpin = (u64)-1;
4847 			list_del_init(&caching_ctl->list);
4848 			put_caching_control(caching_ctl);
4849 		} else {
4850 			cache->last_byte_to_unpin = caching_ctl->progress;
4851 		}
4852 	}
4853 
4854 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4855 		fs_info->pinned_extents = &fs_info->freed_extents[1];
4856 	else
4857 		fs_info->pinned_extents = &fs_info->freed_extents[0];
4858 
4859 	up_write(&fs_info->extent_commit_sem);
4860 
4861 	update_global_block_rsv(fs_info);
4862 }
4863 
4864 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4865 {
4866 	struct btrfs_fs_info *fs_info = root->fs_info;
4867 	struct btrfs_block_group_cache *cache = NULL;
4868 	u64 len;
4869 
4870 	while (start <= end) {
4871 		if (!cache ||
4872 		    start >= cache->key.objectid + cache->key.offset) {
4873 			if (cache)
4874 				btrfs_put_block_group(cache);
4875 			cache = btrfs_lookup_block_group(fs_info, start);
4876 			BUG_ON(!cache); /* Logic error */
4877 		}
4878 
4879 		len = cache->key.objectid + cache->key.offset - start;
4880 		len = min(len, end + 1 - start);
4881 
4882 		if (start < cache->last_byte_to_unpin) {
4883 			len = min(len, cache->last_byte_to_unpin - start);
4884 			btrfs_add_free_space(cache, start, len);
4885 		}
4886 
4887 		start += len;
4888 
4889 		spin_lock(&cache->space_info->lock);
4890 		spin_lock(&cache->lock);
4891 		cache->pinned -= len;
4892 		cache->space_info->bytes_pinned -= len;
4893 		if (cache->ro)
4894 			cache->space_info->bytes_readonly += len;
4895 		spin_unlock(&cache->lock);
4896 		spin_unlock(&cache->space_info->lock);
4897 	}
4898 
4899 	if (cache)
4900 		btrfs_put_block_group(cache);
4901 	return 0;
4902 }
4903 
4904 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4905 			       struct btrfs_root *root)
4906 {
4907 	struct btrfs_fs_info *fs_info = root->fs_info;
4908 	struct extent_io_tree *unpin;
4909 	u64 start;
4910 	u64 end;
4911 	int ret;
4912 
4913 	if (trans->aborted)
4914 		return 0;
4915 
4916 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4917 		unpin = &fs_info->freed_extents[1];
4918 	else
4919 		unpin = &fs_info->freed_extents[0];
4920 
4921 	while (1) {
4922 		ret = find_first_extent_bit(unpin, 0, &start, &end,
4923 					    EXTENT_DIRTY);
4924 		if (ret)
4925 			break;
4926 
4927 		if (btrfs_test_opt(root, DISCARD))
4928 			ret = btrfs_discard_extent(root, start,
4929 						   end + 1 - start, NULL);
4930 
4931 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
4932 		unpin_extent_range(root, start, end);
4933 		cond_resched();
4934 	}
4935 
4936 	return 0;
4937 }
4938 
4939 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4940 				struct btrfs_root *root,
4941 				u64 bytenr, u64 num_bytes, u64 parent,
4942 				u64 root_objectid, u64 owner_objectid,
4943 				u64 owner_offset, int refs_to_drop,
4944 				struct btrfs_delayed_extent_op *extent_op)
4945 {
4946 	struct btrfs_key key;
4947 	struct btrfs_path *path;
4948 	struct btrfs_fs_info *info = root->fs_info;
4949 	struct btrfs_root *extent_root = info->extent_root;
4950 	struct extent_buffer *leaf;
4951 	struct btrfs_extent_item *ei;
4952 	struct btrfs_extent_inline_ref *iref;
4953 	int ret;
4954 	int is_data;
4955 	int extent_slot = 0;
4956 	int found_extent = 0;
4957 	int num_to_del = 1;
4958 	u32 item_size;
4959 	u64 refs;
4960 
4961 	path = btrfs_alloc_path();
4962 	if (!path)
4963 		return -ENOMEM;
4964 
4965 	path->reada = 1;
4966 	path->leave_spinning = 1;
4967 
4968 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4969 	BUG_ON(!is_data && refs_to_drop != 1);
4970 
4971 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
4972 				    bytenr, num_bytes, parent,
4973 				    root_objectid, owner_objectid,
4974 				    owner_offset);
4975 	if (ret == 0) {
4976 		extent_slot = path->slots[0];
4977 		while (extent_slot >= 0) {
4978 			btrfs_item_key_to_cpu(path->nodes[0], &key,
4979 					      extent_slot);
4980 			if (key.objectid != bytenr)
4981 				break;
4982 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4983 			    key.offset == num_bytes) {
4984 				found_extent = 1;
4985 				break;
4986 			}
4987 			if (path->slots[0] - extent_slot > 5)
4988 				break;
4989 			extent_slot--;
4990 		}
4991 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4992 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4993 		if (found_extent && item_size < sizeof(*ei))
4994 			found_extent = 0;
4995 #endif
4996 		if (!found_extent) {
4997 			BUG_ON(iref);
4998 			ret = remove_extent_backref(trans, extent_root, path,
4999 						    NULL, refs_to_drop,
5000 						    is_data);
5001 			if (ret)
5002 				goto abort;
5003 			btrfs_release_path(path);
5004 			path->leave_spinning = 1;
5005 
5006 			key.objectid = bytenr;
5007 			key.type = BTRFS_EXTENT_ITEM_KEY;
5008 			key.offset = num_bytes;
5009 
5010 			ret = btrfs_search_slot(trans, extent_root,
5011 						&key, path, -1, 1);
5012 			if (ret) {
5013 				printk(KERN_ERR "umm, got %d back from search"
5014 				       ", was looking for %llu\n", ret,
5015 				       (unsigned long long)bytenr);
5016 				if (ret > 0)
5017 					btrfs_print_leaf(extent_root,
5018 							 path->nodes[0]);
5019 			}
5020 			if (ret < 0)
5021 				goto abort;
5022 			extent_slot = path->slots[0];
5023 		}
5024 	} else if (ret == -ENOENT) {
5025 		btrfs_print_leaf(extent_root, path->nodes[0]);
5026 		WARN_ON(1);
5027 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5028 		       "parent %llu root %llu  owner %llu offset %llu\n",
5029 		       (unsigned long long)bytenr,
5030 		       (unsigned long long)parent,
5031 		       (unsigned long long)root_objectid,
5032 		       (unsigned long long)owner_objectid,
5033 		       (unsigned long long)owner_offset);
5034 	} else {
5035 		goto abort;
5036 	}
5037 
5038 	leaf = path->nodes[0];
5039 	item_size = btrfs_item_size_nr(leaf, extent_slot);
5040 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5041 	if (item_size < sizeof(*ei)) {
5042 		BUG_ON(found_extent || extent_slot != path->slots[0]);
5043 		ret = convert_extent_item_v0(trans, extent_root, path,
5044 					     owner_objectid, 0);
5045 		if (ret < 0)
5046 			goto abort;
5047 
5048 		btrfs_release_path(path);
5049 		path->leave_spinning = 1;
5050 
5051 		key.objectid = bytenr;
5052 		key.type = BTRFS_EXTENT_ITEM_KEY;
5053 		key.offset = num_bytes;
5054 
5055 		ret = btrfs_search_slot(trans, extent_root, &key, path,
5056 					-1, 1);
5057 		if (ret) {
5058 			printk(KERN_ERR "umm, got %d back from search"
5059 			       ", was looking for %llu\n", ret,
5060 			       (unsigned long long)bytenr);
5061 			btrfs_print_leaf(extent_root, path->nodes[0]);
5062 		}
5063 		if (ret < 0)
5064 			goto abort;
5065 		extent_slot = path->slots[0];
5066 		leaf = path->nodes[0];
5067 		item_size = btrfs_item_size_nr(leaf, extent_slot);
5068 	}
5069 #endif
5070 	BUG_ON(item_size < sizeof(*ei));
5071 	ei = btrfs_item_ptr(leaf, extent_slot,
5072 			    struct btrfs_extent_item);
5073 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5074 		struct btrfs_tree_block_info *bi;
5075 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5076 		bi = (struct btrfs_tree_block_info *)(ei + 1);
5077 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5078 	}
5079 
5080 	refs = btrfs_extent_refs(leaf, ei);
5081 	BUG_ON(refs < refs_to_drop);
5082 	refs -= refs_to_drop;
5083 
5084 	if (refs > 0) {
5085 		if (extent_op)
5086 			__run_delayed_extent_op(extent_op, leaf, ei);
5087 		/*
5088 		 * In the case of inline back ref, reference count will
5089 		 * be updated by remove_extent_backref
5090 		 */
5091 		if (iref) {
5092 			BUG_ON(!found_extent);
5093 		} else {
5094 			btrfs_set_extent_refs(leaf, ei, refs);
5095 			btrfs_mark_buffer_dirty(leaf);
5096 		}
5097 		if (found_extent) {
5098 			ret = remove_extent_backref(trans, extent_root, path,
5099 						    iref, refs_to_drop,
5100 						    is_data);
5101 			if (ret)
5102 				goto abort;
5103 		}
5104 	} else {
5105 		if (found_extent) {
5106 			BUG_ON(is_data && refs_to_drop !=
5107 			       extent_data_ref_count(root, path, iref));
5108 			if (iref) {
5109 				BUG_ON(path->slots[0] != extent_slot);
5110 			} else {
5111 				BUG_ON(path->slots[0] != extent_slot + 1);
5112 				path->slots[0] = extent_slot;
5113 				num_to_del = 2;
5114 			}
5115 		}
5116 
5117 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5118 				      num_to_del);
5119 		if (ret)
5120 			goto abort;
5121 		btrfs_release_path(path);
5122 
5123 		if (is_data) {
5124 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5125 			if (ret)
5126 				goto abort;
5127 		}
5128 
5129 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5130 		if (ret)
5131 			goto abort;
5132 	}
5133 out:
5134 	btrfs_free_path(path);
5135 	return ret;
5136 
5137 abort:
5138 	btrfs_abort_transaction(trans, extent_root, ret);
5139 	goto out;
5140 }
5141 
5142 /*
5143  * when we free an block, it is possible (and likely) that we free the last
5144  * delayed ref for that extent as well.  This searches the delayed ref tree for
5145  * a given extent, and if there are no other delayed refs to be processed, it
5146  * removes it from the tree.
5147  */
5148 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5149 				      struct btrfs_root *root, u64 bytenr)
5150 {
5151 	struct btrfs_delayed_ref_head *head;
5152 	struct btrfs_delayed_ref_root *delayed_refs;
5153 	struct btrfs_delayed_ref_node *ref;
5154 	struct rb_node *node;
5155 	int ret = 0;
5156 
5157 	delayed_refs = &trans->transaction->delayed_refs;
5158 	spin_lock(&delayed_refs->lock);
5159 	head = btrfs_find_delayed_ref_head(trans, bytenr);
5160 	if (!head)
5161 		goto out;
5162 
5163 	node = rb_prev(&head->node.rb_node);
5164 	if (!node)
5165 		goto out;
5166 
5167 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5168 
5169 	/* there are still entries for this ref, we can't drop it */
5170 	if (ref->bytenr == bytenr)
5171 		goto out;
5172 
5173 	if (head->extent_op) {
5174 		if (!head->must_insert_reserved)
5175 			goto out;
5176 		kfree(head->extent_op);
5177 		head->extent_op = NULL;
5178 	}
5179 
5180 	/*
5181 	 * waiting for the lock here would deadlock.  If someone else has it
5182 	 * locked they are already in the process of dropping it anyway
5183 	 */
5184 	if (!mutex_trylock(&head->mutex))
5185 		goto out;
5186 
5187 	/*
5188 	 * at this point we have a head with no other entries.  Go
5189 	 * ahead and process it.
5190 	 */
5191 	head->node.in_tree = 0;
5192 	rb_erase(&head->node.rb_node, &delayed_refs->root);
5193 
5194 	delayed_refs->num_entries--;
5195 	if (waitqueue_active(&delayed_refs->seq_wait))
5196 		wake_up(&delayed_refs->seq_wait);
5197 
5198 	/*
5199 	 * we don't take a ref on the node because we're removing it from the
5200 	 * tree, so we just steal the ref the tree was holding.
5201 	 */
5202 	delayed_refs->num_heads--;
5203 	if (list_empty(&head->cluster))
5204 		delayed_refs->num_heads_ready--;
5205 
5206 	list_del_init(&head->cluster);
5207 	spin_unlock(&delayed_refs->lock);
5208 
5209 	BUG_ON(head->extent_op);
5210 	if (head->must_insert_reserved)
5211 		ret = 1;
5212 
5213 	mutex_unlock(&head->mutex);
5214 	btrfs_put_delayed_ref(&head->node);
5215 	return ret;
5216 out:
5217 	spin_unlock(&delayed_refs->lock);
5218 	return 0;
5219 }
5220 
5221 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5222 			   struct btrfs_root *root,
5223 			   struct extent_buffer *buf,
5224 			   u64 parent, int last_ref, int for_cow)
5225 {
5226 	struct btrfs_block_group_cache *cache = NULL;
5227 	int ret;
5228 
5229 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5230 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5231 					buf->start, buf->len,
5232 					parent, root->root_key.objectid,
5233 					btrfs_header_level(buf),
5234 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5235 		BUG_ON(ret); /* -ENOMEM */
5236 	}
5237 
5238 	if (!last_ref)
5239 		return;
5240 
5241 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5242 
5243 	if (btrfs_header_generation(buf) == trans->transid) {
5244 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5245 			ret = check_ref_cleanup(trans, root, buf->start);
5246 			if (!ret)
5247 				goto out;
5248 		}
5249 
5250 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5251 			pin_down_extent(root, cache, buf->start, buf->len, 1);
5252 			goto out;
5253 		}
5254 
5255 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5256 
5257 		btrfs_add_free_space(cache, buf->start, buf->len);
5258 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5259 	}
5260 out:
5261 	/*
5262 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5263 	 * anymore.
5264 	 */
5265 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5266 	btrfs_put_block_group(cache);
5267 }
5268 
5269 /* Can return -ENOMEM */
5270 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5271 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5272 		      u64 owner, u64 offset, int for_cow)
5273 {
5274 	int ret;
5275 	struct btrfs_fs_info *fs_info = root->fs_info;
5276 
5277 	/*
5278 	 * tree log blocks never actually go into the extent allocation
5279 	 * tree, just update pinning info and exit early.
5280 	 */
5281 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5282 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5283 		/* unlocks the pinned mutex */
5284 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
5285 		ret = 0;
5286 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5287 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5288 					num_bytes,
5289 					parent, root_objectid, (int)owner,
5290 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5291 	} else {
5292 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5293 						num_bytes,
5294 						parent, root_objectid, owner,
5295 						offset, BTRFS_DROP_DELAYED_REF,
5296 						NULL, for_cow);
5297 	}
5298 	return ret;
5299 }
5300 
5301 static u64 stripe_align(struct btrfs_root *root, u64 val)
5302 {
5303 	u64 mask = ((u64)root->stripesize - 1);
5304 	u64 ret = (val + mask) & ~mask;
5305 	return ret;
5306 }
5307 
5308 /*
5309  * when we wait for progress in the block group caching, its because
5310  * our allocation attempt failed at least once.  So, we must sleep
5311  * and let some progress happen before we try again.
5312  *
5313  * This function will sleep at least once waiting for new free space to
5314  * show up, and then it will check the block group free space numbers
5315  * for our min num_bytes.  Another option is to have it go ahead
5316  * and look in the rbtree for a free extent of a given size, but this
5317  * is a good start.
5318  */
5319 static noinline int
5320 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5321 				u64 num_bytes)
5322 {
5323 	struct btrfs_caching_control *caching_ctl;
5324 	DEFINE_WAIT(wait);
5325 
5326 	caching_ctl = get_caching_control(cache);
5327 	if (!caching_ctl)
5328 		return 0;
5329 
5330 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5331 		   (cache->free_space_ctl->free_space >= num_bytes));
5332 
5333 	put_caching_control(caching_ctl);
5334 	return 0;
5335 }
5336 
5337 static noinline int
5338 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5339 {
5340 	struct btrfs_caching_control *caching_ctl;
5341 	DEFINE_WAIT(wait);
5342 
5343 	caching_ctl = get_caching_control(cache);
5344 	if (!caching_ctl)
5345 		return 0;
5346 
5347 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
5348 
5349 	put_caching_control(caching_ctl);
5350 	return 0;
5351 }
5352 
5353 static int __get_block_group_index(u64 flags)
5354 {
5355 	int index;
5356 
5357 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
5358 		index = 0;
5359 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5360 		index = 1;
5361 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
5362 		index = 2;
5363 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5364 		index = 3;
5365 	else
5366 		index = 4;
5367 
5368 	return index;
5369 }
5370 
5371 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5372 {
5373 	return __get_block_group_index(cache->flags);
5374 }
5375 
5376 enum btrfs_loop_type {
5377 	LOOP_CACHING_NOWAIT = 0,
5378 	LOOP_CACHING_WAIT = 1,
5379 	LOOP_ALLOC_CHUNK = 2,
5380 	LOOP_NO_EMPTY_SIZE = 3,
5381 };
5382 
5383 /*
5384  * walks the btree of allocated extents and find a hole of a given size.
5385  * The key ins is changed to record the hole:
5386  * ins->objectid == block start
5387  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5388  * ins->offset == number of blocks
5389  * Any available blocks before search_start are skipped.
5390  */
5391 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5392 				     struct btrfs_root *orig_root,
5393 				     u64 num_bytes, u64 empty_size,
5394 				     u64 hint_byte, struct btrfs_key *ins,
5395 				     u64 data)
5396 {
5397 	int ret = 0;
5398 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5399 	struct btrfs_free_cluster *last_ptr = NULL;
5400 	struct btrfs_block_group_cache *block_group = NULL;
5401 	struct btrfs_block_group_cache *used_block_group;
5402 	u64 search_start = 0;
5403 	int empty_cluster = 2 * 1024 * 1024;
5404 	int allowed_chunk_alloc = 0;
5405 	int done_chunk_alloc = 0;
5406 	struct btrfs_space_info *space_info;
5407 	int loop = 0;
5408 	int index = 0;
5409 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5410 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5411 	bool found_uncached_bg = false;
5412 	bool failed_cluster_refill = false;
5413 	bool failed_alloc = false;
5414 	bool use_cluster = true;
5415 	bool have_caching_bg = false;
5416 
5417 	WARN_ON(num_bytes < root->sectorsize);
5418 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5419 	ins->objectid = 0;
5420 	ins->offset = 0;
5421 
5422 	trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5423 
5424 	space_info = __find_space_info(root->fs_info, data);
5425 	if (!space_info) {
5426 		printk(KERN_ERR "No space info for %llu\n", data);
5427 		return -ENOSPC;
5428 	}
5429 
5430 	/*
5431 	 * If the space info is for both data and metadata it means we have a
5432 	 * small filesystem and we can't use the clustering stuff.
5433 	 */
5434 	if (btrfs_mixed_space_info(space_info))
5435 		use_cluster = false;
5436 
5437 	if (orig_root->ref_cows || empty_size)
5438 		allowed_chunk_alloc = 1;
5439 
5440 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5441 		last_ptr = &root->fs_info->meta_alloc_cluster;
5442 		if (!btrfs_test_opt(root, SSD))
5443 			empty_cluster = 64 * 1024;
5444 	}
5445 
5446 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5447 	    btrfs_test_opt(root, SSD)) {
5448 		last_ptr = &root->fs_info->data_alloc_cluster;
5449 	}
5450 
5451 	if (last_ptr) {
5452 		spin_lock(&last_ptr->lock);
5453 		if (last_ptr->block_group)
5454 			hint_byte = last_ptr->window_start;
5455 		spin_unlock(&last_ptr->lock);
5456 	}
5457 
5458 	search_start = max(search_start, first_logical_byte(root, 0));
5459 	search_start = max(search_start, hint_byte);
5460 
5461 	if (!last_ptr)
5462 		empty_cluster = 0;
5463 
5464 	if (search_start == hint_byte) {
5465 		block_group = btrfs_lookup_block_group(root->fs_info,
5466 						       search_start);
5467 		used_block_group = block_group;
5468 		/*
5469 		 * we don't want to use the block group if it doesn't match our
5470 		 * allocation bits, or if its not cached.
5471 		 *
5472 		 * However if we are re-searching with an ideal block group
5473 		 * picked out then we don't care that the block group is cached.
5474 		 */
5475 		if (block_group && block_group_bits(block_group, data) &&
5476 		    block_group->cached != BTRFS_CACHE_NO) {
5477 			down_read(&space_info->groups_sem);
5478 			if (list_empty(&block_group->list) ||
5479 			    block_group->ro) {
5480 				/*
5481 				 * someone is removing this block group,
5482 				 * we can't jump into the have_block_group
5483 				 * target because our list pointers are not
5484 				 * valid
5485 				 */
5486 				btrfs_put_block_group(block_group);
5487 				up_read(&space_info->groups_sem);
5488 			} else {
5489 				index = get_block_group_index(block_group);
5490 				goto have_block_group;
5491 			}
5492 		} else if (block_group) {
5493 			btrfs_put_block_group(block_group);
5494 		}
5495 	}
5496 search:
5497 	have_caching_bg = false;
5498 	down_read(&space_info->groups_sem);
5499 	list_for_each_entry(block_group, &space_info->block_groups[index],
5500 			    list) {
5501 		u64 offset;
5502 		int cached;
5503 
5504 		used_block_group = block_group;
5505 		btrfs_get_block_group(block_group);
5506 		search_start = block_group->key.objectid;
5507 
5508 		/*
5509 		 * this can happen if we end up cycling through all the
5510 		 * raid types, but we want to make sure we only allocate
5511 		 * for the proper type.
5512 		 */
5513 		if (!block_group_bits(block_group, data)) {
5514 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
5515 				BTRFS_BLOCK_GROUP_RAID1 |
5516 				BTRFS_BLOCK_GROUP_RAID10;
5517 
5518 			/*
5519 			 * if they asked for extra copies and this block group
5520 			 * doesn't provide them, bail.  This does allow us to
5521 			 * fill raid0 from raid1.
5522 			 */
5523 			if ((data & extra) && !(block_group->flags & extra))
5524 				goto loop;
5525 		}
5526 
5527 have_block_group:
5528 		cached = block_group_cache_done(block_group);
5529 		if (unlikely(!cached)) {
5530 			found_uncached_bg = true;
5531 			ret = cache_block_group(block_group, trans,
5532 						orig_root, 0);
5533 			BUG_ON(ret < 0);
5534 			ret = 0;
5535 		}
5536 
5537 		if (unlikely(block_group->ro))
5538 			goto loop;
5539 
5540 		/*
5541 		 * Ok we want to try and use the cluster allocator, so
5542 		 * lets look there
5543 		 */
5544 		if (last_ptr) {
5545 			/*
5546 			 * the refill lock keeps out other
5547 			 * people trying to start a new cluster
5548 			 */
5549 			spin_lock(&last_ptr->refill_lock);
5550 			used_block_group = last_ptr->block_group;
5551 			if (used_block_group != block_group &&
5552 			    (!used_block_group ||
5553 			     used_block_group->ro ||
5554 			     !block_group_bits(used_block_group, data))) {
5555 				used_block_group = block_group;
5556 				goto refill_cluster;
5557 			}
5558 
5559 			if (used_block_group != block_group)
5560 				btrfs_get_block_group(used_block_group);
5561 
5562 			offset = btrfs_alloc_from_cluster(used_block_group,
5563 			  last_ptr, num_bytes, used_block_group->key.objectid);
5564 			if (offset) {
5565 				/* we have a block, we're done */
5566 				spin_unlock(&last_ptr->refill_lock);
5567 				trace_btrfs_reserve_extent_cluster(root,
5568 					block_group, search_start, num_bytes);
5569 				goto checks;
5570 			}
5571 
5572 			WARN_ON(last_ptr->block_group != used_block_group);
5573 			if (used_block_group != block_group) {
5574 				btrfs_put_block_group(used_block_group);
5575 				used_block_group = block_group;
5576 			}
5577 refill_cluster:
5578 			BUG_ON(used_block_group != block_group);
5579 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
5580 			 * set up a new clusters, so lets just skip it
5581 			 * and let the allocator find whatever block
5582 			 * it can find.  If we reach this point, we
5583 			 * will have tried the cluster allocator
5584 			 * plenty of times and not have found
5585 			 * anything, so we are likely way too
5586 			 * fragmented for the clustering stuff to find
5587 			 * anything.
5588 			 *
5589 			 * However, if the cluster is taken from the
5590 			 * current block group, release the cluster
5591 			 * first, so that we stand a better chance of
5592 			 * succeeding in the unclustered
5593 			 * allocation.  */
5594 			if (loop >= LOOP_NO_EMPTY_SIZE &&
5595 			    last_ptr->block_group != block_group) {
5596 				spin_unlock(&last_ptr->refill_lock);
5597 				goto unclustered_alloc;
5598 			}
5599 
5600 			/*
5601 			 * this cluster didn't work out, free it and
5602 			 * start over
5603 			 */
5604 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5605 
5606 			if (loop >= LOOP_NO_EMPTY_SIZE) {
5607 				spin_unlock(&last_ptr->refill_lock);
5608 				goto unclustered_alloc;
5609 			}
5610 
5611 			/* allocate a cluster in this block group */
5612 			ret = btrfs_find_space_cluster(trans, root,
5613 					       block_group, last_ptr,
5614 					       search_start, num_bytes,
5615 					       empty_cluster + empty_size);
5616 			if (ret == 0) {
5617 				/*
5618 				 * now pull our allocation out of this
5619 				 * cluster
5620 				 */
5621 				offset = btrfs_alloc_from_cluster(block_group,
5622 						  last_ptr, num_bytes,
5623 						  search_start);
5624 				if (offset) {
5625 					/* we found one, proceed */
5626 					spin_unlock(&last_ptr->refill_lock);
5627 					trace_btrfs_reserve_extent_cluster(root,
5628 						block_group, search_start,
5629 						num_bytes);
5630 					goto checks;
5631 				}
5632 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
5633 				   && !failed_cluster_refill) {
5634 				spin_unlock(&last_ptr->refill_lock);
5635 
5636 				failed_cluster_refill = true;
5637 				wait_block_group_cache_progress(block_group,
5638 				       num_bytes + empty_cluster + empty_size);
5639 				goto have_block_group;
5640 			}
5641 
5642 			/*
5643 			 * at this point we either didn't find a cluster
5644 			 * or we weren't able to allocate a block from our
5645 			 * cluster.  Free the cluster we've been trying
5646 			 * to use, and go to the next block group
5647 			 */
5648 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5649 			spin_unlock(&last_ptr->refill_lock);
5650 			goto loop;
5651 		}
5652 
5653 unclustered_alloc:
5654 		spin_lock(&block_group->free_space_ctl->tree_lock);
5655 		if (cached &&
5656 		    block_group->free_space_ctl->free_space <
5657 		    num_bytes + empty_cluster + empty_size) {
5658 			spin_unlock(&block_group->free_space_ctl->tree_lock);
5659 			goto loop;
5660 		}
5661 		spin_unlock(&block_group->free_space_ctl->tree_lock);
5662 
5663 		offset = btrfs_find_space_for_alloc(block_group, search_start,
5664 						    num_bytes, empty_size);
5665 		/*
5666 		 * If we didn't find a chunk, and we haven't failed on this
5667 		 * block group before, and this block group is in the middle of
5668 		 * caching and we are ok with waiting, then go ahead and wait
5669 		 * for progress to be made, and set failed_alloc to true.
5670 		 *
5671 		 * If failed_alloc is true then we've already waited on this
5672 		 * block group once and should move on to the next block group.
5673 		 */
5674 		if (!offset && !failed_alloc && !cached &&
5675 		    loop > LOOP_CACHING_NOWAIT) {
5676 			wait_block_group_cache_progress(block_group,
5677 						num_bytes + empty_size);
5678 			failed_alloc = true;
5679 			goto have_block_group;
5680 		} else if (!offset) {
5681 			if (!cached)
5682 				have_caching_bg = true;
5683 			goto loop;
5684 		}
5685 checks:
5686 		search_start = stripe_align(root, offset);
5687 
5688 		/* move on to the next group */
5689 		if (search_start + num_bytes >
5690 		    used_block_group->key.objectid + used_block_group->key.offset) {
5691 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5692 			goto loop;
5693 		}
5694 
5695 		if (offset < search_start)
5696 			btrfs_add_free_space(used_block_group, offset,
5697 					     search_start - offset);
5698 		BUG_ON(offset > search_start);
5699 
5700 		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5701 						  alloc_type);
5702 		if (ret == -EAGAIN) {
5703 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5704 			goto loop;
5705 		}
5706 
5707 		/* we are all good, lets return */
5708 		ins->objectid = search_start;
5709 		ins->offset = num_bytes;
5710 
5711 		trace_btrfs_reserve_extent(orig_root, block_group,
5712 					   search_start, num_bytes);
5713 		if (offset < search_start)
5714 			btrfs_add_free_space(used_block_group, offset,
5715 					     search_start - offset);
5716 		BUG_ON(offset > search_start);
5717 		if (used_block_group != block_group)
5718 			btrfs_put_block_group(used_block_group);
5719 		btrfs_put_block_group(block_group);
5720 		break;
5721 loop:
5722 		failed_cluster_refill = false;
5723 		failed_alloc = false;
5724 		BUG_ON(index != get_block_group_index(block_group));
5725 		if (used_block_group != block_group)
5726 			btrfs_put_block_group(used_block_group);
5727 		btrfs_put_block_group(block_group);
5728 	}
5729 	up_read(&space_info->groups_sem);
5730 
5731 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5732 		goto search;
5733 
5734 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5735 		goto search;
5736 
5737 	/*
5738 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5739 	 *			caching kthreads as we move along
5740 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5741 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5742 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5743 	 *			again
5744 	 */
5745 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5746 		index = 0;
5747 		loop++;
5748 		if (loop == LOOP_ALLOC_CHUNK) {
5749 		       if (allowed_chunk_alloc) {
5750 				ret = do_chunk_alloc(trans, root, num_bytes +
5751 						     2 * 1024 * 1024, data,
5752 						     CHUNK_ALLOC_LIMITED);
5753 				if (ret < 0) {
5754 					btrfs_abort_transaction(trans,
5755 								root, ret);
5756 					goto out;
5757 				}
5758 				allowed_chunk_alloc = 0;
5759 				if (ret == 1)
5760 					done_chunk_alloc = 1;
5761 			} else if (!done_chunk_alloc &&
5762 				   space_info->force_alloc ==
5763 				   CHUNK_ALLOC_NO_FORCE) {
5764 				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5765 			}
5766 
5767 		       /*
5768 			* We didn't allocate a chunk, go ahead and drop the
5769 			* empty size and loop again.
5770 			*/
5771 		       if (!done_chunk_alloc)
5772 			       loop = LOOP_NO_EMPTY_SIZE;
5773 		}
5774 
5775 		if (loop == LOOP_NO_EMPTY_SIZE) {
5776 			empty_size = 0;
5777 			empty_cluster = 0;
5778 		}
5779 
5780 		goto search;
5781 	} else if (!ins->objectid) {
5782 		ret = -ENOSPC;
5783 	} else if (ins->objectid) {
5784 		ret = 0;
5785 	}
5786 out:
5787 
5788 	return ret;
5789 }
5790 
5791 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5792 			    int dump_block_groups)
5793 {
5794 	struct btrfs_block_group_cache *cache;
5795 	int index = 0;
5796 
5797 	spin_lock(&info->lock);
5798 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5799 	       (unsigned long long)info->flags,
5800 	       (unsigned long long)(info->total_bytes - info->bytes_used -
5801 				    info->bytes_pinned - info->bytes_reserved -
5802 				    info->bytes_readonly),
5803 	       (info->full) ? "" : "not ");
5804 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5805 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
5806 	       (unsigned long long)info->total_bytes,
5807 	       (unsigned long long)info->bytes_used,
5808 	       (unsigned long long)info->bytes_pinned,
5809 	       (unsigned long long)info->bytes_reserved,
5810 	       (unsigned long long)info->bytes_may_use,
5811 	       (unsigned long long)info->bytes_readonly);
5812 	spin_unlock(&info->lock);
5813 
5814 	if (!dump_block_groups)
5815 		return;
5816 
5817 	down_read(&info->groups_sem);
5818 again:
5819 	list_for_each_entry(cache, &info->block_groups[index], list) {
5820 		spin_lock(&cache->lock);
5821 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5822 		       "%llu pinned %llu reserved\n",
5823 		       (unsigned long long)cache->key.objectid,
5824 		       (unsigned long long)cache->key.offset,
5825 		       (unsigned long long)btrfs_block_group_used(&cache->item),
5826 		       (unsigned long long)cache->pinned,
5827 		       (unsigned long long)cache->reserved);
5828 		btrfs_dump_free_space(cache, bytes);
5829 		spin_unlock(&cache->lock);
5830 	}
5831 	if (++index < BTRFS_NR_RAID_TYPES)
5832 		goto again;
5833 	up_read(&info->groups_sem);
5834 }
5835 
5836 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5837 			 struct btrfs_root *root,
5838 			 u64 num_bytes, u64 min_alloc_size,
5839 			 u64 empty_size, u64 hint_byte,
5840 			 struct btrfs_key *ins, u64 data)
5841 {
5842 	bool final_tried = false;
5843 	int ret;
5844 
5845 	data = btrfs_get_alloc_profile(root, data);
5846 again:
5847 	/*
5848 	 * the only place that sets empty_size is btrfs_realloc_node, which
5849 	 * is not called recursively on allocations
5850 	 */
5851 	if (empty_size || root->ref_cows) {
5852 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5853 				     num_bytes + 2 * 1024 * 1024, data,
5854 				     CHUNK_ALLOC_NO_FORCE);
5855 		if (ret < 0 && ret != -ENOSPC) {
5856 			btrfs_abort_transaction(trans, root, ret);
5857 			return ret;
5858 		}
5859 	}
5860 
5861 	WARN_ON(num_bytes < root->sectorsize);
5862 	ret = find_free_extent(trans, root, num_bytes, empty_size,
5863 			       hint_byte, ins, data);
5864 
5865 	if (ret == -ENOSPC) {
5866 		if (!final_tried) {
5867 			num_bytes = num_bytes >> 1;
5868 			num_bytes = num_bytes & ~(root->sectorsize - 1);
5869 			num_bytes = max(num_bytes, min_alloc_size);
5870 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5871 				       num_bytes, data, CHUNK_ALLOC_FORCE);
5872 			if (ret < 0 && ret != -ENOSPC) {
5873 				btrfs_abort_transaction(trans, root, ret);
5874 				return ret;
5875 			}
5876 			if (num_bytes == min_alloc_size)
5877 				final_tried = true;
5878 			goto again;
5879 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
5880 			struct btrfs_space_info *sinfo;
5881 
5882 			sinfo = __find_space_info(root->fs_info, data);
5883 			printk(KERN_ERR "btrfs allocation failed flags %llu, "
5884 			       "wanted %llu\n", (unsigned long long)data,
5885 			       (unsigned long long)num_bytes);
5886 			if (sinfo)
5887 				dump_space_info(sinfo, num_bytes, 1);
5888 		}
5889 	}
5890 
5891 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5892 
5893 	return ret;
5894 }
5895 
5896 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5897 					u64 start, u64 len, int pin)
5898 {
5899 	struct btrfs_block_group_cache *cache;
5900 	int ret = 0;
5901 
5902 	cache = btrfs_lookup_block_group(root->fs_info, start);
5903 	if (!cache) {
5904 		printk(KERN_ERR "Unable to find block group for %llu\n",
5905 		       (unsigned long long)start);
5906 		return -ENOSPC;
5907 	}
5908 
5909 	if (btrfs_test_opt(root, DISCARD))
5910 		ret = btrfs_discard_extent(root, start, len, NULL);
5911 
5912 	if (pin)
5913 		pin_down_extent(root, cache, start, len, 1);
5914 	else {
5915 		btrfs_add_free_space(cache, start, len);
5916 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5917 	}
5918 	btrfs_put_block_group(cache);
5919 
5920 	trace_btrfs_reserved_extent_free(root, start, len);
5921 
5922 	return ret;
5923 }
5924 
5925 int btrfs_free_reserved_extent(struct btrfs_root *root,
5926 					u64 start, u64 len)
5927 {
5928 	return __btrfs_free_reserved_extent(root, start, len, 0);
5929 }
5930 
5931 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5932 				       u64 start, u64 len)
5933 {
5934 	return __btrfs_free_reserved_extent(root, start, len, 1);
5935 }
5936 
5937 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5938 				      struct btrfs_root *root,
5939 				      u64 parent, u64 root_objectid,
5940 				      u64 flags, u64 owner, u64 offset,
5941 				      struct btrfs_key *ins, int ref_mod)
5942 {
5943 	int ret;
5944 	struct btrfs_fs_info *fs_info = root->fs_info;
5945 	struct btrfs_extent_item *extent_item;
5946 	struct btrfs_extent_inline_ref *iref;
5947 	struct btrfs_path *path;
5948 	struct extent_buffer *leaf;
5949 	int type;
5950 	u32 size;
5951 
5952 	if (parent > 0)
5953 		type = BTRFS_SHARED_DATA_REF_KEY;
5954 	else
5955 		type = BTRFS_EXTENT_DATA_REF_KEY;
5956 
5957 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5958 
5959 	path = btrfs_alloc_path();
5960 	if (!path)
5961 		return -ENOMEM;
5962 
5963 	path->leave_spinning = 1;
5964 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5965 				      ins, size);
5966 	if (ret) {
5967 		btrfs_free_path(path);
5968 		return ret;
5969 	}
5970 
5971 	leaf = path->nodes[0];
5972 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
5973 				     struct btrfs_extent_item);
5974 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5975 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5976 	btrfs_set_extent_flags(leaf, extent_item,
5977 			       flags | BTRFS_EXTENT_FLAG_DATA);
5978 
5979 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5980 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
5981 	if (parent > 0) {
5982 		struct btrfs_shared_data_ref *ref;
5983 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
5984 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5985 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5986 	} else {
5987 		struct btrfs_extent_data_ref *ref;
5988 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5989 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5990 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5991 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5992 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5993 	}
5994 
5995 	btrfs_mark_buffer_dirty(path->nodes[0]);
5996 	btrfs_free_path(path);
5997 
5998 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5999 	if (ret) { /* -ENOENT, logic error */
6000 		printk(KERN_ERR "btrfs update block group failed for %llu "
6001 		       "%llu\n", (unsigned long long)ins->objectid,
6002 		       (unsigned long long)ins->offset);
6003 		BUG();
6004 	}
6005 	return ret;
6006 }
6007 
6008 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6009 				     struct btrfs_root *root,
6010 				     u64 parent, u64 root_objectid,
6011 				     u64 flags, struct btrfs_disk_key *key,
6012 				     int level, struct btrfs_key *ins)
6013 {
6014 	int ret;
6015 	struct btrfs_fs_info *fs_info = root->fs_info;
6016 	struct btrfs_extent_item *extent_item;
6017 	struct btrfs_tree_block_info *block_info;
6018 	struct btrfs_extent_inline_ref *iref;
6019 	struct btrfs_path *path;
6020 	struct extent_buffer *leaf;
6021 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6022 
6023 	path = btrfs_alloc_path();
6024 	if (!path)
6025 		return -ENOMEM;
6026 
6027 	path->leave_spinning = 1;
6028 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6029 				      ins, size);
6030 	if (ret) {
6031 		btrfs_free_path(path);
6032 		return ret;
6033 	}
6034 
6035 	leaf = path->nodes[0];
6036 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6037 				     struct btrfs_extent_item);
6038 	btrfs_set_extent_refs(leaf, extent_item, 1);
6039 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6040 	btrfs_set_extent_flags(leaf, extent_item,
6041 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6042 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6043 
6044 	btrfs_set_tree_block_key(leaf, block_info, key);
6045 	btrfs_set_tree_block_level(leaf, block_info, level);
6046 
6047 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6048 	if (parent > 0) {
6049 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6050 		btrfs_set_extent_inline_ref_type(leaf, iref,
6051 						 BTRFS_SHARED_BLOCK_REF_KEY);
6052 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6053 	} else {
6054 		btrfs_set_extent_inline_ref_type(leaf, iref,
6055 						 BTRFS_TREE_BLOCK_REF_KEY);
6056 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6057 	}
6058 
6059 	btrfs_mark_buffer_dirty(leaf);
6060 	btrfs_free_path(path);
6061 
6062 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6063 	if (ret) { /* -ENOENT, logic error */
6064 		printk(KERN_ERR "btrfs update block group failed for %llu "
6065 		       "%llu\n", (unsigned long long)ins->objectid,
6066 		       (unsigned long long)ins->offset);
6067 		BUG();
6068 	}
6069 	return ret;
6070 }
6071 
6072 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6073 				     struct btrfs_root *root,
6074 				     u64 root_objectid, u64 owner,
6075 				     u64 offset, struct btrfs_key *ins)
6076 {
6077 	int ret;
6078 
6079 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6080 
6081 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6082 					 ins->offset, 0,
6083 					 root_objectid, owner, offset,
6084 					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6085 	return ret;
6086 }
6087 
6088 /*
6089  * this is used by the tree logging recovery code.  It records that
6090  * an extent has been allocated and makes sure to clear the free
6091  * space cache bits as well
6092  */
6093 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6094 				   struct btrfs_root *root,
6095 				   u64 root_objectid, u64 owner, u64 offset,
6096 				   struct btrfs_key *ins)
6097 {
6098 	int ret;
6099 	struct btrfs_block_group_cache *block_group;
6100 	struct btrfs_caching_control *caching_ctl;
6101 	u64 start = ins->objectid;
6102 	u64 num_bytes = ins->offset;
6103 
6104 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6105 	cache_block_group(block_group, trans, NULL, 0);
6106 	caching_ctl = get_caching_control(block_group);
6107 
6108 	if (!caching_ctl) {
6109 		BUG_ON(!block_group_cache_done(block_group));
6110 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6111 		BUG_ON(ret); /* -ENOMEM */
6112 	} else {
6113 		mutex_lock(&caching_ctl->mutex);
6114 
6115 		if (start >= caching_ctl->progress) {
6116 			ret = add_excluded_extent(root, start, num_bytes);
6117 			BUG_ON(ret); /* -ENOMEM */
6118 		} else if (start + num_bytes <= caching_ctl->progress) {
6119 			ret = btrfs_remove_free_space(block_group,
6120 						      start, num_bytes);
6121 			BUG_ON(ret); /* -ENOMEM */
6122 		} else {
6123 			num_bytes = caching_ctl->progress - start;
6124 			ret = btrfs_remove_free_space(block_group,
6125 						      start, num_bytes);
6126 			BUG_ON(ret); /* -ENOMEM */
6127 
6128 			start = caching_ctl->progress;
6129 			num_bytes = ins->objectid + ins->offset -
6130 				    caching_ctl->progress;
6131 			ret = add_excluded_extent(root, start, num_bytes);
6132 			BUG_ON(ret); /* -ENOMEM */
6133 		}
6134 
6135 		mutex_unlock(&caching_ctl->mutex);
6136 		put_caching_control(caching_ctl);
6137 	}
6138 
6139 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6140 					  RESERVE_ALLOC_NO_ACCOUNT);
6141 	BUG_ON(ret); /* logic error */
6142 	btrfs_put_block_group(block_group);
6143 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6144 					 0, owner, offset, ins, 1);
6145 	return ret;
6146 }
6147 
6148 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6149 					    struct btrfs_root *root,
6150 					    u64 bytenr, u32 blocksize,
6151 					    int level)
6152 {
6153 	struct extent_buffer *buf;
6154 
6155 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6156 	if (!buf)
6157 		return ERR_PTR(-ENOMEM);
6158 	btrfs_set_header_generation(buf, trans->transid);
6159 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6160 	btrfs_tree_lock(buf);
6161 	clean_tree_block(trans, root, buf);
6162 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6163 
6164 	btrfs_set_lock_blocking(buf);
6165 	btrfs_set_buffer_uptodate(buf);
6166 
6167 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6168 		/*
6169 		 * we allow two log transactions at a time, use different
6170 		 * EXENT bit to differentiate dirty pages.
6171 		 */
6172 		if (root->log_transid % 2 == 0)
6173 			set_extent_dirty(&root->dirty_log_pages, buf->start,
6174 					buf->start + buf->len - 1, GFP_NOFS);
6175 		else
6176 			set_extent_new(&root->dirty_log_pages, buf->start,
6177 					buf->start + buf->len - 1, GFP_NOFS);
6178 	} else {
6179 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6180 			 buf->start + buf->len - 1, GFP_NOFS);
6181 	}
6182 	trans->blocks_used++;
6183 	/* this returns a buffer locked for blocking */
6184 	return buf;
6185 }
6186 
6187 static struct btrfs_block_rsv *
6188 use_block_rsv(struct btrfs_trans_handle *trans,
6189 	      struct btrfs_root *root, u32 blocksize)
6190 {
6191 	struct btrfs_block_rsv *block_rsv;
6192 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6193 	int ret;
6194 
6195 	block_rsv = get_block_rsv(trans, root);
6196 
6197 	if (block_rsv->size == 0) {
6198 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6199 		/*
6200 		 * If we couldn't reserve metadata bytes try and use some from
6201 		 * the global reserve.
6202 		 */
6203 		if (ret && block_rsv != global_rsv) {
6204 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6205 			if (!ret)
6206 				return global_rsv;
6207 			return ERR_PTR(ret);
6208 		} else if (ret) {
6209 			return ERR_PTR(ret);
6210 		}
6211 		return block_rsv;
6212 	}
6213 
6214 	ret = block_rsv_use_bytes(block_rsv, blocksize);
6215 	if (!ret)
6216 		return block_rsv;
6217 	if (ret) {
6218 		static DEFINE_RATELIMIT_STATE(_rs,
6219 				DEFAULT_RATELIMIT_INTERVAL,
6220 				/*DEFAULT_RATELIMIT_BURST*/ 2);
6221 		if (__ratelimit(&_rs)) {
6222 			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
6223 			WARN_ON(1);
6224 		}
6225 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6226 		if (!ret) {
6227 			return block_rsv;
6228 		} else if (ret && block_rsv != global_rsv) {
6229 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6230 			if (!ret)
6231 				return global_rsv;
6232 		}
6233 	}
6234 
6235 	return ERR_PTR(-ENOSPC);
6236 }
6237 
6238 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6239 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
6240 {
6241 	block_rsv_add_bytes(block_rsv, blocksize, 0);
6242 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6243 }
6244 
6245 /*
6246  * finds a free extent and does all the dirty work required for allocation
6247  * returns the key for the extent through ins, and a tree buffer for
6248  * the first block of the extent through buf.
6249  *
6250  * returns the tree buffer or NULL.
6251  */
6252 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6253 					struct btrfs_root *root, u32 blocksize,
6254 					u64 parent, u64 root_objectid,
6255 					struct btrfs_disk_key *key, int level,
6256 					u64 hint, u64 empty_size, int for_cow)
6257 {
6258 	struct btrfs_key ins;
6259 	struct btrfs_block_rsv *block_rsv;
6260 	struct extent_buffer *buf;
6261 	u64 flags = 0;
6262 	int ret;
6263 
6264 
6265 	block_rsv = use_block_rsv(trans, root, blocksize);
6266 	if (IS_ERR(block_rsv))
6267 		return ERR_CAST(block_rsv);
6268 
6269 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6270 				   empty_size, hint, &ins, 0);
6271 	if (ret) {
6272 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6273 		return ERR_PTR(ret);
6274 	}
6275 
6276 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6277 				    blocksize, level);
6278 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6279 
6280 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6281 		if (parent == 0)
6282 			parent = ins.objectid;
6283 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6284 	} else
6285 		BUG_ON(parent > 0);
6286 
6287 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6288 		struct btrfs_delayed_extent_op *extent_op;
6289 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
6290 		BUG_ON(!extent_op); /* -ENOMEM */
6291 		if (key)
6292 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
6293 		else
6294 			memset(&extent_op->key, 0, sizeof(extent_op->key));
6295 		extent_op->flags_to_set = flags;
6296 		extent_op->update_key = 1;
6297 		extent_op->update_flags = 1;
6298 		extent_op->is_data = 0;
6299 
6300 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6301 					ins.objectid,
6302 					ins.offset, parent, root_objectid,
6303 					level, BTRFS_ADD_DELAYED_EXTENT,
6304 					extent_op, for_cow);
6305 		BUG_ON(ret); /* -ENOMEM */
6306 	}
6307 	return buf;
6308 }
6309 
6310 struct walk_control {
6311 	u64 refs[BTRFS_MAX_LEVEL];
6312 	u64 flags[BTRFS_MAX_LEVEL];
6313 	struct btrfs_key update_progress;
6314 	int stage;
6315 	int level;
6316 	int shared_level;
6317 	int update_ref;
6318 	int keep_locks;
6319 	int reada_slot;
6320 	int reada_count;
6321 	int for_reloc;
6322 };
6323 
6324 #define DROP_REFERENCE	1
6325 #define UPDATE_BACKREF	2
6326 
6327 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6328 				     struct btrfs_root *root,
6329 				     struct walk_control *wc,
6330 				     struct btrfs_path *path)
6331 {
6332 	u64 bytenr;
6333 	u64 generation;
6334 	u64 refs;
6335 	u64 flags;
6336 	u32 nritems;
6337 	u32 blocksize;
6338 	struct btrfs_key key;
6339 	struct extent_buffer *eb;
6340 	int ret;
6341 	int slot;
6342 	int nread = 0;
6343 
6344 	if (path->slots[wc->level] < wc->reada_slot) {
6345 		wc->reada_count = wc->reada_count * 2 / 3;
6346 		wc->reada_count = max(wc->reada_count, 2);
6347 	} else {
6348 		wc->reada_count = wc->reada_count * 3 / 2;
6349 		wc->reada_count = min_t(int, wc->reada_count,
6350 					BTRFS_NODEPTRS_PER_BLOCK(root));
6351 	}
6352 
6353 	eb = path->nodes[wc->level];
6354 	nritems = btrfs_header_nritems(eb);
6355 	blocksize = btrfs_level_size(root, wc->level - 1);
6356 
6357 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6358 		if (nread >= wc->reada_count)
6359 			break;
6360 
6361 		cond_resched();
6362 		bytenr = btrfs_node_blockptr(eb, slot);
6363 		generation = btrfs_node_ptr_generation(eb, slot);
6364 
6365 		if (slot == path->slots[wc->level])
6366 			goto reada;
6367 
6368 		if (wc->stage == UPDATE_BACKREF &&
6369 		    generation <= root->root_key.offset)
6370 			continue;
6371 
6372 		/* We don't lock the tree block, it's OK to be racy here */
6373 		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6374 					       &refs, &flags);
6375 		/* We don't care about errors in readahead. */
6376 		if (ret < 0)
6377 			continue;
6378 		BUG_ON(refs == 0);
6379 
6380 		if (wc->stage == DROP_REFERENCE) {
6381 			if (refs == 1)
6382 				goto reada;
6383 
6384 			if (wc->level == 1 &&
6385 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6386 				continue;
6387 			if (!wc->update_ref ||
6388 			    generation <= root->root_key.offset)
6389 				continue;
6390 			btrfs_node_key_to_cpu(eb, &key, slot);
6391 			ret = btrfs_comp_cpu_keys(&key,
6392 						  &wc->update_progress);
6393 			if (ret < 0)
6394 				continue;
6395 		} else {
6396 			if (wc->level == 1 &&
6397 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6398 				continue;
6399 		}
6400 reada:
6401 		ret = readahead_tree_block(root, bytenr, blocksize,
6402 					   generation);
6403 		if (ret)
6404 			break;
6405 		nread++;
6406 	}
6407 	wc->reada_slot = slot;
6408 }
6409 
6410 /*
6411  * hepler to process tree block while walking down the tree.
6412  *
6413  * when wc->stage == UPDATE_BACKREF, this function updates
6414  * back refs for pointers in the block.
6415  *
6416  * NOTE: return value 1 means we should stop walking down.
6417  */
6418 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6419 				   struct btrfs_root *root,
6420 				   struct btrfs_path *path,
6421 				   struct walk_control *wc, int lookup_info)
6422 {
6423 	int level = wc->level;
6424 	struct extent_buffer *eb = path->nodes[level];
6425 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6426 	int ret;
6427 
6428 	if (wc->stage == UPDATE_BACKREF &&
6429 	    btrfs_header_owner(eb) != root->root_key.objectid)
6430 		return 1;
6431 
6432 	/*
6433 	 * when reference count of tree block is 1, it won't increase
6434 	 * again. once full backref flag is set, we never clear it.
6435 	 */
6436 	if (lookup_info &&
6437 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6438 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6439 		BUG_ON(!path->locks[level]);
6440 		ret = btrfs_lookup_extent_info(trans, root,
6441 					       eb->start, eb->len,
6442 					       &wc->refs[level],
6443 					       &wc->flags[level]);
6444 		BUG_ON(ret == -ENOMEM);
6445 		if (ret)
6446 			return ret;
6447 		BUG_ON(wc->refs[level] == 0);
6448 	}
6449 
6450 	if (wc->stage == DROP_REFERENCE) {
6451 		if (wc->refs[level] > 1)
6452 			return 1;
6453 
6454 		if (path->locks[level] && !wc->keep_locks) {
6455 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6456 			path->locks[level] = 0;
6457 		}
6458 		return 0;
6459 	}
6460 
6461 	/* wc->stage == UPDATE_BACKREF */
6462 	if (!(wc->flags[level] & flag)) {
6463 		BUG_ON(!path->locks[level]);
6464 		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6465 		BUG_ON(ret); /* -ENOMEM */
6466 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6467 		BUG_ON(ret); /* -ENOMEM */
6468 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6469 						  eb->len, flag, 0);
6470 		BUG_ON(ret); /* -ENOMEM */
6471 		wc->flags[level] |= flag;
6472 	}
6473 
6474 	/*
6475 	 * the block is shared by multiple trees, so it's not good to
6476 	 * keep the tree lock
6477 	 */
6478 	if (path->locks[level] && level > 0) {
6479 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6480 		path->locks[level] = 0;
6481 	}
6482 	return 0;
6483 }
6484 
6485 /*
6486  * hepler to process tree block pointer.
6487  *
6488  * when wc->stage == DROP_REFERENCE, this function checks
6489  * reference count of the block pointed to. if the block
6490  * is shared and we need update back refs for the subtree
6491  * rooted at the block, this function changes wc->stage to
6492  * UPDATE_BACKREF. if the block is shared and there is no
6493  * need to update back, this function drops the reference
6494  * to the block.
6495  *
6496  * NOTE: return value 1 means we should stop walking down.
6497  */
6498 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6499 				 struct btrfs_root *root,
6500 				 struct btrfs_path *path,
6501 				 struct walk_control *wc, int *lookup_info)
6502 {
6503 	u64 bytenr;
6504 	u64 generation;
6505 	u64 parent;
6506 	u32 blocksize;
6507 	struct btrfs_key key;
6508 	struct extent_buffer *next;
6509 	int level = wc->level;
6510 	int reada = 0;
6511 	int ret = 0;
6512 
6513 	generation = btrfs_node_ptr_generation(path->nodes[level],
6514 					       path->slots[level]);
6515 	/*
6516 	 * if the lower level block was created before the snapshot
6517 	 * was created, we know there is no need to update back refs
6518 	 * for the subtree
6519 	 */
6520 	if (wc->stage == UPDATE_BACKREF &&
6521 	    generation <= root->root_key.offset) {
6522 		*lookup_info = 1;
6523 		return 1;
6524 	}
6525 
6526 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6527 	blocksize = btrfs_level_size(root, level - 1);
6528 
6529 	next = btrfs_find_tree_block(root, bytenr, blocksize);
6530 	if (!next) {
6531 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6532 		if (!next)
6533 			return -ENOMEM;
6534 		reada = 1;
6535 	}
6536 	btrfs_tree_lock(next);
6537 	btrfs_set_lock_blocking(next);
6538 
6539 	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6540 				       &wc->refs[level - 1],
6541 				       &wc->flags[level - 1]);
6542 	if (ret < 0) {
6543 		btrfs_tree_unlock(next);
6544 		return ret;
6545 	}
6546 
6547 	BUG_ON(wc->refs[level - 1] == 0);
6548 	*lookup_info = 0;
6549 
6550 	if (wc->stage == DROP_REFERENCE) {
6551 		if (wc->refs[level - 1] > 1) {
6552 			if (level == 1 &&
6553 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6554 				goto skip;
6555 
6556 			if (!wc->update_ref ||
6557 			    generation <= root->root_key.offset)
6558 				goto skip;
6559 
6560 			btrfs_node_key_to_cpu(path->nodes[level], &key,
6561 					      path->slots[level]);
6562 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6563 			if (ret < 0)
6564 				goto skip;
6565 
6566 			wc->stage = UPDATE_BACKREF;
6567 			wc->shared_level = level - 1;
6568 		}
6569 	} else {
6570 		if (level == 1 &&
6571 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6572 			goto skip;
6573 	}
6574 
6575 	if (!btrfs_buffer_uptodate(next, generation)) {
6576 		btrfs_tree_unlock(next);
6577 		free_extent_buffer(next);
6578 		next = NULL;
6579 		*lookup_info = 1;
6580 	}
6581 
6582 	if (!next) {
6583 		if (reada && level == 1)
6584 			reada_walk_down(trans, root, wc, path);
6585 		next = read_tree_block(root, bytenr, blocksize, generation);
6586 		if (!next)
6587 			return -EIO;
6588 		btrfs_tree_lock(next);
6589 		btrfs_set_lock_blocking(next);
6590 	}
6591 
6592 	level--;
6593 	BUG_ON(level != btrfs_header_level(next));
6594 	path->nodes[level] = next;
6595 	path->slots[level] = 0;
6596 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6597 	wc->level = level;
6598 	if (wc->level == 1)
6599 		wc->reada_slot = 0;
6600 	return 0;
6601 skip:
6602 	wc->refs[level - 1] = 0;
6603 	wc->flags[level - 1] = 0;
6604 	if (wc->stage == DROP_REFERENCE) {
6605 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6606 			parent = path->nodes[level]->start;
6607 		} else {
6608 			BUG_ON(root->root_key.objectid !=
6609 			       btrfs_header_owner(path->nodes[level]));
6610 			parent = 0;
6611 		}
6612 
6613 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6614 				root->root_key.objectid, level - 1, 0, 0);
6615 		BUG_ON(ret); /* -ENOMEM */
6616 	}
6617 	btrfs_tree_unlock(next);
6618 	free_extent_buffer(next);
6619 	*lookup_info = 1;
6620 	return 1;
6621 }
6622 
6623 /*
6624  * hepler to process tree block while walking up the tree.
6625  *
6626  * when wc->stage == DROP_REFERENCE, this function drops
6627  * reference count on the block.
6628  *
6629  * when wc->stage == UPDATE_BACKREF, this function changes
6630  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6631  * to UPDATE_BACKREF previously while processing the block.
6632  *
6633  * NOTE: return value 1 means we should stop walking up.
6634  */
6635 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6636 				 struct btrfs_root *root,
6637 				 struct btrfs_path *path,
6638 				 struct walk_control *wc)
6639 {
6640 	int ret;
6641 	int level = wc->level;
6642 	struct extent_buffer *eb = path->nodes[level];
6643 	u64 parent = 0;
6644 
6645 	if (wc->stage == UPDATE_BACKREF) {
6646 		BUG_ON(wc->shared_level < level);
6647 		if (level < wc->shared_level)
6648 			goto out;
6649 
6650 		ret = find_next_key(path, level + 1, &wc->update_progress);
6651 		if (ret > 0)
6652 			wc->update_ref = 0;
6653 
6654 		wc->stage = DROP_REFERENCE;
6655 		wc->shared_level = -1;
6656 		path->slots[level] = 0;
6657 
6658 		/*
6659 		 * check reference count again if the block isn't locked.
6660 		 * we should start walking down the tree again if reference
6661 		 * count is one.
6662 		 */
6663 		if (!path->locks[level]) {
6664 			BUG_ON(level == 0);
6665 			btrfs_tree_lock(eb);
6666 			btrfs_set_lock_blocking(eb);
6667 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6668 
6669 			ret = btrfs_lookup_extent_info(trans, root,
6670 						       eb->start, eb->len,
6671 						       &wc->refs[level],
6672 						       &wc->flags[level]);
6673 			if (ret < 0) {
6674 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6675 				return ret;
6676 			}
6677 			BUG_ON(wc->refs[level] == 0);
6678 			if (wc->refs[level] == 1) {
6679 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6680 				return 1;
6681 			}
6682 		}
6683 	}
6684 
6685 	/* wc->stage == DROP_REFERENCE */
6686 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6687 
6688 	if (wc->refs[level] == 1) {
6689 		if (level == 0) {
6690 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6691 				ret = btrfs_dec_ref(trans, root, eb, 1,
6692 						    wc->for_reloc);
6693 			else
6694 				ret = btrfs_dec_ref(trans, root, eb, 0,
6695 						    wc->for_reloc);
6696 			BUG_ON(ret); /* -ENOMEM */
6697 		}
6698 		/* make block locked assertion in clean_tree_block happy */
6699 		if (!path->locks[level] &&
6700 		    btrfs_header_generation(eb) == trans->transid) {
6701 			btrfs_tree_lock(eb);
6702 			btrfs_set_lock_blocking(eb);
6703 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6704 		}
6705 		clean_tree_block(trans, root, eb);
6706 	}
6707 
6708 	if (eb == root->node) {
6709 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6710 			parent = eb->start;
6711 		else
6712 			BUG_ON(root->root_key.objectid !=
6713 			       btrfs_header_owner(eb));
6714 	} else {
6715 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6716 			parent = path->nodes[level + 1]->start;
6717 		else
6718 			BUG_ON(root->root_key.objectid !=
6719 			       btrfs_header_owner(path->nodes[level + 1]));
6720 	}
6721 
6722 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6723 out:
6724 	wc->refs[level] = 0;
6725 	wc->flags[level] = 0;
6726 	return 0;
6727 }
6728 
6729 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6730 				   struct btrfs_root *root,
6731 				   struct btrfs_path *path,
6732 				   struct walk_control *wc)
6733 {
6734 	int level = wc->level;
6735 	int lookup_info = 1;
6736 	int ret;
6737 
6738 	while (level >= 0) {
6739 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
6740 		if (ret > 0)
6741 			break;
6742 
6743 		if (level == 0)
6744 			break;
6745 
6746 		if (path->slots[level] >=
6747 		    btrfs_header_nritems(path->nodes[level]))
6748 			break;
6749 
6750 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
6751 		if (ret > 0) {
6752 			path->slots[level]++;
6753 			continue;
6754 		} else if (ret < 0)
6755 			return ret;
6756 		level = wc->level;
6757 	}
6758 	return 0;
6759 }
6760 
6761 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6762 				 struct btrfs_root *root,
6763 				 struct btrfs_path *path,
6764 				 struct walk_control *wc, int max_level)
6765 {
6766 	int level = wc->level;
6767 	int ret;
6768 
6769 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6770 	while (level < max_level && path->nodes[level]) {
6771 		wc->level = level;
6772 		if (path->slots[level] + 1 <
6773 		    btrfs_header_nritems(path->nodes[level])) {
6774 			path->slots[level]++;
6775 			return 0;
6776 		} else {
6777 			ret = walk_up_proc(trans, root, path, wc);
6778 			if (ret > 0)
6779 				return 0;
6780 
6781 			if (path->locks[level]) {
6782 				btrfs_tree_unlock_rw(path->nodes[level],
6783 						     path->locks[level]);
6784 				path->locks[level] = 0;
6785 			}
6786 			free_extent_buffer(path->nodes[level]);
6787 			path->nodes[level] = NULL;
6788 			level++;
6789 		}
6790 	}
6791 	return 1;
6792 }
6793 
6794 /*
6795  * drop a subvolume tree.
6796  *
6797  * this function traverses the tree freeing any blocks that only
6798  * referenced by the tree.
6799  *
6800  * when a shared tree block is found. this function decreases its
6801  * reference count by one. if update_ref is true, this function
6802  * also make sure backrefs for the shared block and all lower level
6803  * blocks are properly updated.
6804  */
6805 int btrfs_drop_snapshot(struct btrfs_root *root,
6806 			 struct btrfs_block_rsv *block_rsv, int update_ref,
6807 			 int for_reloc)
6808 {
6809 	struct btrfs_path *path;
6810 	struct btrfs_trans_handle *trans;
6811 	struct btrfs_root *tree_root = root->fs_info->tree_root;
6812 	struct btrfs_root_item *root_item = &root->root_item;
6813 	struct walk_control *wc;
6814 	struct btrfs_key key;
6815 	int err = 0;
6816 	int ret;
6817 	int level;
6818 
6819 	path = btrfs_alloc_path();
6820 	if (!path) {
6821 		err = -ENOMEM;
6822 		goto out;
6823 	}
6824 
6825 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6826 	if (!wc) {
6827 		btrfs_free_path(path);
6828 		err = -ENOMEM;
6829 		goto out;
6830 	}
6831 
6832 	trans = btrfs_start_transaction(tree_root, 0);
6833 	if (IS_ERR(trans)) {
6834 		err = PTR_ERR(trans);
6835 		goto out_free;
6836 	}
6837 
6838 	if (block_rsv)
6839 		trans->block_rsv = block_rsv;
6840 
6841 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6842 		level = btrfs_header_level(root->node);
6843 		path->nodes[level] = btrfs_lock_root_node(root);
6844 		btrfs_set_lock_blocking(path->nodes[level]);
6845 		path->slots[level] = 0;
6846 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6847 		memset(&wc->update_progress, 0,
6848 		       sizeof(wc->update_progress));
6849 	} else {
6850 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6851 		memcpy(&wc->update_progress, &key,
6852 		       sizeof(wc->update_progress));
6853 
6854 		level = root_item->drop_level;
6855 		BUG_ON(level == 0);
6856 		path->lowest_level = level;
6857 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6858 		path->lowest_level = 0;
6859 		if (ret < 0) {
6860 			err = ret;
6861 			goto out_end_trans;
6862 		}
6863 		WARN_ON(ret > 0);
6864 
6865 		/*
6866 		 * unlock our path, this is safe because only this
6867 		 * function is allowed to delete this snapshot
6868 		 */
6869 		btrfs_unlock_up_safe(path, 0);
6870 
6871 		level = btrfs_header_level(root->node);
6872 		while (1) {
6873 			btrfs_tree_lock(path->nodes[level]);
6874 			btrfs_set_lock_blocking(path->nodes[level]);
6875 
6876 			ret = btrfs_lookup_extent_info(trans, root,
6877 						path->nodes[level]->start,
6878 						path->nodes[level]->len,
6879 						&wc->refs[level],
6880 						&wc->flags[level]);
6881 			if (ret < 0) {
6882 				err = ret;
6883 				goto out_end_trans;
6884 			}
6885 			BUG_ON(wc->refs[level] == 0);
6886 
6887 			if (level == root_item->drop_level)
6888 				break;
6889 
6890 			btrfs_tree_unlock(path->nodes[level]);
6891 			WARN_ON(wc->refs[level] != 1);
6892 			level--;
6893 		}
6894 	}
6895 
6896 	wc->level = level;
6897 	wc->shared_level = -1;
6898 	wc->stage = DROP_REFERENCE;
6899 	wc->update_ref = update_ref;
6900 	wc->keep_locks = 0;
6901 	wc->for_reloc = for_reloc;
6902 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6903 
6904 	while (1) {
6905 		ret = walk_down_tree(trans, root, path, wc);
6906 		if (ret < 0) {
6907 			err = ret;
6908 			break;
6909 		}
6910 
6911 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6912 		if (ret < 0) {
6913 			err = ret;
6914 			break;
6915 		}
6916 
6917 		if (ret > 0) {
6918 			BUG_ON(wc->stage != DROP_REFERENCE);
6919 			break;
6920 		}
6921 
6922 		if (wc->stage == DROP_REFERENCE) {
6923 			level = wc->level;
6924 			btrfs_node_key(path->nodes[level],
6925 				       &root_item->drop_progress,
6926 				       path->slots[level]);
6927 			root_item->drop_level = level;
6928 		}
6929 
6930 		BUG_ON(wc->level == 0);
6931 		if (btrfs_should_end_transaction(trans, tree_root)) {
6932 			ret = btrfs_update_root(trans, tree_root,
6933 						&root->root_key,
6934 						root_item);
6935 			if (ret) {
6936 				btrfs_abort_transaction(trans, tree_root, ret);
6937 				err = ret;
6938 				goto out_end_trans;
6939 			}
6940 
6941 			btrfs_end_transaction_throttle(trans, tree_root);
6942 			trans = btrfs_start_transaction(tree_root, 0);
6943 			if (IS_ERR(trans)) {
6944 				err = PTR_ERR(trans);
6945 				goto out_free;
6946 			}
6947 			if (block_rsv)
6948 				trans->block_rsv = block_rsv;
6949 		}
6950 	}
6951 	btrfs_release_path(path);
6952 	if (err)
6953 		goto out_end_trans;
6954 
6955 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
6956 	if (ret) {
6957 		btrfs_abort_transaction(trans, tree_root, ret);
6958 		goto out_end_trans;
6959 	}
6960 
6961 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6962 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6963 					   NULL, NULL);
6964 		if (ret < 0) {
6965 			btrfs_abort_transaction(trans, tree_root, ret);
6966 			err = ret;
6967 			goto out_end_trans;
6968 		} else if (ret > 0) {
6969 			/* if we fail to delete the orphan item this time
6970 			 * around, it'll get picked up the next time.
6971 			 *
6972 			 * The most common failure here is just -ENOENT.
6973 			 */
6974 			btrfs_del_orphan_item(trans, tree_root,
6975 					      root->root_key.objectid);
6976 		}
6977 	}
6978 
6979 	if (root->in_radix) {
6980 		btrfs_free_fs_root(tree_root->fs_info, root);
6981 	} else {
6982 		free_extent_buffer(root->node);
6983 		free_extent_buffer(root->commit_root);
6984 		kfree(root);
6985 	}
6986 out_end_trans:
6987 	btrfs_end_transaction_throttle(trans, tree_root);
6988 out_free:
6989 	kfree(wc);
6990 	btrfs_free_path(path);
6991 out:
6992 	if (err)
6993 		btrfs_std_error(root->fs_info, err);
6994 	return err;
6995 }
6996 
6997 /*
6998  * drop subtree rooted at tree block 'node'.
6999  *
7000  * NOTE: this function will unlock and release tree block 'node'
7001  * only used by relocation code
7002  */
7003 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7004 			struct btrfs_root *root,
7005 			struct extent_buffer *node,
7006 			struct extent_buffer *parent)
7007 {
7008 	struct btrfs_path *path;
7009 	struct walk_control *wc;
7010 	int level;
7011 	int parent_level;
7012 	int ret = 0;
7013 	int wret;
7014 
7015 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7016 
7017 	path = btrfs_alloc_path();
7018 	if (!path)
7019 		return -ENOMEM;
7020 
7021 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7022 	if (!wc) {
7023 		btrfs_free_path(path);
7024 		return -ENOMEM;
7025 	}
7026 
7027 	btrfs_assert_tree_locked(parent);
7028 	parent_level = btrfs_header_level(parent);
7029 	extent_buffer_get(parent);
7030 	path->nodes[parent_level] = parent;
7031 	path->slots[parent_level] = btrfs_header_nritems(parent);
7032 
7033 	btrfs_assert_tree_locked(node);
7034 	level = btrfs_header_level(node);
7035 	path->nodes[level] = node;
7036 	path->slots[level] = 0;
7037 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7038 
7039 	wc->refs[parent_level] = 1;
7040 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7041 	wc->level = level;
7042 	wc->shared_level = -1;
7043 	wc->stage = DROP_REFERENCE;
7044 	wc->update_ref = 0;
7045 	wc->keep_locks = 1;
7046 	wc->for_reloc = 1;
7047 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7048 
7049 	while (1) {
7050 		wret = walk_down_tree(trans, root, path, wc);
7051 		if (wret < 0) {
7052 			ret = wret;
7053 			break;
7054 		}
7055 
7056 		wret = walk_up_tree(trans, root, path, wc, parent_level);
7057 		if (wret < 0)
7058 			ret = wret;
7059 		if (wret != 0)
7060 			break;
7061 	}
7062 
7063 	kfree(wc);
7064 	btrfs_free_path(path);
7065 	return ret;
7066 }
7067 
7068 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7069 {
7070 	u64 num_devices;
7071 	u64 stripped;
7072 
7073 	/*
7074 	 * if restripe for this chunk_type is on pick target profile and
7075 	 * return, otherwise do the usual balance
7076 	 */
7077 	stripped = get_restripe_target(root->fs_info, flags);
7078 	if (stripped)
7079 		return extended_to_chunk(stripped);
7080 
7081 	/*
7082 	 * we add in the count of missing devices because we want
7083 	 * to make sure that any RAID levels on a degraded FS
7084 	 * continue to be honored.
7085 	 */
7086 	num_devices = root->fs_info->fs_devices->rw_devices +
7087 		root->fs_info->fs_devices->missing_devices;
7088 
7089 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
7090 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7091 
7092 	if (num_devices == 1) {
7093 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7094 		stripped = flags & ~stripped;
7095 
7096 		/* turn raid0 into single device chunks */
7097 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
7098 			return stripped;
7099 
7100 		/* turn mirroring into duplication */
7101 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7102 			     BTRFS_BLOCK_GROUP_RAID10))
7103 			return stripped | BTRFS_BLOCK_GROUP_DUP;
7104 	} else {
7105 		/* they already had raid on here, just return */
7106 		if (flags & stripped)
7107 			return flags;
7108 
7109 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7110 		stripped = flags & ~stripped;
7111 
7112 		/* switch duplicated blocks with raid1 */
7113 		if (flags & BTRFS_BLOCK_GROUP_DUP)
7114 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
7115 
7116 		/* this is drive concat, leave it alone */
7117 	}
7118 
7119 	return flags;
7120 }
7121 
7122 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7123 {
7124 	struct btrfs_space_info *sinfo = cache->space_info;
7125 	u64 num_bytes;
7126 	u64 min_allocable_bytes;
7127 	int ret = -ENOSPC;
7128 
7129 
7130 	/*
7131 	 * We need some metadata space and system metadata space for
7132 	 * allocating chunks in some corner cases until we force to set
7133 	 * it to be readonly.
7134 	 */
7135 	if ((sinfo->flags &
7136 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7137 	    !force)
7138 		min_allocable_bytes = 1 * 1024 * 1024;
7139 	else
7140 		min_allocable_bytes = 0;
7141 
7142 	spin_lock(&sinfo->lock);
7143 	spin_lock(&cache->lock);
7144 
7145 	if (cache->ro) {
7146 		ret = 0;
7147 		goto out;
7148 	}
7149 
7150 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7151 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7152 
7153 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7154 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7155 	    min_allocable_bytes <= sinfo->total_bytes) {
7156 		sinfo->bytes_readonly += num_bytes;
7157 		cache->ro = 1;
7158 		ret = 0;
7159 	}
7160 out:
7161 	spin_unlock(&cache->lock);
7162 	spin_unlock(&sinfo->lock);
7163 	return ret;
7164 }
7165 
7166 int btrfs_set_block_group_ro(struct btrfs_root *root,
7167 			     struct btrfs_block_group_cache *cache)
7168 
7169 {
7170 	struct btrfs_trans_handle *trans;
7171 	u64 alloc_flags;
7172 	int ret;
7173 
7174 	BUG_ON(cache->ro);
7175 
7176 	trans = btrfs_join_transaction(root);
7177 	if (IS_ERR(trans))
7178 		return PTR_ERR(trans);
7179 
7180 	alloc_flags = update_block_group_flags(root, cache->flags);
7181 	if (alloc_flags != cache->flags) {
7182 		ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7183 				     CHUNK_ALLOC_FORCE);
7184 		if (ret < 0)
7185 			goto out;
7186 	}
7187 
7188 	ret = set_block_group_ro(cache, 0);
7189 	if (!ret)
7190 		goto out;
7191 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7192 	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7193 			     CHUNK_ALLOC_FORCE);
7194 	if (ret < 0)
7195 		goto out;
7196 	ret = set_block_group_ro(cache, 0);
7197 out:
7198 	btrfs_end_transaction(trans, root);
7199 	return ret;
7200 }
7201 
7202 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7203 			    struct btrfs_root *root, u64 type)
7204 {
7205 	u64 alloc_flags = get_alloc_profile(root, type);
7206 	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7207 			      CHUNK_ALLOC_FORCE);
7208 }
7209 
7210 /*
7211  * helper to account the unused space of all the readonly block group in the
7212  * list. takes mirrors into account.
7213  */
7214 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7215 {
7216 	struct btrfs_block_group_cache *block_group;
7217 	u64 free_bytes = 0;
7218 	int factor;
7219 
7220 	list_for_each_entry(block_group, groups_list, list) {
7221 		spin_lock(&block_group->lock);
7222 
7223 		if (!block_group->ro) {
7224 			spin_unlock(&block_group->lock);
7225 			continue;
7226 		}
7227 
7228 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7229 					  BTRFS_BLOCK_GROUP_RAID10 |
7230 					  BTRFS_BLOCK_GROUP_DUP))
7231 			factor = 2;
7232 		else
7233 			factor = 1;
7234 
7235 		free_bytes += (block_group->key.offset -
7236 			       btrfs_block_group_used(&block_group->item)) *
7237 			       factor;
7238 
7239 		spin_unlock(&block_group->lock);
7240 	}
7241 
7242 	return free_bytes;
7243 }
7244 
7245 /*
7246  * helper to account the unused space of all the readonly block group in the
7247  * space_info. takes mirrors into account.
7248  */
7249 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7250 {
7251 	int i;
7252 	u64 free_bytes = 0;
7253 
7254 	spin_lock(&sinfo->lock);
7255 
7256 	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7257 		if (!list_empty(&sinfo->block_groups[i]))
7258 			free_bytes += __btrfs_get_ro_block_group_free_space(
7259 						&sinfo->block_groups[i]);
7260 
7261 	spin_unlock(&sinfo->lock);
7262 
7263 	return free_bytes;
7264 }
7265 
7266 void btrfs_set_block_group_rw(struct btrfs_root *root,
7267 			      struct btrfs_block_group_cache *cache)
7268 {
7269 	struct btrfs_space_info *sinfo = cache->space_info;
7270 	u64 num_bytes;
7271 
7272 	BUG_ON(!cache->ro);
7273 
7274 	spin_lock(&sinfo->lock);
7275 	spin_lock(&cache->lock);
7276 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7277 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7278 	sinfo->bytes_readonly -= num_bytes;
7279 	cache->ro = 0;
7280 	spin_unlock(&cache->lock);
7281 	spin_unlock(&sinfo->lock);
7282 }
7283 
7284 /*
7285  * checks to see if its even possible to relocate this block group.
7286  *
7287  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7288  * ok to go ahead and try.
7289  */
7290 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7291 {
7292 	struct btrfs_block_group_cache *block_group;
7293 	struct btrfs_space_info *space_info;
7294 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7295 	struct btrfs_device *device;
7296 	u64 min_free;
7297 	u64 dev_min = 1;
7298 	u64 dev_nr = 0;
7299 	u64 target;
7300 	int index;
7301 	int full = 0;
7302 	int ret = 0;
7303 
7304 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7305 
7306 	/* odd, couldn't find the block group, leave it alone */
7307 	if (!block_group)
7308 		return -1;
7309 
7310 	min_free = btrfs_block_group_used(&block_group->item);
7311 
7312 	/* no bytes used, we're good */
7313 	if (!min_free)
7314 		goto out;
7315 
7316 	space_info = block_group->space_info;
7317 	spin_lock(&space_info->lock);
7318 
7319 	full = space_info->full;
7320 
7321 	/*
7322 	 * if this is the last block group we have in this space, we can't
7323 	 * relocate it unless we're able to allocate a new chunk below.
7324 	 *
7325 	 * Otherwise, we need to make sure we have room in the space to handle
7326 	 * all of the extents from this block group.  If we can, we're good
7327 	 */
7328 	if ((space_info->total_bytes != block_group->key.offset) &&
7329 	    (space_info->bytes_used + space_info->bytes_reserved +
7330 	     space_info->bytes_pinned + space_info->bytes_readonly +
7331 	     min_free < space_info->total_bytes)) {
7332 		spin_unlock(&space_info->lock);
7333 		goto out;
7334 	}
7335 	spin_unlock(&space_info->lock);
7336 
7337 	/*
7338 	 * ok we don't have enough space, but maybe we have free space on our
7339 	 * devices to allocate new chunks for relocation, so loop through our
7340 	 * alloc devices and guess if we have enough space.  if this block
7341 	 * group is going to be restriped, run checks against the target
7342 	 * profile instead of the current one.
7343 	 */
7344 	ret = -1;
7345 
7346 	/*
7347 	 * index:
7348 	 *      0: raid10
7349 	 *      1: raid1
7350 	 *      2: dup
7351 	 *      3: raid0
7352 	 *      4: single
7353 	 */
7354 	target = get_restripe_target(root->fs_info, block_group->flags);
7355 	if (target) {
7356 		index = __get_block_group_index(extended_to_chunk(target));
7357 	} else {
7358 		/*
7359 		 * this is just a balance, so if we were marked as full
7360 		 * we know there is no space for a new chunk
7361 		 */
7362 		if (full)
7363 			goto out;
7364 
7365 		index = get_block_group_index(block_group);
7366 	}
7367 
7368 	if (index == 0) {
7369 		dev_min = 4;
7370 		/* Divide by 2 */
7371 		min_free >>= 1;
7372 	} else if (index == 1) {
7373 		dev_min = 2;
7374 	} else if (index == 2) {
7375 		/* Multiply by 2 */
7376 		min_free <<= 1;
7377 	} else if (index == 3) {
7378 		dev_min = fs_devices->rw_devices;
7379 		do_div(min_free, dev_min);
7380 	}
7381 
7382 	mutex_lock(&root->fs_info->chunk_mutex);
7383 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7384 		u64 dev_offset;
7385 
7386 		/*
7387 		 * check to make sure we can actually find a chunk with enough
7388 		 * space to fit our block group in.
7389 		 */
7390 		if (device->total_bytes > device->bytes_used + min_free) {
7391 			ret = find_free_dev_extent(device, min_free,
7392 						   &dev_offset, NULL);
7393 			if (!ret)
7394 				dev_nr++;
7395 
7396 			if (dev_nr >= dev_min)
7397 				break;
7398 
7399 			ret = -1;
7400 		}
7401 	}
7402 	mutex_unlock(&root->fs_info->chunk_mutex);
7403 out:
7404 	btrfs_put_block_group(block_group);
7405 	return ret;
7406 }
7407 
7408 static int find_first_block_group(struct btrfs_root *root,
7409 		struct btrfs_path *path, struct btrfs_key *key)
7410 {
7411 	int ret = 0;
7412 	struct btrfs_key found_key;
7413 	struct extent_buffer *leaf;
7414 	int slot;
7415 
7416 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7417 	if (ret < 0)
7418 		goto out;
7419 
7420 	while (1) {
7421 		slot = path->slots[0];
7422 		leaf = path->nodes[0];
7423 		if (slot >= btrfs_header_nritems(leaf)) {
7424 			ret = btrfs_next_leaf(root, path);
7425 			if (ret == 0)
7426 				continue;
7427 			if (ret < 0)
7428 				goto out;
7429 			break;
7430 		}
7431 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7432 
7433 		if (found_key.objectid >= key->objectid &&
7434 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7435 			ret = 0;
7436 			goto out;
7437 		}
7438 		path->slots[0]++;
7439 	}
7440 out:
7441 	return ret;
7442 }
7443 
7444 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7445 {
7446 	struct btrfs_block_group_cache *block_group;
7447 	u64 last = 0;
7448 
7449 	while (1) {
7450 		struct inode *inode;
7451 
7452 		block_group = btrfs_lookup_first_block_group(info, last);
7453 		while (block_group) {
7454 			spin_lock(&block_group->lock);
7455 			if (block_group->iref)
7456 				break;
7457 			spin_unlock(&block_group->lock);
7458 			block_group = next_block_group(info->tree_root,
7459 						       block_group);
7460 		}
7461 		if (!block_group) {
7462 			if (last == 0)
7463 				break;
7464 			last = 0;
7465 			continue;
7466 		}
7467 
7468 		inode = block_group->inode;
7469 		block_group->iref = 0;
7470 		block_group->inode = NULL;
7471 		spin_unlock(&block_group->lock);
7472 		iput(inode);
7473 		last = block_group->key.objectid + block_group->key.offset;
7474 		btrfs_put_block_group(block_group);
7475 	}
7476 }
7477 
7478 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7479 {
7480 	struct btrfs_block_group_cache *block_group;
7481 	struct btrfs_space_info *space_info;
7482 	struct btrfs_caching_control *caching_ctl;
7483 	struct rb_node *n;
7484 
7485 	down_write(&info->extent_commit_sem);
7486 	while (!list_empty(&info->caching_block_groups)) {
7487 		caching_ctl = list_entry(info->caching_block_groups.next,
7488 					 struct btrfs_caching_control, list);
7489 		list_del(&caching_ctl->list);
7490 		put_caching_control(caching_ctl);
7491 	}
7492 	up_write(&info->extent_commit_sem);
7493 
7494 	spin_lock(&info->block_group_cache_lock);
7495 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7496 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7497 				       cache_node);
7498 		rb_erase(&block_group->cache_node,
7499 			 &info->block_group_cache_tree);
7500 		spin_unlock(&info->block_group_cache_lock);
7501 
7502 		down_write(&block_group->space_info->groups_sem);
7503 		list_del(&block_group->list);
7504 		up_write(&block_group->space_info->groups_sem);
7505 
7506 		if (block_group->cached == BTRFS_CACHE_STARTED)
7507 			wait_block_group_cache_done(block_group);
7508 
7509 		/*
7510 		 * We haven't cached this block group, which means we could
7511 		 * possibly have excluded extents on this block group.
7512 		 */
7513 		if (block_group->cached == BTRFS_CACHE_NO)
7514 			free_excluded_extents(info->extent_root, block_group);
7515 
7516 		btrfs_remove_free_space_cache(block_group);
7517 		btrfs_put_block_group(block_group);
7518 
7519 		spin_lock(&info->block_group_cache_lock);
7520 	}
7521 	spin_unlock(&info->block_group_cache_lock);
7522 
7523 	/* now that all the block groups are freed, go through and
7524 	 * free all the space_info structs.  This is only called during
7525 	 * the final stages of unmount, and so we know nobody is
7526 	 * using them.  We call synchronize_rcu() once before we start,
7527 	 * just to be on the safe side.
7528 	 */
7529 	synchronize_rcu();
7530 
7531 	release_global_block_rsv(info);
7532 
7533 	while(!list_empty(&info->space_info)) {
7534 		space_info = list_entry(info->space_info.next,
7535 					struct btrfs_space_info,
7536 					list);
7537 		if (space_info->bytes_pinned > 0 ||
7538 		    space_info->bytes_reserved > 0 ||
7539 		    space_info->bytes_may_use > 0) {
7540 			WARN_ON(1);
7541 			dump_space_info(space_info, 0, 0);
7542 		}
7543 		list_del(&space_info->list);
7544 		kfree(space_info);
7545 	}
7546 	return 0;
7547 }
7548 
7549 static void __link_block_group(struct btrfs_space_info *space_info,
7550 			       struct btrfs_block_group_cache *cache)
7551 {
7552 	int index = get_block_group_index(cache);
7553 
7554 	down_write(&space_info->groups_sem);
7555 	list_add_tail(&cache->list, &space_info->block_groups[index]);
7556 	up_write(&space_info->groups_sem);
7557 }
7558 
7559 int btrfs_read_block_groups(struct btrfs_root *root)
7560 {
7561 	struct btrfs_path *path;
7562 	int ret;
7563 	struct btrfs_block_group_cache *cache;
7564 	struct btrfs_fs_info *info = root->fs_info;
7565 	struct btrfs_space_info *space_info;
7566 	struct btrfs_key key;
7567 	struct btrfs_key found_key;
7568 	struct extent_buffer *leaf;
7569 	int need_clear = 0;
7570 	u64 cache_gen;
7571 
7572 	root = info->extent_root;
7573 	key.objectid = 0;
7574 	key.offset = 0;
7575 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7576 	path = btrfs_alloc_path();
7577 	if (!path)
7578 		return -ENOMEM;
7579 	path->reada = 1;
7580 
7581 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7582 	if (btrfs_test_opt(root, SPACE_CACHE) &&
7583 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7584 		need_clear = 1;
7585 	if (btrfs_test_opt(root, CLEAR_CACHE))
7586 		need_clear = 1;
7587 
7588 	while (1) {
7589 		ret = find_first_block_group(root, path, &key);
7590 		if (ret > 0)
7591 			break;
7592 		if (ret != 0)
7593 			goto error;
7594 		leaf = path->nodes[0];
7595 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7596 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7597 		if (!cache) {
7598 			ret = -ENOMEM;
7599 			goto error;
7600 		}
7601 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7602 						GFP_NOFS);
7603 		if (!cache->free_space_ctl) {
7604 			kfree(cache);
7605 			ret = -ENOMEM;
7606 			goto error;
7607 		}
7608 
7609 		atomic_set(&cache->count, 1);
7610 		spin_lock_init(&cache->lock);
7611 		cache->fs_info = info;
7612 		INIT_LIST_HEAD(&cache->list);
7613 		INIT_LIST_HEAD(&cache->cluster_list);
7614 
7615 		if (need_clear)
7616 			cache->disk_cache_state = BTRFS_DC_CLEAR;
7617 
7618 		read_extent_buffer(leaf, &cache->item,
7619 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7620 				   sizeof(cache->item));
7621 		memcpy(&cache->key, &found_key, sizeof(found_key));
7622 
7623 		key.objectid = found_key.objectid + found_key.offset;
7624 		btrfs_release_path(path);
7625 		cache->flags = btrfs_block_group_flags(&cache->item);
7626 		cache->sectorsize = root->sectorsize;
7627 
7628 		btrfs_init_free_space_ctl(cache);
7629 
7630 		/*
7631 		 * We need to exclude the super stripes now so that the space
7632 		 * info has super bytes accounted for, otherwise we'll think
7633 		 * we have more space than we actually do.
7634 		 */
7635 		exclude_super_stripes(root, cache);
7636 
7637 		/*
7638 		 * check for two cases, either we are full, and therefore
7639 		 * don't need to bother with the caching work since we won't
7640 		 * find any space, or we are empty, and we can just add all
7641 		 * the space in and be done with it.  This saves us _alot_ of
7642 		 * time, particularly in the full case.
7643 		 */
7644 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7645 			cache->last_byte_to_unpin = (u64)-1;
7646 			cache->cached = BTRFS_CACHE_FINISHED;
7647 			free_excluded_extents(root, cache);
7648 		} else if (btrfs_block_group_used(&cache->item) == 0) {
7649 			cache->last_byte_to_unpin = (u64)-1;
7650 			cache->cached = BTRFS_CACHE_FINISHED;
7651 			add_new_free_space(cache, root->fs_info,
7652 					   found_key.objectid,
7653 					   found_key.objectid +
7654 					   found_key.offset);
7655 			free_excluded_extents(root, cache);
7656 		}
7657 
7658 		ret = update_space_info(info, cache->flags, found_key.offset,
7659 					btrfs_block_group_used(&cache->item),
7660 					&space_info);
7661 		BUG_ON(ret); /* -ENOMEM */
7662 		cache->space_info = space_info;
7663 		spin_lock(&cache->space_info->lock);
7664 		cache->space_info->bytes_readonly += cache->bytes_super;
7665 		spin_unlock(&cache->space_info->lock);
7666 
7667 		__link_block_group(space_info, cache);
7668 
7669 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7670 		BUG_ON(ret); /* Logic error */
7671 
7672 		set_avail_alloc_bits(root->fs_info, cache->flags);
7673 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7674 			set_block_group_ro(cache, 1);
7675 	}
7676 
7677 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7678 		if (!(get_alloc_profile(root, space_info->flags) &
7679 		      (BTRFS_BLOCK_GROUP_RAID10 |
7680 		       BTRFS_BLOCK_GROUP_RAID1 |
7681 		       BTRFS_BLOCK_GROUP_DUP)))
7682 			continue;
7683 		/*
7684 		 * avoid allocating from un-mirrored block group if there are
7685 		 * mirrored block groups.
7686 		 */
7687 		list_for_each_entry(cache, &space_info->block_groups[3], list)
7688 			set_block_group_ro(cache, 1);
7689 		list_for_each_entry(cache, &space_info->block_groups[4], list)
7690 			set_block_group_ro(cache, 1);
7691 	}
7692 
7693 	init_global_block_rsv(info);
7694 	ret = 0;
7695 error:
7696 	btrfs_free_path(path);
7697 	return ret;
7698 }
7699 
7700 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7701 			   struct btrfs_root *root, u64 bytes_used,
7702 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7703 			   u64 size)
7704 {
7705 	int ret;
7706 	struct btrfs_root *extent_root;
7707 	struct btrfs_block_group_cache *cache;
7708 
7709 	extent_root = root->fs_info->extent_root;
7710 
7711 	root->fs_info->last_trans_log_full_commit = trans->transid;
7712 
7713 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7714 	if (!cache)
7715 		return -ENOMEM;
7716 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7717 					GFP_NOFS);
7718 	if (!cache->free_space_ctl) {
7719 		kfree(cache);
7720 		return -ENOMEM;
7721 	}
7722 
7723 	cache->key.objectid = chunk_offset;
7724 	cache->key.offset = size;
7725 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7726 	cache->sectorsize = root->sectorsize;
7727 	cache->fs_info = root->fs_info;
7728 
7729 	atomic_set(&cache->count, 1);
7730 	spin_lock_init(&cache->lock);
7731 	INIT_LIST_HEAD(&cache->list);
7732 	INIT_LIST_HEAD(&cache->cluster_list);
7733 
7734 	btrfs_init_free_space_ctl(cache);
7735 
7736 	btrfs_set_block_group_used(&cache->item, bytes_used);
7737 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7738 	cache->flags = type;
7739 	btrfs_set_block_group_flags(&cache->item, type);
7740 
7741 	cache->last_byte_to_unpin = (u64)-1;
7742 	cache->cached = BTRFS_CACHE_FINISHED;
7743 	exclude_super_stripes(root, cache);
7744 
7745 	add_new_free_space(cache, root->fs_info, chunk_offset,
7746 			   chunk_offset + size);
7747 
7748 	free_excluded_extents(root, cache);
7749 
7750 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7751 				&cache->space_info);
7752 	BUG_ON(ret); /* -ENOMEM */
7753 	update_global_block_rsv(root->fs_info);
7754 
7755 	spin_lock(&cache->space_info->lock);
7756 	cache->space_info->bytes_readonly += cache->bytes_super;
7757 	spin_unlock(&cache->space_info->lock);
7758 
7759 	__link_block_group(cache->space_info, cache);
7760 
7761 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7762 	BUG_ON(ret); /* Logic error */
7763 
7764 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7765 				sizeof(cache->item));
7766 	if (ret) {
7767 		btrfs_abort_transaction(trans, extent_root, ret);
7768 		return ret;
7769 	}
7770 
7771 	set_avail_alloc_bits(extent_root->fs_info, type);
7772 
7773 	return 0;
7774 }
7775 
7776 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7777 {
7778 	u64 extra_flags = chunk_to_extended(flags) &
7779 				BTRFS_EXTENDED_PROFILE_MASK;
7780 
7781 	if (flags & BTRFS_BLOCK_GROUP_DATA)
7782 		fs_info->avail_data_alloc_bits &= ~extra_flags;
7783 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
7784 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7785 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7786 		fs_info->avail_system_alloc_bits &= ~extra_flags;
7787 }
7788 
7789 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7790 			     struct btrfs_root *root, u64 group_start)
7791 {
7792 	struct btrfs_path *path;
7793 	struct btrfs_block_group_cache *block_group;
7794 	struct btrfs_free_cluster *cluster;
7795 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7796 	struct btrfs_key key;
7797 	struct inode *inode;
7798 	int ret;
7799 	int index;
7800 	int factor;
7801 
7802 	root = root->fs_info->extent_root;
7803 
7804 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7805 	BUG_ON(!block_group);
7806 	BUG_ON(!block_group->ro);
7807 
7808 	/*
7809 	 * Free the reserved super bytes from this block group before
7810 	 * remove it.
7811 	 */
7812 	free_excluded_extents(root, block_group);
7813 
7814 	memcpy(&key, &block_group->key, sizeof(key));
7815 	index = get_block_group_index(block_group);
7816 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7817 				  BTRFS_BLOCK_GROUP_RAID1 |
7818 				  BTRFS_BLOCK_GROUP_RAID10))
7819 		factor = 2;
7820 	else
7821 		factor = 1;
7822 
7823 	/* make sure this block group isn't part of an allocation cluster */
7824 	cluster = &root->fs_info->data_alloc_cluster;
7825 	spin_lock(&cluster->refill_lock);
7826 	btrfs_return_cluster_to_free_space(block_group, cluster);
7827 	spin_unlock(&cluster->refill_lock);
7828 
7829 	/*
7830 	 * make sure this block group isn't part of a metadata
7831 	 * allocation cluster
7832 	 */
7833 	cluster = &root->fs_info->meta_alloc_cluster;
7834 	spin_lock(&cluster->refill_lock);
7835 	btrfs_return_cluster_to_free_space(block_group, cluster);
7836 	spin_unlock(&cluster->refill_lock);
7837 
7838 	path = btrfs_alloc_path();
7839 	if (!path) {
7840 		ret = -ENOMEM;
7841 		goto out;
7842 	}
7843 
7844 	inode = lookup_free_space_inode(tree_root, block_group, path);
7845 	if (!IS_ERR(inode)) {
7846 		ret = btrfs_orphan_add(trans, inode);
7847 		if (ret) {
7848 			btrfs_add_delayed_iput(inode);
7849 			goto out;
7850 		}
7851 		clear_nlink(inode);
7852 		/* One for the block groups ref */
7853 		spin_lock(&block_group->lock);
7854 		if (block_group->iref) {
7855 			block_group->iref = 0;
7856 			block_group->inode = NULL;
7857 			spin_unlock(&block_group->lock);
7858 			iput(inode);
7859 		} else {
7860 			spin_unlock(&block_group->lock);
7861 		}
7862 		/* One for our lookup ref */
7863 		btrfs_add_delayed_iput(inode);
7864 	}
7865 
7866 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7867 	key.offset = block_group->key.objectid;
7868 	key.type = 0;
7869 
7870 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7871 	if (ret < 0)
7872 		goto out;
7873 	if (ret > 0)
7874 		btrfs_release_path(path);
7875 	if (ret == 0) {
7876 		ret = btrfs_del_item(trans, tree_root, path);
7877 		if (ret)
7878 			goto out;
7879 		btrfs_release_path(path);
7880 	}
7881 
7882 	spin_lock(&root->fs_info->block_group_cache_lock);
7883 	rb_erase(&block_group->cache_node,
7884 		 &root->fs_info->block_group_cache_tree);
7885 	spin_unlock(&root->fs_info->block_group_cache_lock);
7886 
7887 	down_write(&block_group->space_info->groups_sem);
7888 	/*
7889 	 * we must use list_del_init so people can check to see if they
7890 	 * are still on the list after taking the semaphore
7891 	 */
7892 	list_del_init(&block_group->list);
7893 	if (list_empty(&block_group->space_info->block_groups[index]))
7894 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
7895 	up_write(&block_group->space_info->groups_sem);
7896 
7897 	if (block_group->cached == BTRFS_CACHE_STARTED)
7898 		wait_block_group_cache_done(block_group);
7899 
7900 	btrfs_remove_free_space_cache(block_group);
7901 
7902 	spin_lock(&block_group->space_info->lock);
7903 	block_group->space_info->total_bytes -= block_group->key.offset;
7904 	block_group->space_info->bytes_readonly -= block_group->key.offset;
7905 	block_group->space_info->disk_total -= block_group->key.offset * factor;
7906 	spin_unlock(&block_group->space_info->lock);
7907 
7908 	memcpy(&key, &block_group->key, sizeof(key));
7909 
7910 	btrfs_clear_space_info_full(root->fs_info);
7911 
7912 	btrfs_put_block_group(block_group);
7913 	btrfs_put_block_group(block_group);
7914 
7915 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7916 	if (ret > 0)
7917 		ret = -EIO;
7918 	if (ret < 0)
7919 		goto out;
7920 
7921 	ret = btrfs_del_item(trans, root, path);
7922 out:
7923 	btrfs_free_path(path);
7924 	return ret;
7925 }
7926 
7927 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7928 {
7929 	struct btrfs_space_info *space_info;
7930 	struct btrfs_super_block *disk_super;
7931 	u64 features;
7932 	u64 flags;
7933 	int mixed = 0;
7934 	int ret;
7935 
7936 	disk_super = fs_info->super_copy;
7937 	if (!btrfs_super_root(disk_super))
7938 		return 1;
7939 
7940 	features = btrfs_super_incompat_flags(disk_super);
7941 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7942 		mixed = 1;
7943 
7944 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
7945 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7946 	if (ret)
7947 		goto out;
7948 
7949 	if (mixed) {
7950 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7951 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7952 	} else {
7953 		flags = BTRFS_BLOCK_GROUP_METADATA;
7954 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7955 		if (ret)
7956 			goto out;
7957 
7958 		flags = BTRFS_BLOCK_GROUP_DATA;
7959 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7960 	}
7961 out:
7962 	return ret;
7963 }
7964 
7965 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7966 {
7967 	return unpin_extent_range(root, start, end);
7968 }
7969 
7970 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7971 			       u64 num_bytes, u64 *actual_bytes)
7972 {
7973 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7974 }
7975 
7976 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7977 {
7978 	struct btrfs_fs_info *fs_info = root->fs_info;
7979 	struct btrfs_block_group_cache *cache = NULL;
7980 	u64 group_trimmed;
7981 	u64 start;
7982 	u64 end;
7983 	u64 trimmed = 0;
7984 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
7985 	int ret = 0;
7986 
7987 	/*
7988 	 * try to trim all FS space, our block group may start from non-zero.
7989 	 */
7990 	if (range->len == total_bytes)
7991 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
7992 	else
7993 		cache = btrfs_lookup_block_group(fs_info, range->start);
7994 
7995 	while (cache) {
7996 		if (cache->key.objectid >= (range->start + range->len)) {
7997 			btrfs_put_block_group(cache);
7998 			break;
7999 		}
8000 
8001 		start = max(range->start, cache->key.objectid);
8002 		end = min(range->start + range->len,
8003 				cache->key.objectid + cache->key.offset);
8004 
8005 		if (end - start >= range->minlen) {
8006 			if (!block_group_cache_done(cache)) {
8007 				ret = cache_block_group(cache, NULL, root, 0);
8008 				if (!ret)
8009 					wait_block_group_cache_done(cache);
8010 			}
8011 			ret = btrfs_trim_block_group(cache,
8012 						     &group_trimmed,
8013 						     start,
8014 						     end,
8015 						     range->minlen);
8016 
8017 			trimmed += group_trimmed;
8018 			if (ret) {
8019 				btrfs_put_block_group(cache);
8020 				break;
8021 			}
8022 		}
8023 
8024 		cache = next_block_group(fs_info->tree_root, cache);
8025 	}
8026 
8027 	range->len = trimmed;
8028 	return ret;
8029 }
8030