xref: /linux/fs/btrfs/extent-tree.c (revision 25aee3debe0464f6c680173041fa3de30ec9ff54)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 
37 #undef SCRAMBLE_DELAYED_REFS
38 
39 /*
40  * control flags for do_chunk_alloc's force field
41  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
42  * if we really need one.
43  *
44  * CHUNK_ALLOC_LIMITED means to only try and allocate one
45  * if we have very few chunks already allocated.  This is
46  * used as part of the clustering code to help make sure
47  * we have a good pool of storage to cluster in, without
48  * filling the FS with empty chunks
49  *
50  * CHUNK_ALLOC_FORCE means it must try to allocate one
51  *
52  */
53 enum {
54 	CHUNK_ALLOC_NO_FORCE = 0,
55 	CHUNK_ALLOC_LIMITED = 1,
56 	CHUNK_ALLOC_FORCE = 2,
57 };
58 
59 /*
60  * Control how reservations are dealt with.
61  *
62  * RESERVE_FREE - freeing a reservation.
63  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
64  *   ENOSPC accounting
65  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
66  *   bytes_may_use as the ENOSPC accounting is done elsewhere
67  */
68 enum {
69 	RESERVE_FREE = 0,
70 	RESERVE_ALLOC = 1,
71 	RESERVE_ALLOC_NO_ACCOUNT = 2,
72 };
73 
74 static int update_block_group(struct btrfs_trans_handle *trans,
75 			      struct btrfs_root *root,
76 			      u64 bytenr, u64 num_bytes, int alloc);
77 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
78 				struct btrfs_root *root,
79 				u64 bytenr, u64 num_bytes, u64 parent,
80 				u64 root_objectid, u64 owner_objectid,
81 				u64 owner_offset, int refs_to_drop,
82 				struct btrfs_delayed_extent_op *extra_op);
83 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
84 				    struct extent_buffer *leaf,
85 				    struct btrfs_extent_item *ei);
86 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
87 				      struct btrfs_root *root,
88 				      u64 parent, u64 root_objectid,
89 				      u64 flags, u64 owner, u64 offset,
90 				      struct btrfs_key *ins, int ref_mod);
91 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
92 				     struct btrfs_root *root,
93 				     u64 parent, u64 root_objectid,
94 				     u64 flags, struct btrfs_disk_key *key,
95 				     int level, struct btrfs_key *ins);
96 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97 			  struct btrfs_root *extent_root, u64 alloc_bytes,
98 			  u64 flags, int force);
99 static int find_next_key(struct btrfs_path *path, int level,
100 			 struct btrfs_key *key);
101 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
102 			    int dump_block_groups);
103 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
104 				       u64 num_bytes, int reserve);
105 
106 static noinline int
107 block_group_cache_done(struct btrfs_block_group_cache *cache)
108 {
109 	smp_mb();
110 	return cache->cached == BTRFS_CACHE_FINISHED;
111 }
112 
113 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
114 {
115 	return (cache->flags & bits) == bits;
116 }
117 
118 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
119 {
120 	atomic_inc(&cache->count);
121 }
122 
123 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
124 {
125 	if (atomic_dec_and_test(&cache->count)) {
126 		WARN_ON(cache->pinned > 0);
127 		WARN_ON(cache->reserved > 0);
128 		kfree(cache->free_space_ctl);
129 		kfree(cache);
130 	}
131 }
132 
133 /*
134  * this adds the block group to the fs_info rb tree for the block group
135  * cache
136  */
137 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
138 				struct btrfs_block_group_cache *block_group)
139 {
140 	struct rb_node **p;
141 	struct rb_node *parent = NULL;
142 	struct btrfs_block_group_cache *cache;
143 
144 	spin_lock(&info->block_group_cache_lock);
145 	p = &info->block_group_cache_tree.rb_node;
146 
147 	while (*p) {
148 		parent = *p;
149 		cache = rb_entry(parent, struct btrfs_block_group_cache,
150 				 cache_node);
151 		if (block_group->key.objectid < cache->key.objectid) {
152 			p = &(*p)->rb_left;
153 		} else if (block_group->key.objectid > cache->key.objectid) {
154 			p = &(*p)->rb_right;
155 		} else {
156 			spin_unlock(&info->block_group_cache_lock);
157 			return -EEXIST;
158 		}
159 	}
160 
161 	rb_link_node(&block_group->cache_node, parent, p);
162 	rb_insert_color(&block_group->cache_node,
163 			&info->block_group_cache_tree);
164 	spin_unlock(&info->block_group_cache_lock);
165 
166 	return 0;
167 }
168 
169 /*
170  * This will return the block group at or after bytenr if contains is 0, else
171  * it will return the block group that contains the bytenr
172  */
173 static struct btrfs_block_group_cache *
174 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
175 			      int contains)
176 {
177 	struct btrfs_block_group_cache *cache, *ret = NULL;
178 	struct rb_node *n;
179 	u64 end, start;
180 
181 	spin_lock(&info->block_group_cache_lock);
182 	n = info->block_group_cache_tree.rb_node;
183 
184 	while (n) {
185 		cache = rb_entry(n, struct btrfs_block_group_cache,
186 				 cache_node);
187 		end = cache->key.objectid + cache->key.offset - 1;
188 		start = cache->key.objectid;
189 
190 		if (bytenr < start) {
191 			if (!contains && (!ret || start < ret->key.objectid))
192 				ret = cache;
193 			n = n->rb_left;
194 		} else if (bytenr > start) {
195 			if (contains && bytenr <= end) {
196 				ret = cache;
197 				break;
198 			}
199 			n = n->rb_right;
200 		} else {
201 			ret = cache;
202 			break;
203 		}
204 	}
205 	if (ret)
206 		btrfs_get_block_group(ret);
207 	spin_unlock(&info->block_group_cache_lock);
208 
209 	return ret;
210 }
211 
212 static int add_excluded_extent(struct btrfs_root *root,
213 			       u64 start, u64 num_bytes)
214 {
215 	u64 end = start + num_bytes - 1;
216 	set_extent_bits(&root->fs_info->freed_extents[0],
217 			start, end, EXTENT_UPTODATE, GFP_NOFS);
218 	set_extent_bits(&root->fs_info->freed_extents[1],
219 			start, end, EXTENT_UPTODATE, GFP_NOFS);
220 	return 0;
221 }
222 
223 static void free_excluded_extents(struct btrfs_root *root,
224 				  struct btrfs_block_group_cache *cache)
225 {
226 	u64 start, end;
227 
228 	start = cache->key.objectid;
229 	end = start + cache->key.offset - 1;
230 
231 	clear_extent_bits(&root->fs_info->freed_extents[0],
232 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
233 	clear_extent_bits(&root->fs_info->freed_extents[1],
234 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
235 }
236 
237 static int exclude_super_stripes(struct btrfs_root *root,
238 				 struct btrfs_block_group_cache *cache)
239 {
240 	u64 bytenr;
241 	u64 *logical;
242 	int stripe_len;
243 	int i, nr, ret;
244 
245 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
246 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
247 		cache->bytes_super += stripe_len;
248 		ret = add_excluded_extent(root, cache->key.objectid,
249 					  stripe_len);
250 		BUG_ON(ret); /* -ENOMEM */
251 	}
252 
253 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
254 		bytenr = btrfs_sb_offset(i);
255 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
256 				       cache->key.objectid, bytenr,
257 				       0, &logical, &nr, &stripe_len);
258 		BUG_ON(ret); /* -ENOMEM */
259 
260 		while (nr--) {
261 			cache->bytes_super += stripe_len;
262 			ret = add_excluded_extent(root, logical[nr],
263 						  stripe_len);
264 			BUG_ON(ret); /* -ENOMEM */
265 		}
266 
267 		kfree(logical);
268 	}
269 	return 0;
270 }
271 
272 static struct btrfs_caching_control *
273 get_caching_control(struct btrfs_block_group_cache *cache)
274 {
275 	struct btrfs_caching_control *ctl;
276 
277 	spin_lock(&cache->lock);
278 	if (cache->cached != BTRFS_CACHE_STARTED) {
279 		spin_unlock(&cache->lock);
280 		return NULL;
281 	}
282 
283 	/* We're loading it the fast way, so we don't have a caching_ctl. */
284 	if (!cache->caching_ctl) {
285 		spin_unlock(&cache->lock);
286 		return NULL;
287 	}
288 
289 	ctl = cache->caching_ctl;
290 	atomic_inc(&ctl->count);
291 	spin_unlock(&cache->lock);
292 	return ctl;
293 }
294 
295 static void put_caching_control(struct btrfs_caching_control *ctl)
296 {
297 	if (atomic_dec_and_test(&ctl->count))
298 		kfree(ctl);
299 }
300 
301 /*
302  * this is only called by cache_block_group, since we could have freed extents
303  * we need to check the pinned_extents for any extents that can't be used yet
304  * since their free space will be released as soon as the transaction commits.
305  */
306 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
307 			      struct btrfs_fs_info *info, u64 start, u64 end)
308 {
309 	u64 extent_start, extent_end, size, total_added = 0;
310 	int ret;
311 
312 	while (start < end) {
313 		ret = find_first_extent_bit(info->pinned_extents, start,
314 					    &extent_start, &extent_end,
315 					    EXTENT_DIRTY | EXTENT_UPTODATE);
316 		if (ret)
317 			break;
318 
319 		if (extent_start <= start) {
320 			start = extent_end + 1;
321 		} else if (extent_start > start && extent_start < end) {
322 			size = extent_start - start;
323 			total_added += size;
324 			ret = btrfs_add_free_space(block_group, start,
325 						   size);
326 			BUG_ON(ret); /* -ENOMEM or logic error */
327 			start = extent_end + 1;
328 		} else {
329 			break;
330 		}
331 	}
332 
333 	if (start < end) {
334 		size = end - start;
335 		total_added += size;
336 		ret = btrfs_add_free_space(block_group, start, size);
337 		BUG_ON(ret); /* -ENOMEM or logic error */
338 	}
339 
340 	return total_added;
341 }
342 
343 static noinline void caching_thread(struct btrfs_work *work)
344 {
345 	struct btrfs_block_group_cache *block_group;
346 	struct btrfs_fs_info *fs_info;
347 	struct btrfs_caching_control *caching_ctl;
348 	struct btrfs_root *extent_root;
349 	struct btrfs_path *path;
350 	struct extent_buffer *leaf;
351 	struct btrfs_key key;
352 	u64 total_found = 0;
353 	u64 last = 0;
354 	u32 nritems;
355 	int ret = 0;
356 
357 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
358 	block_group = caching_ctl->block_group;
359 	fs_info = block_group->fs_info;
360 	extent_root = fs_info->extent_root;
361 
362 	path = btrfs_alloc_path();
363 	if (!path)
364 		goto out;
365 
366 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
367 
368 	/*
369 	 * We don't want to deadlock with somebody trying to allocate a new
370 	 * extent for the extent root while also trying to search the extent
371 	 * root to add free space.  So we skip locking and search the commit
372 	 * root, since its read-only
373 	 */
374 	path->skip_locking = 1;
375 	path->search_commit_root = 1;
376 	path->reada = 1;
377 
378 	key.objectid = last;
379 	key.offset = 0;
380 	key.type = BTRFS_EXTENT_ITEM_KEY;
381 again:
382 	mutex_lock(&caching_ctl->mutex);
383 	/* need to make sure the commit_root doesn't disappear */
384 	down_read(&fs_info->extent_commit_sem);
385 
386 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
387 	if (ret < 0)
388 		goto err;
389 
390 	leaf = path->nodes[0];
391 	nritems = btrfs_header_nritems(leaf);
392 
393 	while (1) {
394 		if (btrfs_fs_closing(fs_info) > 1) {
395 			last = (u64)-1;
396 			break;
397 		}
398 
399 		if (path->slots[0] < nritems) {
400 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
401 		} else {
402 			ret = find_next_key(path, 0, &key);
403 			if (ret)
404 				break;
405 
406 			if (need_resched() ||
407 			    btrfs_next_leaf(extent_root, path)) {
408 				caching_ctl->progress = last;
409 				btrfs_release_path(path);
410 				up_read(&fs_info->extent_commit_sem);
411 				mutex_unlock(&caching_ctl->mutex);
412 				cond_resched();
413 				goto again;
414 			}
415 			leaf = path->nodes[0];
416 			nritems = btrfs_header_nritems(leaf);
417 			continue;
418 		}
419 
420 		if (key.objectid < block_group->key.objectid) {
421 			path->slots[0]++;
422 			continue;
423 		}
424 
425 		if (key.objectid >= block_group->key.objectid +
426 		    block_group->key.offset)
427 			break;
428 
429 		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
430 			total_found += add_new_free_space(block_group,
431 							  fs_info, last,
432 							  key.objectid);
433 			last = key.objectid + key.offset;
434 
435 			if (total_found > (1024 * 1024 * 2)) {
436 				total_found = 0;
437 				wake_up(&caching_ctl->wait);
438 			}
439 		}
440 		path->slots[0]++;
441 	}
442 	ret = 0;
443 
444 	total_found += add_new_free_space(block_group, fs_info, last,
445 					  block_group->key.objectid +
446 					  block_group->key.offset);
447 	caching_ctl->progress = (u64)-1;
448 
449 	spin_lock(&block_group->lock);
450 	block_group->caching_ctl = NULL;
451 	block_group->cached = BTRFS_CACHE_FINISHED;
452 	spin_unlock(&block_group->lock);
453 
454 err:
455 	btrfs_free_path(path);
456 	up_read(&fs_info->extent_commit_sem);
457 
458 	free_excluded_extents(extent_root, block_group);
459 
460 	mutex_unlock(&caching_ctl->mutex);
461 out:
462 	wake_up(&caching_ctl->wait);
463 
464 	put_caching_control(caching_ctl);
465 	btrfs_put_block_group(block_group);
466 }
467 
468 static int cache_block_group(struct btrfs_block_group_cache *cache,
469 			     struct btrfs_trans_handle *trans,
470 			     struct btrfs_root *root,
471 			     int load_cache_only)
472 {
473 	DEFINE_WAIT(wait);
474 	struct btrfs_fs_info *fs_info = cache->fs_info;
475 	struct btrfs_caching_control *caching_ctl;
476 	int ret = 0;
477 
478 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
479 	if (!caching_ctl)
480 		return -ENOMEM;
481 
482 	INIT_LIST_HEAD(&caching_ctl->list);
483 	mutex_init(&caching_ctl->mutex);
484 	init_waitqueue_head(&caching_ctl->wait);
485 	caching_ctl->block_group = cache;
486 	caching_ctl->progress = cache->key.objectid;
487 	atomic_set(&caching_ctl->count, 1);
488 	caching_ctl->work.func = caching_thread;
489 
490 	spin_lock(&cache->lock);
491 	/*
492 	 * This should be a rare occasion, but this could happen I think in the
493 	 * case where one thread starts to load the space cache info, and then
494 	 * some other thread starts a transaction commit which tries to do an
495 	 * allocation while the other thread is still loading the space cache
496 	 * info.  The previous loop should have kept us from choosing this block
497 	 * group, but if we've moved to the state where we will wait on caching
498 	 * block groups we need to first check if we're doing a fast load here,
499 	 * so we can wait for it to finish, otherwise we could end up allocating
500 	 * from a block group who's cache gets evicted for one reason or
501 	 * another.
502 	 */
503 	while (cache->cached == BTRFS_CACHE_FAST) {
504 		struct btrfs_caching_control *ctl;
505 
506 		ctl = cache->caching_ctl;
507 		atomic_inc(&ctl->count);
508 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
509 		spin_unlock(&cache->lock);
510 
511 		schedule();
512 
513 		finish_wait(&ctl->wait, &wait);
514 		put_caching_control(ctl);
515 		spin_lock(&cache->lock);
516 	}
517 
518 	if (cache->cached != BTRFS_CACHE_NO) {
519 		spin_unlock(&cache->lock);
520 		kfree(caching_ctl);
521 		return 0;
522 	}
523 	WARN_ON(cache->caching_ctl);
524 	cache->caching_ctl = caching_ctl;
525 	cache->cached = BTRFS_CACHE_FAST;
526 	spin_unlock(&cache->lock);
527 
528 	/*
529 	 * We can't do the read from on-disk cache during a commit since we need
530 	 * to have the normal tree locking.  Also if we are currently trying to
531 	 * allocate blocks for the tree root we can't do the fast caching since
532 	 * we likely hold important locks.
533 	 */
534 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
535 		ret = load_free_space_cache(fs_info, cache);
536 
537 		spin_lock(&cache->lock);
538 		if (ret == 1) {
539 			cache->caching_ctl = NULL;
540 			cache->cached = BTRFS_CACHE_FINISHED;
541 			cache->last_byte_to_unpin = (u64)-1;
542 		} else {
543 			if (load_cache_only) {
544 				cache->caching_ctl = NULL;
545 				cache->cached = BTRFS_CACHE_NO;
546 			} else {
547 				cache->cached = BTRFS_CACHE_STARTED;
548 			}
549 		}
550 		spin_unlock(&cache->lock);
551 		wake_up(&caching_ctl->wait);
552 		if (ret == 1) {
553 			put_caching_control(caching_ctl);
554 			free_excluded_extents(fs_info->extent_root, cache);
555 			return 0;
556 		}
557 	} else {
558 		/*
559 		 * We are not going to do the fast caching, set cached to the
560 		 * appropriate value and wakeup any waiters.
561 		 */
562 		spin_lock(&cache->lock);
563 		if (load_cache_only) {
564 			cache->caching_ctl = NULL;
565 			cache->cached = BTRFS_CACHE_NO;
566 		} else {
567 			cache->cached = BTRFS_CACHE_STARTED;
568 		}
569 		spin_unlock(&cache->lock);
570 		wake_up(&caching_ctl->wait);
571 	}
572 
573 	if (load_cache_only) {
574 		put_caching_control(caching_ctl);
575 		return 0;
576 	}
577 
578 	down_write(&fs_info->extent_commit_sem);
579 	atomic_inc(&caching_ctl->count);
580 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
581 	up_write(&fs_info->extent_commit_sem);
582 
583 	btrfs_get_block_group(cache);
584 
585 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
586 
587 	return ret;
588 }
589 
590 /*
591  * return the block group that starts at or after bytenr
592  */
593 static struct btrfs_block_group_cache *
594 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
595 {
596 	struct btrfs_block_group_cache *cache;
597 
598 	cache = block_group_cache_tree_search(info, bytenr, 0);
599 
600 	return cache;
601 }
602 
603 /*
604  * return the block group that contains the given bytenr
605  */
606 struct btrfs_block_group_cache *btrfs_lookup_block_group(
607 						 struct btrfs_fs_info *info,
608 						 u64 bytenr)
609 {
610 	struct btrfs_block_group_cache *cache;
611 
612 	cache = block_group_cache_tree_search(info, bytenr, 1);
613 
614 	return cache;
615 }
616 
617 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 						  u64 flags)
619 {
620 	struct list_head *head = &info->space_info;
621 	struct btrfs_space_info *found;
622 
623 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
624 
625 	rcu_read_lock();
626 	list_for_each_entry_rcu(found, head, list) {
627 		if (found->flags & flags) {
628 			rcu_read_unlock();
629 			return found;
630 		}
631 	}
632 	rcu_read_unlock();
633 	return NULL;
634 }
635 
636 /*
637  * after adding space to the filesystem, we need to clear the full flags
638  * on all the space infos.
639  */
640 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
641 {
642 	struct list_head *head = &info->space_info;
643 	struct btrfs_space_info *found;
644 
645 	rcu_read_lock();
646 	list_for_each_entry_rcu(found, head, list)
647 		found->full = 0;
648 	rcu_read_unlock();
649 }
650 
651 static u64 div_factor(u64 num, int factor)
652 {
653 	if (factor == 10)
654 		return num;
655 	num *= factor;
656 	do_div(num, 10);
657 	return num;
658 }
659 
660 static u64 div_factor_fine(u64 num, int factor)
661 {
662 	if (factor == 100)
663 		return num;
664 	num *= factor;
665 	do_div(num, 100);
666 	return num;
667 }
668 
669 u64 btrfs_find_block_group(struct btrfs_root *root,
670 			   u64 search_start, u64 search_hint, int owner)
671 {
672 	struct btrfs_block_group_cache *cache;
673 	u64 used;
674 	u64 last = max(search_hint, search_start);
675 	u64 group_start = 0;
676 	int full_search = 0;
677 	int factor = 9;
678 	int wrapped = 0;
679 again:
680 	while (1) {
681 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
682 		if (!cache)
683 			break;
684 
685 		spin_lock(&cache->lock);
686 		last = cache->key.objectid + cache->key.offset;
687 		used = btrfs_block_group_used(&cache->item);
688 
689 		if ((full_search || !cache->ro) &&
690 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
691 			if (used + cache->pinned + cache->reserved <
692 			    div_factor(cache->key.offset, factor)) {
693 				group_start = cache->key.objectid;
694 				spin_unlock(&cache->lock);
695 				btrfs_put_block_group(cache);
696 				goto found;
697 			}
698 		}
699 		spin_unlock(&cache->lock);
700 		btrfs_put_block_group(cache);
701 		cond_resched();
702 	}
703 	if (!wrapped) {
704 		last = search_start;
705 		wrapped = 1;
706 		goto again;
707 	}
708 	if (!full_search && factor < 10) {
709 		last = search_start;
710 		full_search = 1;
711 		factor = 10;
712 		goto again;
713 	}
714 found:
715 	return group_start;
716 }
717 
718 /* simple helper to search for an existing extent at a given offset */
719 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
720 {
721 	int ret;
722 	struct btrfs_key key;
723 	struct btrfs_path *path;
724 
725 	path = btrfs_alloc_path();
726 	if (!path)
727 		return -ENOMEM;
728 
729 	key.objectid = start;
730 	key.offset = len;
731 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
732 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
733 				0, 0);
734 	btrfs_free_path(path);
735 	return ret;
736 }
737 
738 /*
739  * helper function to lookup reference count and flags of extent.
740  *
741  * the head node for delayed ref is used to store the sum of all the
742  * reference count modifications queued up in the rbtree. the head
743  * node may also store the extent flags to set. This way you can check
744  * to see what the reference count and extent flags would be if all of
745  * the delayed refs are not processed.
746  */
747 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
748 			     struct btrfs_root *root, u64 bytenr,
749 			     u64 num_bytes, u64 *refs, u64 *flags)
750 {
751 	struct btrfs_delayed_ref_head *head;
752 	struct btrfs_delayed_ref_root *delayed_refs;
753 	struct btrfs_path *path;
754 	struct btrfs_extent_item *ei;
755 	struct extent_buffer *leaf;
756 	struct btrfs_key key;
757 	u32 item_size;
758 	u64 num_refs;
759 	u64 extent_flags;
760 	int ret;
761 
762 	path = btrfs_alloc_path();
763 	if (!path)
764 		return -ENOMEM;
765 
766 	key.objectid = bytenr;
767 	key.type = BTRFS_EXTENT_ITEM_KEY;
768 	key.offset = num_bytes;
769 	if (!trans) {
770 		path->skip_locking = 1;
771 		path->search_commit_root = 1;
772 	}
773 again:
774 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
775 				&key, path, 0, 0);
776 	if (ret < 0)
777 		goto out_free;
778 
779 	if (ret == 0) {
780 		leaf = path->nodes[0];
781 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
782 		if (item_size >= sizeof(*ei)) {
783 			ei = btrfs_item_ptr(leaf, path->slots[0],
784 					    struct btrfs_extent_item);
785 			num_refs = btrfs_extent_refs(leaf, ei);
786 			extent_flags = btrfs_extent_flags(leaf, ei);
787 		} else {
788 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
789 			struct btrfs_extent_item_v0 *ei0;
790 			BUG_ON(item_size != sizeof(*ei0));
791 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
792 					     struct btrfs_extent_item_v0);
793 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
794 			/* FIXME: this isn't correct for data */
795 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
796 #else
797 			BUG();
798 #endif
799 		}
800 		BUG_ON(num_refs == 0);
801 	} else {
802 		num_refs = 0;
803 		extent_flags = 0;
804 		ret = 0;
805 	}
806 
807 	if (!trans)
808 		goto out;
809 
810 	delayed_refs = &trans->transaction->delayed_refs;
811 	spin_lock(&delayed_refs->lock);
812 	head = btrfs_find_delayed_ref_head(trans, bytenr);
813 	if (head) {
814 		if (!mutex_trylock(&head->mutex)) {
815 			atomic_inc(&head->node.refs);
816 			spin_unlock(&delayed_refs->lock);
817 
818 			btrfs_release_path(path);
819 
820 			/*
821 			 * Mutex was contended, block until it's released and try
822 			 * again
823 			 */
824 			mutex_lock(&head->mutex);
825 			mutex_unlock(&head->mutex);
826 			btrfs_put_delayed_ref(&head->node);
827 			goto again;
828 		}
829 		if (head->extent_op && head->extent_op->update_flags)
830 			extent_flags |= head->extent_op->flags_to_set;
831 		else
832 			BUG_ON(num_refs == 0);
833 
834 		num_refs += head->node.ref_mod;
835 		mutex_unlock(&head->mutex);
836 	}
837 	spin_unlock(&delayed_refs->lock);
838 out:
839 	WARN_ON(num_refs == 0);
840 	if (refs)
841 		*refs = num_refs;
842 	if (flags)
843 		*flags = extent_flags;
844 out_free:
845 	btrfs_free_path(path);
846 	return ret;
847 }
848 
849 /*
850  * Back reference rules.  Back refs have three main goals:
851  *
852  * 1) differentiate between all holders of references to an extent so that
853  *    when a reference is dropped we can make sure it was a valid reference
854  *    before freeing the extent.
855  *
856  * 2) Provide enough information to quickly find the holders of an extent
857  *    if we notice a given block is corrupted or bad.
858  *
859  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
860  *    maintenance.  This is actually the same as #2, but with a slightly
861  *    different use case.
862  *
863  * There are two kinds of back refs. The implicit back refs is optimized
864  * for pointers in non-shared tree blocks. For a given pointer in a block,
865  * back refs of this kind provide information about the block's owner tree
866  * and the pointer's key. These information allow us to find the block by
867  * b-tree searching. The full back refs is for pointers in tree blocks not
868  * referenced by their owner trees. The location of tree block is recorded
869  * in the back refs. Actually the full back refs is generic, and can be
870  * used in all cases the implicit back refs is used. The major shortcoming
871  * of the full back refs is its overhead. Every time a tree block gets
872  * COWed, we have to update back refs entry for all pointers in it.
873  *
874  * For a newly allocated tree block, we use implicit back refs for
875  * pointers in it. This means most tree related operations only involve
876  * implicit back refs. For a tree block created in old transaction, the
877  * only way to drop a reference to it is COW it. So we can detect the
878  * event that tree block loses its owner tree's reference and do the
879  * back refs conversion.
880  *
881  * When a tree block is COW'd through a tree, there are four cases:
882  *
883  * The reference count of the block is one and the tree is the block's
884  * owner tree. Nothing to do in this case.
885  *
886  * The reference count of the block is one and the tree is not the
887  * block's owner tree. In this case, full back refs is used for pointers
888  * in the block. Remove these full back refs, add implicit back refs for
889  * every pointers in the new block.
890  *
891  * The reference count of the block is greater than one and the tree is
892  * the block's owner tree. In this case, implicit back refs is used for
893  * pointers in the block. Add full back refs for every pointers in the
894  * block, increase lower level extents' reference counts. The original
895  * implicit back refs are entailed to the new block.
896  *
897  * The reference count of the block is greater than one and the tree is
898  * not the block's owner tree. Add implicit back refs for every pointer in
899  * the new block, increase lower level extents' reference count.
900  *
901  * Back Reference Key composing:
902  *
903  * The key objectid corresponds to the first byte in the extent,
904  * The key type is used to differentiate between types of back refs.
905  * There are different meanings of the key offset for different types
906  * of back refs.
907  *
908  * File extents can be referenced by:
909  *
910  * - multiple snapshots, subvolumes, or different generations in one subvol
911  * - different files inside a single subvolume
912  * - different offsets inside a file (bookend extents in file.c)
913  *
914  * The extent ref structure for the implicit back refs has fields for:
915  *
916  * - Objectid of the subvolume root
917  * - objectid of the file holding the reference
918  * - original offset in the file
919  * - how many bookend extents
920  *
921  * The key offset for the implicit back refs is hash of the first
922  * three fields.
923  *
924  * The extent ref structure for the full back refs has field for:
925  *
926  * - number of pointers in the tree leaf
927  *
928  * The key offset for the implicit back refs is the first byte of
929  * the tree leaf
930  *
931  * When a file extent is allocated, The implicit back refs is used.
932  * the fields are filled in:
933  *
934  *     (root_key.objectid, inode objectid, offset in file, 1)
935  *
936  * When a file extent is removed file truncation, we find the
937  * corresponding implicit back refs and check the following fields:
938  *
939  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
940  *
941  * Btree extents can be referenced by:
942  *
943  * - Different subvolumes
944  *
945  * Both the implicit back refs and the full back refs for tree blocks
946  * only consist of key. The key offset for the implicit back refs is
947  * objectid of block's owner tree. The key offset for the full back refs
948  * is the first byte of parent block.
949  *
950  * When implicit back refs is used, information about the lowest key and
951  * level of the tree block are required. These information are stored in
952  * tree block info structure.
953  */
954 
955 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
956 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
957 				  struct btrfs_root *root,
958 				  struct btrfs_path *path,
959 				  u64 owner, u32 extra_size)
960 {
961 	struct btrfs_extent_item *item;
962 	struct btrfs_extent_item_v0 *ei0;
963 	struct btrfs_extent_ref_v0 *ref0;
964 	struct btrfs_tree_block_info *bi;
965 	struct extent_buffer *leaf;
966 	struct btrfs_key key;
967 	struct btrfs_key found_key;
968 	u32 new_size = sizeof(*item);
969 	u64 refs;
970 	int ret;
971 
972 	leaf = path->nodes[0];
973 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
974 
975 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
976 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
977 			     struct btrfs_extent_item_v0);
978 	refs = btrfs_extent_refs_v0(leaf, ei0);
979 
980 	if (owner == (u64)-1) {
981 		while (1) {
982 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
983 				ret = btrfs_next_leaf(root, path);
984 				if (ret < 0)
985 					return ret;
986 				BUG_ON(ret > 0); /* Corruption */
987 				leaf = path->nodes[0];
988 			}
989 			btrfs_item_key_to_cpu(leaf, &found_key,
990 					      path->slots[0]);
991 			BUG_ON(key.objectid != found_key.objectid);
992 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
993 				path->slots[0]++;
994 				continue;
995 			}
996 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
997 					      struct btrfs_extent_ref_v0);
998 			owner = btrfs_ref_objectid_v0(leaf, ref0);
999 			break;
1000 		}
1001 	}
1002 	btrfs_release_path(path);
1003 
1004 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1005 		new_size += sizeof(*bi);
1006 
1007 	new_size -= sizeof(*ei0);
1008 	ret = btrfs_search_slot(trans, root, &key, path,
1009 				new_size + extra_size, 1);
1010 	if (ret < 0)
1011 		return ret;
1012 	BUG_ON(ret); /* Corruption */
1013 
1014 	btrfs_extend_item(trans, root, path, new_size);
1015 
1016 	leaf = path->nodes[0];
1017 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1018 	btrfs_set_extent_refs(leaf, item, refs);
1019 	/* FIXME: get real generation */
1020 	btrfs_set_extent_generation(leaf, item, 0);
1021 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1022 		btrfs_set_extent_flags(leaf, item,
1023 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1024 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1025 		bi = (struct btrfs_tree_block_info *)(item + 1);
1026 		/* FIXME: get first key of the block */
1027 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1028 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1029 	} else {
1030 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1031 	}
1032 	btrfs_mark_buffer_dirty(leaf);
1033 	return 0;
1034 }
1035 #endif
1036 
1037 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1038 {
1039 	u32 high_crc = ~(u32)0;
1040 	u32 low_crc = ~(u32)0;
1041 	__le64 lenum;
1042 
1043 	lenum = cpu_to_le64(root_objectid);
1044 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1045 	lenum = cpu_to_le64(owner);
1046 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1047 	lenum = cpu_to_le64(offset);
1048 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1049 
1050 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1051 }
1052 
1053 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1054 				     struct btrfs_extent_data_ref *ref)
1055 {
1056 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1057 				    btrfs_extent_data_ref_objectid(leaf, ref),
1058 				    btrfs_extent_data_ref_offset(leaf, ref));
1059 }
1060 
1061 static int match_extent_data_ref(struct extent_buffer *leaf,
1062 				 struct btrfs_extent_data_ref *ref,
1063 				 u64 root_objectid, u64 owner, u64 offset)
1064 {
1065 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1066 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1067 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1068 		return 0;
1069 	return 1;
1070 }
1071 
1072 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1073 					   struct btrfs_root *root,
1074 					   struct btrfs_path *path,
1075 					   u64 bytenr, u64 parent,
1076 					   u64 root_objectid,
1077 					   u64 owner, u64 offset)
1078 {
1079 	struct btrfs_key key;
1080 	struct btrfs_extent_data_ref *ref;
1081 	struct extent_buffer *leaf;
1082 	u32 nritems;
1083 	int ret;
1084 	int recow;
1085 	int err = -ENOENT;
1086 
1087 	key.objectid = bytenr;
1088 	if (parent) {
1089 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1090 		key.offset = parent;
1091 	} else {
1092 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1093 		key.offset = hash_extent_data_ref(root_objectid,
1094 						  owner, offset);
1095 	}
1096 again:
1097 	recow = 0;
1098 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1099 	if (ret < 0) {
1100 		err = ret;
1101 		goto fail;
1102 	}
1103 
1104 	if (parent) {
1105 		if (!ret)
1106 			return 0;
1107 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1108 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1109 		btrfs_release_path(path);
1110 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111 		if (ret < 0) {
1112 			err = ret;
1113 			goto fail;
1114 		}
1115 		if (!ret)
1116 			return 0;
1117 #endif
1118 		goto fail;
1119 	}
1120 
1121 	leaf = path->nodes[0];
1122 	nritems = btrfs_header_nritems(leaf);
1123 	while (1) {
1124 		if (path->slots[0] >= nritems) {
1125 			ret = btrfs_next_leaf(root, path);
1126 			if (ret < 0)
1127 				err = ret;
1128 			if (ret)
1129 				goto fail;
1130 
1131 			leaf = path->nodes[0];
1132 			nritems = btrfs_header_nritems(leaf);
1133 			recow = 1;
1134 		}
1135 
1136 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1137 		if (key.objectid != bytenr ||
1138 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1139 			goto fail;
1140 
1141 		ref = btrfs_item_ptr(leaf, path->slots[0],
1142 				     struct btrfs_extent_data_ref);
1143 
1144 		if (match_extent_data_ref(leaf, ref, root_objectid,
1145 					  owner, offset)) {
1146 			if (recow) {
1147 				btrfs_release_path(path);
1148 				goto again;
1149 			}
1150 			err = 0;
1151 			break;
1152 		}
1153 		path->slots[0]++;
1154 	}
1155 fail:
1156 	return err;
1157 }
1158 
1159 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1160 					   struct btrfs_root *root,
1161 					   struct btrfs_path *path,
1162 					   u64 bytenr, u64 parent,
1163 					   u64 root_objectid, u64 owner,
1164 					   u64 offset, int refs_to_add)
1165 {
1166 	struct btrfs_key key;
1167 	struct extent_buffer *leaf;
1168 	u32 size;
1169 	u32 num_refs;
1170 	int ret;
1171 
1172 	key.objectid = bytenr;
1173 	if (parent) {
1174 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1175 		key.offset = parent;
1176 		size = sizeof(struct btrfs_shared_data_ref);
1177 	} else {
1178 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1179 		key.offset = hash_extent_data_ref(root_objectid,
1180 						  owner, offset);
1181 		size = sizeof(struct btrfs_extent_data_ref);
1182 	}
1183 
1184 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1185 	if (ret && ret != -EEXIST)
1186 		goto fail;
1187 
1188 	leaf = path->nodes[0];
1189 	if (parent) {
1190 		struct btrfs_shared_data_ref *ref;
1191 		ref = btrfs_item_ptr(leaf, path->slots[0],
1192 				     struct btrfs_shared_data_ref);
1193 		if (ret == 0) {
1194 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1195 		} else {
1196 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1197 			num_refs += refs_to_add;
1198 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1199 		}
1200 	} else {
1201 		struct btrfs_extent_data_ref *ref;
1202 		while (ret == -EEXIST) {
1203 			ref = btrfs_item_ptr(leaf, path->slots[0],
1204 					     struct btrfs_extent_data_ref);
1205 			if (match_extent_data_ref(leaf, ref, root_objectid,
1206 						  owner, offset))
1207 				break;
1208 			btrfs_release_path(path);
1209 			key.offset++;
1210 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1211 						      size);
1212 			if (ret && ret != -EEXIST)
1213 				goto fail;
1214 
1215 			leaf = path->nodes[0];
1216 		}
1217 		ref = btrfs_item_ptr(leaf, path->slots[0],
1218 				     struct btrfs_extent_data_ref);
1219 		if (ret == 0) {
1220 			btrfs_set_extent_data_ref_root(leaf, ref,
1221 						       root_objectid);
1222 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1223 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1224 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1225 		} else {
1226 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1227 			num_refs += refs_to_add;
1228 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1229 		}
1230 	}
1231 	btrfs_mark_buffer_dirty(leaf);
1232 	ret = 0;
1233 fail:
1234 	btrfs_release_path(path);
1235 	return ret;
1236 }
1237 
1238 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1239 					   struct btrfs_root *root,
1240 					   struct btrfs_path *path,
1241 					   int refs_to_drop)
1242 {
1243 	struct btrfs_key key;
1244 	struct btrfs_extent_data_ref *ref1 = NULL;
1245 	struct btrfs_shared_data_ref *ref2 = NULL;
1246 	struct extent_buffer *leaf;
1247 	u32 num_refs = 0;
1248 	int ret = 0;
1249 
1250 	leaf = path->nodes[0];
1251 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1252 
1253 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1254 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1255 				      struct btrfs_extent_data_ref);
1256 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1257 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1258 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1259 				      struct btrfs_shared_data_ref);
1260 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1261 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1262 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1263 		struct btrfs_extent_ref_v0 *ref0;
1264 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1265 				      struct btrfs_extent_ref_v0);
1266 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1267 #endif
1268 	} else {
1269 		BUG();
1270 	}
1271 
1272 	BUG_ON(num_refs < refs_to_drop);
1273 	num_refs -= refs_to_drop;
1274 
1275 	if (num_refs == 0) {
1276 		ret = btrfs_del_item(trans, root, path);
1277 	} else {
1278 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1279 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1280 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1281 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1282 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1283 		else {
1284 			struct btrfs_extent_ref_v0 *ref0;
1285 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1286 					struct btrfs_extent_ref_v0);
1287 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1288 		}
1289 #endif
1290 		btrfs_mark_buffer_dirty(leaf);
1291 	}
1292 	return ret;
1293 }
1294 
1295 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1296 					  struct btrfs_path *path,
1297 					  struct btrfs_extent_inline_ref *iref)
1298 {
1299 	struct btrfs_key key;
1300 	struct extent_buffer *leaf;
1301 	struct btrfs_extent_data_ref *ref1;
1302 	struct btrfs_shared_data_ref *ref2;
1303 	u32 num_refs = 0;
1304 
1305 	leaf = path->nodes[0];
1306 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1307 	if (iref) {
1308 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1309 		    BTRFS_EXTENT_DATA_REF_KEY) {
1310 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1311 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1312 		} else {
1313 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1314 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1315 		}
1316 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1317 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1318 				      struct btrfs_extent_data_ref);
1319 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1320 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1321 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1322 				      struct btrfs_shared_data_ref);
1323 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1324 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1325 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1326 		struct btrfs_extent_ref_v0 *ref0;
1327 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1328 				      struct btrfs_extent_ref_v0);
1329 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1330 #endif
1331 	} else {
1332 		WARN_ON(1);
1333 	}
1334 	return num_refs;
1335 }
1336 
1337 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1338 					  struct btrfs_root *root,
1339 					  struct btrfs_path *path,
1340 					  u64 bytenr, u64 parent,
1341 					  u64 root_objectid)
1342 {
1343 	struct btrfs_key key;
1344 	int ret;
1345 
1346 	key.objectid = bytenr;
1347 	if (parent) {
1348 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1349 		key.offset = parent;
1350 	} else {
1351 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1352 		key.offset = root_objectid;
1353 	}
1354 
1355 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1356 	if (ret > 0)
1357 		ret = -ENOENT;
1358 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1359 	if (ret == -ENOENT && parent) {
1360 		btrfs_release_path(path);
1361 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1362 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1363 		if (ret > 0)
1364 			ret = -ENOENT;
1365 	}
1366 #endif
1367 	return ret;
1368 }
1369 
1370 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1371 					  struct btrfs_root *root,
1372 					  struct btrfs_path *path,
1373 					  u64 bytenr, u64 parent,
1374 					  u64 root_objectid)
1375 {
1376 	struct btrfs_key key;
1377 	int ret;
1378 
1379 	key.objectid = bytenr;
1380 	if (parent) {
1381 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1382 		key.offset = parent;
1383 	} else {
1384 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1385 		key.offset = root_objectid;
1386 	}
1387 
1388 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1389 	btrfs_release_path(path);
1390 	return ret;
1391 }
1392 
1393 static inline int extent_ref_type(u64 parent, u64 owner)
1394 {
1395 	int type;
1396 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1397 		if (parent > 0)
1398 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1399 		else
1400 			type = BTRFS_TREE_BLOCK_REF_KEY;
1401 	} else {
1402 		if (parent > 0)
1403 			type = BTRFS_SHARED_DATA_REF_KEY;
1404 		else
1405 			type = BTRFS_EXTENT_DATA_REF_KEY;
1406 	}
1407 	return type;
1408 }
1409 
1410 static int find_next_key(struct btrfs_path *path, int level,
1411 			 struct btrfs_key *key)
1412 
1413 {
1414 	for (; level < BTRFS_MAX_LEVEL; level++) {
1415 		if (!path->nodes[level])
1416 			break;
1417 		if (path->slots[level] + 1 >=
1418 		    btrfs_header_nritems(path->nodes[level]))
1419 			continue;
1420 		if (level == 0)
1421 			btrfs_item_key_to_cpu(path->nodes[level], key,
1422 					      path->slots[level] + 1);
1423 		else
1424 			btrfs_node_key_to_cpu(path->nodes[level], key,
1425 					      path->slots[level] + 1);
1426 		return 0;
1427 	}
1428 	return 1;
1429 }
1430 
1431 /*
1432  * look for inline back ref. if back ref is found, *ref_ret is set
1433  * to the address of inline back ref, and 0 is returned.
1434  *
1435  * if back ref isn't found, *ref_ret is set to the address where it
1436  * should be inserted, and -ENOENT is returned.
1437  *
1438  * if insert is true and there are too many inline back refs, the path
1439  * points to the extent item, and -EAGAIN is returned.
1440  *
1441  * NOTE: inline back refs are ordered in the same way that back ref
1442  *	 items in the tree are ordered.
1443  */
1444 static noinline_for_stack
1445 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1446 				 struct btrfs_root *root,
1447 				 struct btrfs_path *path,
1448 				 struct btrfs_extent_inline_ref **ref_ret,
1449 				 u64 bytenr, u64 num_bytes,
1450 				 u64 parent, u64 root_objectid,
1451 				 u64 owner, u64 offset, int insert)
1452 {
1453 	struct btrfs_key key;
1454 	struct extent_buffer *leaf;
1455 	struct btrfs_extent_item *ei;
1456 	struct btrfs_extent_inline_ref *iref;
1457 	u64 flags;
1458 	u64 item_size;
1459 	unsigned long ptr;
1460 	unsigned long end;
1461 	int extra_size;
1462 	int type;
1463 	int want;
1464 	int ret;
1465 	int err = 0;
1466 
1467 	key.objectid = bytenr;
1468 	key.type = BTRFS_EXTENT_ITEM_KEY;
1469 	key.offset = num_bytes;
1470 
1471 	want = extent_ref_type(parent, owner);
1472 	if (insert) {
1473 		extra_size = btrfs_extent_inline_ref_size(want);
1474 		path->keep_locks = 1;
1475 	} else
1476 		extra_size = -1;
1477 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1478 	if (ret < 0) {
1479 		err = ret;
1480 		goto out;
1481 	}
1482 	if (ret && !insert) {
1483 		err = -ENOENT;
1484 		goto out;
1485 	}
1486 	BUG_ON(ret); /* Corruption */
1487 
1488 	leaf = path->nodes[0];
1489 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1490 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1491 	if (item_size < sizeof(*ei)) {
1492 		if (!insert) {
1493 			err = -ENOENT;
1494 			goto out;
1495 		}
1496 		ret = convert_extent_item_v0(trans, root, path, owner,
1497 					     extra_size);
1498 		if (ret < 0) {
1499 			err = ret;
1500 			goto out;
1501 		}
1502 		leaf = path->nodes[0];
1503 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1504 	}
1505 #endif
1506 	BUG_ON(item_size < sizeof(*ei));
1507 
1508 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1509 	flags = btrfs_extent_flags(leaf, ei);
1510 
1511 	ptr = (unsigned long)(ei + 1);
1512 	end = (unsigned long)ei + item_size;
1513 
1514 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1515 		ptr += sizeof(struct btrfs_tree_block_info);
1516 		BUG_ON(ptr > end);
1517 	} else {
1518 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1519 	}
1520 
1521 	err = -ENOENT;
1522 	while (1) {
1523 		if (ptr >= end) {
1524 			WARN_ON(ptr > end);
1525 			break;
1526 		}
1527 		iref = (struct btrfs_extent_inline_ref *)ptr;
1528 		type = btrfs_extent_inline_ref_type(leaf, iref);
1529 		if (want < type)
1530 			break;
1531 		if (want > type) {
1532 			ptr += btrfs_extent_inline_ref_size(type);
1533 			continue;
1534 		}
1535 
1536 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1537 			struct btrfs_extent_data_ref *dref;
1538 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1539 			if (match_extent_data_ref(leaf, dref, root_objectid,
1540 						  owner, offset)) {
1541 				err = 0;
1542 				break;
1543 			}
1544 			if (hash_extent_data_ref_item(leaf, dref) <
1545 			    hash_extent_data_ref(root_objectid, owner, offset))
1546 				break;
1547 		} else {
1548 			u64 ref_offset;
1549 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1550 			if (parent > 0) {
1551 				if (parent == ref_offset) {
1552 					err = 0;
1553 					break;
1554 				}
1555 				if (ref_offset < parent)
1556 					break;
1557 			} else {
1558 				if (root_objectid == ref_offset) {
1559 					err = 0;
1560 					break;
1561 				}
1562 				if (ref_offset < root_objectid)
1563 					break;
1564 			}
1565 		}
1566 		ptr += btrfs_extent_inline_ref_size(type);
1567 	}
1568 	if (err == -ENOENT && insert) {
1569 		if (item_size + extra_size >=
1570 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1571 			err = -EAGAIN;
1572 			goto out;
1573 		}
1574 		/*
1575 		 * To add new inline back ref, we have to make sure
1576 		 * there is no corresponding back ref item.
1577 		 * For simplicity, we just do not add new inline back
1578 		 * ref if there is any kind of item for this block
1579 		 */
1580 		if (find_next_key(path, 0, &key) == 0 &&
1581 		    key.objectid == bytenr &&
1582 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1583 			err = -EAGAIN;
1584 			goto out;
1585 		}
1586 	}
1587 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1588 out:
1589 	if (insert) {
1590 		path->keep_locks = 0;
1591 		btrfs_unlock_up_safe(path, 1);
1592 	}
1593 	return err;
1594 }
1595 
1596 /*
1597  * helper to add new inline back ref
1598  */
1599 static noinline_for_stack
1600 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1601 				 struct btrfs_root *root,
1602 				 struct btrfs_path *path,
1603 				 struct btrfs_extent_inline_ref *iref,
1604 				 u64 parent, u64 root_objectid,
1605 				 u64 owner, u64 offset, int refs_to_add,
1606 				 struct btrfs_delayed_extent_op *extent_op)
1607 {
1608 	struct extent_buffer *leaf;
1609 	struct btrfs_extent_item *ei;
1610 	unsigned long ptr;
1611 	unsigned long end;
1612 	unsigned long item_offset;
1613 	u64 refs;
1614 	int size;
1615 	int type;
1616 
1617 	leaf = path->nodes[0];
1618 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1619 	item_offset = (unsigned long)iref - (unsigned long)ei;
1620 
1621 	type = extent_ref_type(parent, owner);
1622 	size = btrfs_extent_inline_ref_size(type);
1623 
1624 	btrfs_extend_item(trans, root, path, size);
1625 
1626 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1627 	refs = btrfs_extent_refs(leaf, ei);
1628 	refs += refs_to_add;
1629 	btrfs_set_extent_refs(leaf, ei, refs);
1630 	if (extent_op)
1631 		__run_delayed_extent_op(extent_op, leaf, ei);
1632 
1633 	ptr = (unsigned long)ei + item_offset;
1634 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1635 	if (ptr < end - size)
1636 		memmove_extent_buffer(leaf, ptr + size, ptr,
1637 				      end - size - ptr);
1638 
1639 	iref = (struct btrfs_extent_inline_ref *)ptr;
1640 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1641 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1642 		struct btrfs_extent_data_ref *dref;
1643 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1644 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1645 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1646 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1647 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1648 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1649 		struct btrfs_shared_data_ref *sref;
1650 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1651 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1652 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1653 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1654 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1655 	} else {
1656 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1657 	}
1658 	btrfs_mark_buffer_dirty(leaf);
1659 }
1660 
1661 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1662 				 struct btrfs_root *root,
1663 				 struct btrfs_path *path,
1664 				 struct btrfs_extent_inline_ref **ref_ret,
1665 				 u64 bytenr, u64 num_bytes, u64 parent,
1666 				 u64 root_objectid, u64 owner, u64 offset)
1667 {
1668 	int ret;
1669 
1670 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1671 					   bytenr, num_bytes, parent,
1672 					   root_objectid, owner, offset, 0);
1673 	if (ret != -ENOENT)
1674 		return ret;
1675 
1676 	btrfs_release_path(path);
1677 	*ref_ret = NULL;
1678 
1679 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1680 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1681 					    root_objectid);
1682 	} else {
1683 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1684 					     root_objectid, owner, offset);
1685 	}
1686 	return ret;
1687 }
1688 
1689 /*
1690  * helper to update/remove inline back ref
1691  */
1692 static noinline_for_stack
1693 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1694 				  struct btrfs_root *root,
1695 				  struct btrfs_path *path,
1696 				  struct btrfs_extent_inline_ref *iref,
1697 				  int refs_to_mod,
1698 				  struct btrfs_delayed_extent_op *extent_op)
1699 {
1700 	struct extent_buffer *leaf;
1701 	struct btrfs_extent_item *ei;
1702 	struct btrfs_extent_data_ref *dref = NULL;
1703 	struct btrfs_shared_data_ref *sref = NULL;
1704 	unsigned long ptr;
1705 	unsigned long end;
1706 	u32 item_size;
1707 	int size;
1708 	int type;
1709 	u64 refs;
1710 
1711 	leaf = path->nodes[0];
1712 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1713 	refs = btrfs_extent_refs(leaf, ei);
1714 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1715 	refs += refs_to_mod;
1716 	btrfs_set_extent_refs(leaf, ei, refs);
1717 	if (extent_op)
1718 		__run_delayed_extent_op(extent_op, leaf, ei);
1719 
1720 	type = btrfs_extent_inline_ref_type(leaf, iref);
1721 
1722 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1723 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1724 		refs = btrfs_extent_data_ref_count(leaf, dref);
1725 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1726 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1727 		refs = btrfs_shared_data_ref_count(leaf, sref);
1728 	} else {
1729 		refs = 1;
1730 		BUG_ON(refs_to_mod != -1);
1731 	}
1732 
1733 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1734 	refs += refs_to_mod;
1735 
1736 	if (refs > 0) {
1737 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1738 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1739 		else
1740 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1741 	} else {
1742 		size =  btrfs_extent_inline_ref_size(type);
1743 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1744 		ptr = (unsigned long)iref;
1745 		end = (unsigned long)ei + item_size;
1746 		if (ptr + size < end)
1747 			memmove_extent_buffer(leaf, ptr, ptr + size,
1748 					      end - ptr - size);
1749 		item_size -= size;
1750 		btrfs_truncate_item(trans, root, path, item_size, 1);
1751 	}
1752 	btrfs_mark_buffer_dirty(leaf);
1753 }
1754 
1755 static noinline_for_stack
1756 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1757 				 struct btrfs_root *root,
1758 				 struct btrfs_path *path,
1759 				 u64 bytenr, u64 num_bytes, u64 parent,
1760 				 u64 root_objectid, u64 owner,
1761 				 u64 offset, int refs_to_add,
1762 				 struct btrfs_delayed_extent_op *extent_op)
1763 {
1764 	struct btrfs_extent_inline_ref *iref;
1765 	int ret;
1766 
1767 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1768 					   bytenr, num_bytes, parent,
1769 					   root_objectid, owner, offset, 1);
1770 	if (ret == 0) {
1771 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1772 		update_inline_extent_backref(trans, root, path, iref,
1773 					     refs_to_add, extent_op);
1774 	} else if (ret == -ENOENT) {
1775 		setup_inline_extent_backref(trans, root, path, iref, parent,
1776 					    root_objectid, owner, offset,
1777 					    refs_to_add, extent_op);
1778 		ret = 0;
1779 	}
1780 	return ret;
1781 }
1782 
1783 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1784 				 struct btrfs_root *root,
1785 				 struct btrfs_path *path,
1786 				 u64 bytenr, u64 parent, u64 root_objectid,
1787 				 u64 owner, u64 offset, int refs_to_add)
1788 {
1789 	int ret;
1790 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1791 		BUG_ON(refs_to_add != 1);
1792 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1793 					    parent, root_objectid);
1794 	} else {
1795 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1796 					     parent, root_objectid,
1797 					     owner, offset, refs_to_add);
1798 	}
1799 	return ret;
1800 }
1801 
1802 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1803 				 struct btrfs_root *root,
1804 				 struct btrfs_path *path,
1805 				 struct btrfs_extent_inline_ref *iref,
1806 				 int refs_to_drop, int is_data)
1807 {
1808 	int ret = 0;
1809 
1810 	BUG_ON(!is_data && refs_to_drop != 1);
1811 	if (iref) {
1812 		update_inline_extent_backref(trans, root, path, iref,
1813 					     -refs_to_drop, NULL);
1814 	} else if (is_data) {
1815 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1816 	} else {
1817 		ret = btrfs_del_item(trans, root, path);
1818 	}
1819 	return ret;
1820 }
1821 
1822 static int btrfs_issue_discard(struct block_device *bdev,
1823 				u64 start, u64 len)
1824 {
1825 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1826 }
1827 
1828 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1829 				u64 num_bytes, u64 *actual_bytes)
1830 {
1831 	int ret;
1832 	u64 discarded_bytes = 0;
1833 	struct btrfs_bio *bbio = NULL;
1834 
1835 
1836 	/* Tell the block device(s) that the sectors can be discarded */
1837 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1838 			      bytenr, &num_bytes, &bbio, 0);
1839 	/* Error condition is -ENOMEM */
1840 	if (!ret) {
1841 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1842 		int i;
1843 
1844 
1845 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1846 			if (!stripe->dev->can_discard)
1847 				continue;
1848 
1849 			ret = btrfs_issue_discard(stripe->dev->bdev,
1850 						  stripe->physical,
1851 						  stripe->length);
1852 			if (!ret)
1853 				discarded_bytes += stripe->length;
1854 			else if (ret != -EOPNOTSUPP)
1855 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1856 
1857 			/*
1858 			 * Just in case we get back EOPNOTSUPP for some reason,
1859 			 * just ignore the return value so we don't screw up
1860 			 * people calling discard_extent.
1861 			 */
1862 			ret = 0;
1863 		}
1864 		kfree(bbio);
1865 	}
1866 
1867 	if (actual_bytes)
1868 		*actual_bytes = discarded_bytes;
1869 
1870 
1871 	return ret;
1872 }
1873 
1874 /* Can return -ENOMEM */
1875 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1876 			 struct btrfs_root *root,
1877 			 u64 bytenr, u64 num_bytes, u64 parent,
1878 			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1879 {
1880 	int ret;
1881 	struct btrfs_fs_info *fs_info = root->fs_info;
1882 
1883 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1884 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1885 
1886 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1887 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1888 					num_bytes,
1889 					parent, root_objectid, (int)owner,
1890 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1891 	} else {
1892 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1893 					num_bytes,
1894 					parent, root_objectid, owner, offset,
1895 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1896 	}
1897 	return ret;
1898 }
1899 
1900 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1901 				  struct btrfs_root *root,
1902 				  u64 bytenr, u64 num_bytes,
1903 				  u64 parent, u64 root_objectid,
1904 				  u64 owner, u64 offset, int refs_to_add,
1905 				  struct btrfs_delayed_extent_op *extent_op)
1906 {
1907 	struct btrfs_path *path;
1908 	struct extent_buffer *leaf;
1909 	struct btrfs_extent_item *item;
1910 	u64 refs;
1911 	int ret;
1912 	int err = 0;
1913 
1914 	path = btrfs_alloc_path();
1915 	if (!path)
1916 		return -ENOMEM;
1917 
1918 	path->reada = 1;
1919 	path->leave_spinning = 1;
1920 	/* this will setup the path even if it fails to insert the back ref */
1921 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1922 					   path, bytenr, num_bytes, parent,
1923 					   root_objectid, owner, offset,
1924 					   refs_to_add, extent_op);
1925 	if (ret == 0)
1926 		goto out;
1927 
1928 	if (ret != -EAGAIN) {
1929 		err = ret;
1930 		goto out;
1931 	}
1932 
1933 	leaf = path->nodes[0];
1934 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1935 	refs = btrfs_extent_refs(leaf, item);
1936 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1937 	if (extent_op)
1938 		__run_delayed_extent_op(extent_op, leaf, item);
1939 
1940 	btrfs_mark_buffer_dirty(leaf);
1941 	btrfs_release_path(path);
1942 
1943 	path->reada = 1;
1944 	path->leave_spinning = 1;
1945 
1946 	/* now insert the actual backref */
1947 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1948 				    path, bytenr, parent, root_objectid,
1949 				    owner, offset, refs_to_add);
1950 	if (ret)
1951 		btrfs_abort_transaction(trans, root, ret);
1952 out:
1953 	btrfs_free_path(path);
1954 	return err;
1955 }
1956 
1957 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1958 				struct btrfs_root *root,
1959 				struct btrfs_delayed_ref_node *node,
1960 				struct btrfs_delayed_extent_op *extent_op,
1961 				int insert_reserved)
1962 {
1963 	int ret = 0;
1964 	struct btrfs_delayed_data_ref *ref;
1965 	struct btrfs_key ins;
1966 	u64 parent = 0;
1967 	u64 ref_root = 0;
1968 	u64 flags = 0;
1969 
1970 	ins.objectid = node->bytenr;
1971 	ins.offset = node->num_bytes;
1972 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1973 
1974 	ref = btrfs_delayed_node_to_data_ref(node);
1975 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1976 		parent = ref->parent;
1977 	else
1978 		ref_root = ref->root;
1979 
1980 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1981 		if (extent_op) {
1982 			BUG_ON(extent_op->update_key);
1983 			flags |= extent_op->flags_to_set;
1984 		}
1985 		ret = alloc_reserved_file_extent(trans, root,
1986 						 parent, ref_root, flags,
1987 						 ref->objectid, ref->offset,
1988 						 &ins, node->ref_mod);
1989 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1990 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1991 					     node->num_bytes, parent,
1992 					     ref_root, ref->objectid,
1993 					     ref->offset, node->ref_mod,
1994 					     extent_op);
1995 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1996 		ret = __btrfs_free_extent(trans, root, node->bytenr,
1997 					  node->num_bytes, parent,
1998 					  ref_root, ref->objectid,
1999 					  ref->offset, node->ref_mod,
2000 					  extent_op);
2001 	} else {
2002 		BUG();
2003 	}
2004 	return ret;
2005 }
2006 
2007 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2008 				    struct extent_buffer *leaf,
2009 				    struct btrfs_extent_item *ei)
2010 {
2011 	u64 flags = btrfs_extent_flags(leaf, ei);
2012 	if (extent_op->update_flags) {
2013 		flags |= extent_op->flags_to_set;
2014 		btrfs_set_extent_flags(leaf, ei, flags);
2015 	}
2016 
2017 	if (extent_op->update_key) {
2018 		struct btrfs_tree_block_info *bi;
2019 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2020 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2021 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2022 	}
2023 }
2024 
2025 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2026 				 struct btrfs_root *root,
2027 				 struct btrfs_delayed_ref_node *node,
2028 				 struct btrfs_delayed_extent_op *extent_op)
2029 {
2030 	struct btrfs_key key;
2031 	struct btrfs_path *path;
2032 	struct btrfs_extent_item *ei;
2033 	struct extent_buffer *leaf;
2034 	u32 item_size;
2035 	int ret;
2036 	int err = 0;
2037 
2038 	if (trans->aborted)
2039 		return 0;
2040 
2041 	path = btrfs_alloc_path();
2042 	if (!path)
2043 		return -ENOMEM;
2044 
2045 	key.objectid = node->bytenr;
2046 	key.type = BTRFS_EXTENT_ITEM_KEY;
2047 	key.offset = node->num_bytes;
2048 
2049 	path->reada = 1;
2050 	path->leave_spinning = 1;
2051 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2052 				path, 0, 1);
2053 	if (ret < 0) {
2054 		err = ret;
2055 		goto out;
2056 	}
2057 	if (ret > 0) {
2058 		err = -EIO;
2059 		goto out;
2060 	}
2061 
2062 	leaf = path->nodes[0];
2063 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2064 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2065 	if (item_size < sizeof(*ei)) {
2066 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2067 					     path, (u64)-1, 0);
2068 		if (ret < 0) {
2069 			err = ret;
2070 			goto out;
2071 		}
2072 		leaf = path->nodes[0];
2073 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2074 	}
2075 #endif
2076 	BUG_ON(item_size < sizeof(*ei));
2077 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2078 	__run_delayed_extent_op(extent_op, leaf, ei);
2079 
2080 	btrfs_mark_buffer_dirty(leaf);
2081 out:
2082 	btrfs_free_path(path);
2083 	return err;
2084 }
2085 
2086 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2087 				struct btrfs_root *root,
2088 				struct btrfs_delayed_ref_node *node,
2089 				struct btrfs_delayed_extent_op *extent_op,
2090 				int insert_reserved)
2091 {
2092 	int ret = 0;
2093 	struct btrfs_delayed_tree_ref *ref;
2094 	struct btrfs_key ins;
2095 	u64 parent = 0;
2096 	u64 ref_root = 0;
2097 
2098 	ins.objectid = node->bytenr;
2099 	ins.offset = node->num_bytes;
2100 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2101 
2102 	ref = btrfs_delayed_node_to_tree_ref(node);
2103 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2104 		parent = ref->parent;
2105 	else
2106 		ref_root = ref->root;
2107 
2108 	BUG_ON(node->ref_mod != 1);
2109 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2110 		BUG_ON(!extent_op || !extent_op->update_flags ||
2111 		       !extent_op->update_key);
2112 		ret = alloc_reserved_tree_block(trans, root,
2113 						parent, ref_root,
2114 						extent_op->flags_to_set,
2115 						&extent_op->key,
2116 						ref->level, &ins);
2117 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2118 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2119 					     node->num_bytes, parent, ref_root,
2120 					     ref->level, 0, 1, extent_op);
2121 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2122 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2123 					  node->num_bytes, parent, ref_root,
2124 					  ref->level, 0, 1, extent_op);
2125 	} else {
2126 		BUG();
2127 	}
2128 	return ret;
2129 }
2130 
2131 /* helper function to actually process a single delayed ref entry */
2132 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2133 			       struct btrfs_root *root,
2134 			       struct btrfs_delayed_ref_node *node,
2135 			       struct btrfs_delayed_extent_op *extent_op,
2136 			       int insert_reserved)
2137 {
2138 	int ret = 0;
2139 
2140 	if (trans->aborted)
2141 		return 0;
2142 
2143 	if (btrfs_delayed_ref_is_head(node)) {
2144 		struct btrfs_delayed_ref_head *head;
2145 		/*
2146 		 * we've hit the end of the chain and we were supposed
2147 		 * to insert this extent into the tree.  But, it got
2148 		 * deleted before we ever needed to insert it, so all
2149 		 * we have to do is clean up the accounting
2150 		 */
2151 		BUG_ON(extent_op);
2152 		head = btrfs_delayed_node_to_head(node);
2153 		if (insert_reserved) {
2154 			btrfs_pin_extent(root, node->bytenr,
2155 					 node->num_bytes, 1);
2156 			if (head->is_data) {
2157 				ret = btrfs_del_csums(trans, root,
2158 						      node->bytenr,
2159 						      node->num_bytes);
2160 			}
2161 		}
2162 		mutex_unlock(&head->mutex);
2163 		return ret;
2164 	}
2165 
2166 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2167 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2168 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2169 					   insert_reserved);
2170 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2171 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2172 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2173 					   insert_reserved);
2174 	else
2175 		BUG();
2176 	return ret;
2177 }
2178 
2179 static noinline struct btrfs_delayed_ref_node *
2180 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2181 {
2182 	struct rb_node *node;
2183 	struct btrfs_delayed_ref_node *ref;
2184 	int action = BTRFS_ADD_DELAYED_REF;
2185 again:
2186 	/*
2187 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2188 	 * this prevents ref count from going down to zero when
2189 	 * there still are pending delayed ref.
2190 	 */
2191 	node = rb_prev(&head->node.rb_node);
2192 	while (1) {
2193 		if (!node)
2194 			break;
2195 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2196 				rb_node);
2197 		if (ref->bytenr != head->node.bytenr)
2198 			break;
2199 		if (ref->action == action)
2200 			return ref;
2201 		node = rb_prev(node);
2202 	}
2203 	if (action == BTRFS_ADD_DELAYED_REF) {
2204 		action = BTRFS_DROP_DELAYED_REF;
2205 		goto again;
2206 	}
2207 	return NULL;
2208 }
2209 
2210 /*
2211  * Returns 0 on success or if called with an already aborted transaction.
2212  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2213  */
2214 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2215 				       struct btrfs_root *root,
2216 				       struct list_head *cluster)
2217 {
2218 	struct btrfs_delayed_ref_root *delayed_refs;
2219 	struct btrfs_delayed_ref_node *ref;
2220 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2221 	struct btrfs_delayed_extent_op *extent_op;
2222 	struct btrfs_fs_info *fs_info = root->fs_info;
2223 	int ret;
2224 	int count = 0;
2225 	int must_insert_reserved = 0;
2226 
2227 	delayed_refs = &trans->transaction->delayed_refs;
2228 	while (1) {
2229 		if (!locked_ref) {
2230 			/* pick a new head ref from the cluster list */
2231 			if (list_empty(cluster))
2232 				break;
2233 
2234 			locked_ref = list_entry(cluster->next,
2235 				     struct btrfs_delayed_ref_head, cluster);
2236 
2237 			/* grab the lock that says we are going to process
2238 			 * all the refs for this head */
2239 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2240 
2241 			/*
2242 			 * we may have dropped the spin lock to get the head
2243 			 * mutex lock, and that might have given someone else
2244 			 * time to free the head.  If that's true, it has been
2245 			 * removed from our list and we can move on.
2246 			 */
2247 			if (ret == -EAGAIN) {
2248 				locked_ref = NULL;
2249 				count++;
2250 				continue;
2251 			}
2252 		}
2253 
2254 		/*
2255 		 * locked_ref is the head node, so we have to go one
2256 		 * node back for any delayed ref updates
2257 		 */
2258 		ref = select_delayed_ref(locked_ref);
2259 
2260 		if (ref && ref->seq &&
2261 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2262 			/*
2263 			 * there are still refs with lower seq numbers in the
2264 			 * process of being added. Don't run this ref yet.
2265 			 */
2266 			list_del_init(&locked_ref->cluster);
2267 			mutex_unlock(&locked_ref->mutex);
2268 			locked_ref = NULL;
2269 			delayed_refs->num_heads_ready++;
2270 			spin_unlock(&delayed_refs->lock);
2271 			cond_resched();
2272 			spin_lock(&delayed_refs->lock);
2273 			continue;
2274 		}
2275 
2276 		/*
2277 		 * record the must insert reserved flag before we
2278 		 * drop the spin lock.
2279 		 */
2280 		must_insert_reserved = locked_ref->must_insert_reserved;
2281 		locked_ref->must_insert_reserved = 0;
2282 
2283 		extent_op = locked_ref->extent_op;
2284 		locked_ref->extent_op = NULL;
2285 
2286 		if (!ref) {
2287 			/* All delayed refs have been processed, Go ahead
2288 			 * and send the head node to run_one_delayed_ref,
2289 			 * so that any accounting fixes can happen
2290 			 */
2291 			ref = &locked_ref->node;
2292 
2293 			if (extent_op && must_insert_reserved) {
2294 				kfree(extent_op);
2295 				extent_op = NULL;
2296 			}
2297 
2298 			if (extent_op) {
2299 				spin_unlock(&delayed_refs->lock);
2300 
2301 				ret = run_delayed_extent_op(trans, root,
2302 							    ref, extent_op);
2303 				kfree(extent_op);
2304 
2305 				if (ret) {
2306 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2307 					spin_lock(&delayed_refs->lock);
2308 					return ret;
2309 				}
2310 
2311 				goto next;
2312 			}
2313 
2314 			list_del_init(&locked_ref->cluster);
2315 			locked_ref = NULL;
2316 		}
2317 
2318 		ref->in_tree = 0;
2319 		rb_erase(&ref->rb_node, &delayed_refs->root);
2320 		delayed_refs->num_entries--;
2321 		/*
2322 		 * we modified num_entries, but as we're currently running
2323 		 * delayed refs, skip
2324 		 *     wake_up(&delayed_refs->seq_wait);
2325 		 * here.
2326 		 */
2327 		spin_unlock(&delayed_refs->lock);
2328 
2329 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2330 					  must_insert_reserved);
2331 
2332 		btrfs_put_delayed_ref(ref);
2333 		kfree(extent_op);
2334 		count++;
2335 
2336 		if (ret) {
2337 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2338 			spin_lock(&delayed_refs->lock);
2339 			return ret;
2340 		}
2341 
2342 next:
2343 		do_chunk_alloc(trans, fs_info->extent_root,
2344 			       2 * 1024 * 1024,
2345 			       btrfs_get_alloc_profile(root, 0),
2346 			       CHUNK_ALLOC_NO_FORCE);
2347 		cond_resched();
2348 		spin_lock(&delayed_refs->lock);
2349 	}
2350 	return count;
2351 }
2352 
2353 static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
2354 			       struct btrfs_delayed_ref_root *delayed_refs,
2355 			       unsigned long num_refs,
2356 			       struct list_head *first_seq)
2357 {
2358 	spin_unlock(&delayed_refs->lock);
2359 	pr_debug("waiting for more refs (num %ld, first %p)\n",
2360 		 num_refs, first_seq);
2361 	wait_event(fs_info->tree_mod_seq_wait,
2362 		   num_refs != delayed_refs->num_entries ||
2363 		   fs_info->tree_mod_seq_list.next != first_seq);
2364 	pr_debug("done waiting for more refs (num %ld, first %p)\n",
2365 		 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
2366 	spin_lock(&delayed_refs->lock);
2367 }
2368 
2369 #ifdef SCRAMBLE_DELAYED_REFS
2370 /*
2371  * Normally delayed refs get processed in ascending bytenr order. This
2372  * correlates in most cases to the order added. To expose dependencies on this
2373  * order, we start to process the tree in the middle instead of the beginning
2374  */
2375 static u64 find_middle(struct rb_root *root)
2376 {
2377 	struct rb_node *n = root->rb_node;
2378 	struct btrfs_delayed_ref_node *entry;
2379 	int alt = 1;
2380 	u64 middle;
2381 	u64 first = 0, last = 0;
2382 
2383 	n = rb_first(root);
2384 	if (n) {
2385 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386 		first = entry->bytenr;
2387 	}
2388 	n = rb_last(root);
2389 	if (n) {
2390 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2391 		last = entry->bytenr;
2392 	}
2393 	n = root->rb_node;
2394 
2395 	while (n) {
2396 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2397 		WARN_ON(!entry->in_tree);
2398 
2399 		middle = entry->bytenr;
2400 
2401 		if (alt)
2402 			n = n->rb_left;
2403 		else
2404 			n = n->rb_right;
2405 
2406 		alt = 1 - alt;
2407 	}
2408 	return middle;
2409 }
2410 #endif
2411 
2412 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2413 					 struct btrfs_fs_info *fs_info)
2414 {
2415 	struct qgroup_update *qgroup_update;
2416 	int ret = 0;
2417 
2418 	if (list_empty(&trans->qgroup_ref_list) !=
2419 	    !trans->delayed_ref_elem.seq) {
2420 		/* list without seq or seq without list */
2421 		printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2422 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
2423 			trans->delayed_ref_elem.seq);
2424 		BUG();
2425 	}
2426 
2427 	if (!trans->delayed_ref_elem.seq)
2428 		return 0;
2429 
2430 	while (!list_empty(&trans->qgroup_ref_list)) {
2431 		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2432 						 struct qgroup_update, list);
2433 		list_del(&qgroup_update->list);
2434 		if (!ret)
2435 			ret = btrfs_qgroup_account_ref(
2436 					trans, fs_info, qgroup_update->node,
2437 					qgroup_update->extent_op);
2438 		kfree(qgroup_update);
2439 	}
2440 
2441 	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2442 
2443 	return ret;
2444 }
2445 
2446 /*
2447  * this starts processing the delayed reference count updates and
2448  * extent insertions we have queued up so far.  count can be
2449  * 0, which means to process everything in the tree at the start
2450  * of the run (but not newly added entries), or it can be some target
2451  * number you'd like to process.
2452  *
2453  * Returns 0 on success or if called with an aborted transaction
2454  * Returns <0 on error and aborts the transaction
2455  */
2456 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2457 			   struct btrfs_root *root, unsigned long count)
2458 {
2459 	struct rb_node *node;
2460 	struct btrfs_delayed_ref_root *delayed_refs;
2461 	struct btrfs_delayed_ref_node *ref;
2462 	struct list_head cluster;
2463 	struct list_head *first_seq = NULL;
2464 	int ret;
2465 	u64 delayed_start;
2466 	int run_all = count == (unsigned long)-1;
2467 	int run_most = 0;
2468 	unsigned long num_refs = 0;
2469 	int consider_waiting;
2470 
2471 	/* We'll clean this up in btrfs_cleanup_transaction */
2472 	if (trans->aborted)
2473 		return 0;
2474 
2475 	if (root == root->fs_info->extent_root)
2476 		root = root->fs_info->tree_root;
2477 
2478 	do_chunk_alloc(trans, root->fs_info->extent_root,
2479 		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2480 		       CHUNK_ALLOC_NO_FORCE);
2481 
2482 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2483 
2484 	delayed_refs = &trans->transaction->delayed_refs;
2485 	INIT_LIST_HEAD(&cluster);
2486 again:
2487 	consider_waiting = 0;
2488 	spin_lock(&delayed_refs->lock);
2489 
2490 #ifdef SCRAMBLE_DELAYED_REFS
2491 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2492 #endif
2493 
2494 	if (count == 0) {
2495 		count = delayed_refs->num_entries * 2;
2496 		run_most = 1;
2497 	}
2498 	while (1) {
2499 		if (!(run_all || run_most) &&
2500 		    delayed_refs->num_heads_ready < 64)
2501 			break;
2502 
2503 		/*
2504 		 * go find something we can process in the rbtree.  We start at
2505 		 * the beginning of the tree, and then build a cluster
2506 		 * of refs to process starting at the first one we are able to
2507 		 * lock
2508 		 */
2509 		delayed_start = delayed_refs->run_delayed_start;
2510 		ret = btrfs_find_ref_cluster(trans, &cluster,
2511 					     delayed_refs->run_delayed_start);
2512 		if (ret)
2513 			break;
2514 
2515 		if (delayed_start >= delayed_refs->run_delayed_start) {
2516 			if (consider_waiting == 0) {
2517 				/*
2518 				 * btrfs_find_ref_cluster looped. let's do one
2519 				 * more cycle. if we don't run any delayed ref
2520 				 * during that cycle (because we can't because
2521 				 * all of them are blocked) and if the number of
2522 				 * refs doesn't change, we avoid busy waiting.
2523 				 */
2524 				consider_waiting = 1;
2525 				num_refs = delayed_refs->num_entries;
2526 				first_seq = root->fs_info->tree_mod_seq_list.next;
2527 			} else {
2528 				wait_for_more_refs(root->fs_info, delayed_refs,
2529 						   num_refs, first_seq);
2530 				/*
2531 				 * after waiting, things have changed. we
2532 				 * dropped the lock and someone else might have
2533 				 * run some refs, built new clusters and so on.
2534 				 * therefore, we restart staleness detection.
2535 				 */
2536 				consider_waiting = 0;
2537 			}
2538 		}
2539 
2540 		ret = run_clustered_refs(trans, root, &cluster);
2541 		if (ret < 0) {
2542 			spin_unlock(&delayed_refs->lock);
2543 			btrfs_abort_transaction(trans, root, ret);
2544 			return ret;
2545 		}
2546 
2547 		count -= min_t(unsigned long, ret, count);
2548 
2549 		if (count == 0)
2550 			break;
2551 
2552 		if (ret || delayed_refs->run_delayed_start == 0) {
2553 			/* refs were run, let's reset staleness detection */
2554 			consider_waiting = 0;
2555 		}
2556 	}
2557 
2558 	if (run_all) {
2559 		node = rb_first(&delayed_refs->root);
2560 		if (!node)
2561 			goto out;
2562 		count = (unsigned long)-1;
2563 
2564 		while (node) {
2565 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2566 				       rb_node);
2567 			if (btrfs_delayed_ref_is_head(ref)) {
2568 				struct btrfs_delayed_ref_head *head;
2569 
2570 				head = btrfs_delayed_node_to_head(ref);
2571 				atomic_inc(&ref->refs);
2572 
2573 				spin_unlock(&delayed_refs->lock);
2574 				/*
2575 				 * Mutex was contended, block until it's
2576 				 * released and try again
2577 				 */
2578 				mutex_lock(&head->mutex);
2579 				mutex_unlock(&head->mutex);
2580 
2581 				btrfs_put_delayed_ref(ref);
2582 				cond_resched();
2583 				goto again;
2584 			}
2585 			node = rb_next(node);
2586 		}
2587 		spin_unlock(&delayed_refs->lock);
2588 		schedule_timeout(1);
2589 		goto again;
2590 	}
2591 out:
2592 	spin_unlock(&delayed_refs->lock);
2593 	assert_qgroups_uptodate(trans);
2594 	return 0;
2595 }
2596 
2597 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 				struct btrfs_root *root,
2599 				u64 bytenr, u64 num_bytes, u64 flags,
2600 				int is_data)
2601 {
2602 	struct btrfs_delayed_extent_op *extent_op;
2603 	int ret;
2604 
2605 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2606 	if (!extent_op)
2607 		return -ENOMEM;
2608 
2609 	extent_op->flags_to_set = flags;
2610 	extent_op->update_flags = 1;
2611 	extent_op->update_key = 0;
2612 	extent_op->is_data = is_data ? 1 : 0;
2613 
2614 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2615 					  num_bytes, extent_op);
2616 	if (ret)
2617 		kfree(extent_op);
2618 	return ret;
2619 }
2620 
2621 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2622 				      struct btrfs_root *root,
2623 				      struct btrfs_path *path,
2624 				      u64 objectid, u64 offset, u64 bytenr)
2625 {
2626 	struct btrfs_delayed_ref_head *head;
2627 	struct btrfs_delayed_ref_node *ref;
2628 	struct btrfs_delayed_data_ref *data_ref;
2629 	struct btrfs_delayed_ref_root *delayed_refs;
2630 	struct rb_node *node;
2631 	int ret = 0;
2632 
2633 	ret = -ENOENT;
2634 	delayed_refs = &trans->transaction->delayed_refs;
2635 	spin_lock(&delayed_refs->lock);
2636 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2637 	if (!head)
2638 		goto out;
2639 
2640 	if (!mutex_trylock(&head->mutex)) {
2641 		atomic_inc(&head->node.refs);
2642 		spin_unlock(&delayed_refs->lock);
2643 
2644 		btrfs_release_path(path);
2645 
2646 		/*
2647 		 * Mutex was contended, block until it's released and let
2648 		 * caller try again
2649 		 */
2650 		mutex_lock(&head->mutex);
2651 		mutex_unlock(&head->mutex);
2652 		btrfs_put_delayed_ref(&head->node);
2653 		return -EAGAIN;
2654 	}
2655 
2656 	node = rb_prev(&head->node.rb_node);
2657 	if (!node)
2658 		goto out_unlock;
2659 
2660 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2661 
2662 	if (ref->bytenr != bytenr)
2663 		goto out_unlock;
2664 
2665 	ret = 1;
2666 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2667 		goto out_unlock;
2668 
2669 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2670 
2671 	node = rb_prev(node);
2672 	if (node) {
2673 		int seq = ref->seq;
2674 
2675 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2676 		if (ref->bytenr == bytenr && ref->seq == seq)
2677 			goto out_unlock;
2678 	}
2679 
2680 	if (data_ref->root != root->root_key.objectid ||
2681 	    data_ref->objectid != objectid || data_ref->offset != offset)
2682 		goto out_unlock;
2683 
2684 	ret = 0;
2685 out_unlock:
2686 	mutex_unlock(&head->mutex);
2687 out:
2688 	spin_unlock(&delayed_refs->lock);
2689 	return ret;
2690 }
2691 
2692 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2693 					struct btrfs_root *root,
2694 					struct btrfs_path *path,
2695 					u64 objectid, u64 offset, u64 bytenr)
2696 {
2697 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2698 	struct extent_buffer *leaf;
2699 	struct btrfs_extent_data_ref *ref;
2700 	struct btrfs_extent_inline_ref *iref;
2701 	struct btrfs_extent_item *ei;
2702 	struct btrfs_key key;
2703 	u32 item_size;
2704 	int ret;
2705 
2706 	key.objectid = bytenr;
2707 	key.offset = (u64)-1;
2708 	key.type = BTRFS_EXTENT_ITEM_KEY;
2709 
2710 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2711 	if (ret < 0)
2712 		goto out;
2713 	BUG_ON(ret == 0); /* Corruption */
2714 
2715 	ret = -ENOENT;
2716 	if (path->slots[0] == 0)
2717 		goto out;
2718 
2719 	path->slots[0]--;
2720 	leaf = path->nodes[0];
2721 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2722 
2723 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2724 		goto out;
2725 
2726 	ret = 1;
2727 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2728 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2729 	if (item_size < sizeof(*ei)) {
2730 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2731 		goto out;
2732 	}
2733 #endif
2734 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2735 
2736 	if (item_size != sizeof(*ei) +
2737 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2738 		goto out;
2739 
2740 	if (btrfs_extent_generation(leaf, ei) <=
2741 	    btrfs_root_last_snapshot(&root->root_item))
2742 		goto out;
2743 
2744 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2745 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2746 	    BTRFS_EXTENT_DATA_REF_KEY)
2747 		goto out;
2748 
2749 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2750 	if (btrfs_extent_refs(leaf, ei) !=
2751 	    btrfs_extent_data_ref_count(leaf, ref) ||
2752 	    btrfs_extent_data_ref_root(leaf, ref) !=
2753 	    root->root_key.objectid ||
2754 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2755 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2756 		goto out;
2757 
2758 	ret = 0;
2759 out:
2760 	return ret;
2761 }
2762 
2763 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2764 			  struct btrfs_root *root,
2765 			  u64 objectid, u64 offset, u64 bytenr)
2766 {
2767 	struct btrfs_path *path;
2768 	int ret;
2769 	int ret2;
2770 
2771 	path = btrfs_alloc_path();
2772 	if (!path)
2773 		return -ENOENT;
2774 
2775 	do {
2776 		ret = check_committed_ref(trans, root, path, objectid,
2777 					  offset, bytenr);
2778 		if (ret && ret != -ENOENT)
2779 			goto out;
2780 
2781 		ret2 = check_delayed_ref(trans, root, path, objectid,
2782 					 offset, bytenr);
2783 	} while (ret2 == -EAGAIN);
2784 
2785 	if (ret2 && ret2 != -ENOENT) {
2786 		ret = ret2;
2787 		goto out;
2788 	}
2789 
2790 	if (ret != -ENOENT || ret2 != -ENOENT)
2791 		ret = 0;
2792 out:
2793 	btrfs_free_path(path);
2794 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2795 		WARN_ON(ret > 0);
2796 	return ret;
2797 }
2798 
2799 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2800 			   struct btrfs_root *root,
2801 			   struct extent_buffer *buf,
2802 			   int full_backref, int inc, int for_cow)
2803 {
2804 	u64 bytenr;
2805 	u64 num_bytes;
2806 	u64 parent;
2807 	u64 ref_root;
2808 	u32 nritems;
2809 	struct btrfs_key key;
2810 	struct btrfs_file_extent_item *fi;
2811 	int i;
2812 	int level;
2813 	int ret = 0;
2814 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2815 			    u64, u64, u64, u64, u64, u64, int);
2816 
2817 	ref_root = btrfs_header_owner(buf);
2818 	nritems = btrfs_header_nritems(buf);
2819 	level = btrfs_header_level(buf);
2820 
2821 	if (!root->ref_cows && level == 0)
2822 		return 0;
2823 
2824 	if (inc)
2825 		process_func = btrfs_inc_extent_ref;
2826 	else
2827 		process_func = btrfs_free_extent;
2828 
2829 	if (full_backref)
2830 		parent = buf->start;
2831 	else
2832 		parent = 0;
2833 
2834 	for (i = 0; i < nritems; i++) {
2835 		if (level == 0) {
2836 			btrfs_item_key_to_cpu(buf, &key, i);
2837 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2838 				continue;
2839 			fi = btrfs_item_ptr(buf, i,
2840 					    struct btrfs_file_extent_item);
2841 			if (btrfs_file_extent_type(buf, fi) ==
2842 			    BTRFS_FILE_EXTENT_INLINE)
2843 				continue;
2844 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2845 			if (bytenr == 0)
2846 				continue;
2847 
2848 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2849 			key.offset -= btrfs_file_extent_offset(buf, fi);
2850 			ret = process_func(trans, root, bytenr, num_bytes,
2851 					   parent, ref_root, key.objectid,
2852 					   key.offset, for_cow);
2853 			if (ret)
2854 				goto fail;
2855 		} else {
2856 			bytenr = btrfs_node_blockptr(buf, i);
2857 			num_bytes = btrfs_level_size(root, level - 1);
2858 			ret = process_func(trans, root, bytenr, num_bytes,
2859 					   parent, ref_root, level - 1, 0,
2860 					   for_cow);
2861 			if (ret)
2862 				goto fail;
2863 		}
2864 	}
2865 	return 0;
2866 fail:
2867 	return ret;
2868 }
2869 
2870 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2871 		  struct extent_buffer *buf, int full_backref, int for_cow)
2872 {
2873 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2874 }
2875 
2876 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2877 		  struct extent_buffer *buf, int full_backref, int for_cow)
2878 {
2879 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2880 }
2881 
2882 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2883 				 struct btrfs_root *root,
2884 				 struct btrfs_path *path,
2885 				 struct btrfs_block_group_cache *cache)
2886 {
2887 	int ret;
2888 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2889 	unsigned long bi;
2890 	struct extent_buffer *leaf;
2891 
2892 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2893 	if (ret < 0)
2894 		goto fail;
2895 	BUG_ON(ret); /* Corruption */
2896 
2897 	leaf = path->nodes[0];
2898 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2899 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2900 	btrfs_mark_buffer_dirty(leaf);
2901 	btrfs_release_path(path);
2902 fail:
2903 	if (ret) {
2904 		btrfs_abort_transaction(trans, root, ret);
2905 		return ret;
2906 	}
2907 	return 0;
2908 
2909 }
2910 
2911 static struct btrfs_block_group_cache *
2912 next_block_group(struct btrfs_root *root,
2913 		 struct btrfs_block_group_cache *cache)
2914 {
2915 	struct rb_node *node;
2916 	spin_lock(&root->fs_info->block_group_cache_lock);
2917 	node = rb_next(&cache->cache_node);
2918 	btrfs_put_block_group(cache);
2919 	if (node) {
2920 		cache = rb_entry(node, struct btrfs_block_group_cache,
2921 				 cache_node);
2922 		btrfs_get_block_group(cache);
2923 	} else
2924 		cache = NULL;
2925 	spin_unlock(&root->fs_info->block_group_cache_lock);
2926 	return cache;
2927 }
2928 
2929 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2930 			    struct btrfs_trans_handle *trans,
2931 			    struct btrfs_path *path)
2932 {
2933 	struct btrfs_root *root = block_group->fs_info->tree_root;
2934 	struct inode *inode = NULL;
2935 	u64 alloc_hint = 0;
2936 	int dcs = BTRFS_DC_ERROR;
2937 	int num_pages = 0;
2938 	int retries = 0;
2939 	int ret = 0;
2940 
2941 	/*
2942 	 * If this block group is smaller than 100 megs don't bother caching the
2943 	 * block group.
2944 	 */
2945 	if (block_group->key.offset < (100 * 1024 * 1024)) {
2946 		spin_lock(&block_group->lock);
2947 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2948 		spin_unlock(&block_group->lock);
2949 		return 0;
2950 	}
2951 
2952 again:
2953 	inode = lookup_free_space_inode(root, block_group, path);
2954 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2955 		ret = PTR_ERR(inode);
2956 		btrfs_release_path(path);
2957 		goto out;
2958 	}
2959 
2960 	if (IS_ERR(inode)) {
2961 		BUG_ON(retries);
2962 		retries++;
2963 
2964 		if (block_group->ro)
2965 			goto out_free;
2966 
2967 		ret = create_free_space_inode(root, trans, block_group, path);
2968 		if (ret)
2969 			goto out_free;
2970 		goto again;
2971 	}
2972 
2973 	/* We've already setup this transaction, go ahead and exit */
2974 	if (block_group->cache_generation == trans->transid &&
2975 	    i_size_read(inode)) {
2976 		dcs = BTRFS_DC_SETUP;
2977 		goto out_put;
2978 	}
2979 
2980 	/*
2981 	 * We want to set the generation to 0, that way if anything goes wrong
2982 	 * from here on out we know not to trust this cache when we load up next
2983 	 * time.
2984 	 */
2985 	BTRFS_I(inode)->generation = 0;
2986 	ret = btrfs_update_inode(trans, root, inode);
2987 	WARN_ON(ret);
2988 
2989 	if (i_size_read(inode) > 0) {
2990 		ret = btrfs_truncate_free_space_cache(root, trans, path,
2991 						      inode);
2992 		if (ret)
2993 			goto out_put;
2994 	}
2995 
2996 	spin_lock(&block_group->lock);
2997 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
2998 	    !btrfs_test_opt(root, SPACE_CACHE)) {
2999 		/*
3000 		 * don't bother trying to write stuff out _if_
3001 		 * a) we're not cached,
3002 		 * b) we're with nospace_cache mount option.
3003 		 */
3004 		dcs = BTRFS_DC_WRITTEN;
3005 		spin_unlock(&block_group->lock);
3006 		goto out_put;
3007 	}
3008 	spin_unlock(&block_group->lock);
3009 
3010 	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
3011 	if (!num_pages)
3012 		num_pages = 1;
3013 
3014 	/*
3015 	 * Just to make absolutely sure we have enough space, we're going to
3016 	 * preallocate 12 pages worth of space for each block group.  In
3017 	 * practice we ought to use at most 8, but we need extra space so we can
3018 	 * add our header and have a terminator between the extents and the
3019 	 * bitmaps.
3020 	 */
3021 	num_pages *= 16;
3022 	num_pages *= PAGE_CACHE_SIZE;
3023 
3024 	ret = btrfs_check_data_free_space(inode, num_pages);
3025 	if (ret)
3026 		goto out_put;
3027 
3028 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3029 					      num_pages, num_pages,
3030 					      &alloc_hint);
3031 	if (!ret)
3032 		dcs = BTRFS_DC_SETUP;
3033 	btrfs_free_reserved_data_space(inode, num_pages);
3034 
3035 out_put:
3036 	iput(inode);
3037 out_free:
3038 	btrfs_release_path(path);
3039 out:
3040 	spin_lock(&block_group->lock);
3041 	if (!ret && dcs == BTRFS_DC_SETUP)
3042 		block_group->cache_generation = trans->transid;
3043 	block_group->disk_cache_state = dcs;
3044 	spin_unlock(&block_group->lock);
3045 
3046 	return ret;
3047 }
3048 
3049 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3050 				   struct btrfs_root *root)
3051 {
3052 	struct btrfs_block_group_cache *cache;
3053 	int err = 0;
3054 	struct btrfs_path *path;
3055 	u64 last = 0;
3056 
3057 	path = btrfs_alloc_path();
3058 	if (!path)
3059 		return -ENOMEM;
3060 
3061 again:
3062 	while (1) {
3063 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3064 		while (cache) {
3065 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3066 				break;
3067 			cache = next_block_group(root, cache);
3068 		}
3069 		if (!cache) {
3070 			if (last == 0)
3071 				break;
3072 			last = 0;
3073 			continue;
3074 		}
3075 		err = cache_save_setup(cache, trans, path);
3076 		last = cache->key.objectid + cache->key.offset;
3077 		btrfs_put_block_group(cache);
3078 	}
3079 
3080 	while (1) {
3081 		if (last == 0) {
3082 			err = btrfs_run_delayed_refs(trans, root,
3083 						     (unsigned long)-1);
3084 			if (err) /* File system offline */
3085 				goto out;
3086 		}
3087 
3088 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3089 		while (cache) {
3090 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3091 				btrfs_put_block_group(cache);
3092 				goto again;
3093 			}
3094 
3095 			if (cache->dirty)
3096 				break;
3097 			cache = next_block_group(root, cache);
3098 		}
3099 		if (!cache) {
3100 			if (last == 0)
3101 				break;
3102 			last = 0;
3103 			continue;
3104 		}
3105 
3106 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
3107 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3108 		cache->dirty = 0;
3109 		last = cache->key.objectid + cache->key.offset;
3110 
3111 		err = write_one_cache_group(trans, root, path, cache);
3112 		if (err) /* File system offline */
3113 			goto out;
3114 
3115 		btrfs_put_block_group(cache);
3116 	}
3117 
3118 	while (1) {
3119 		/*
3120 		 * I don't think this is needed since we're just marking our
3121 		 * preallocated extent as written, but just in case it can't
3122 		 * hurt.
3123 		 */
3124 		if (last == 0) {
3125 			err = btrfs_run_delayed_refs(trans, root,
3126 						     (unsigned long)-1);
3127 			if (err) /* File system offline */
3128 				goto out;
3129 		}
3130 
3131 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3132 		while (cache) {
3133 			/*
3134 			 * Really this shouldn't happen, but it could if we
3135 			 * couldn't write the entire preallocated extent and
3136 			 * splitting the extent resulted in a new block.
3137 			 */
3138 			if (cache->dirty) {
3139 				btrfs_put_block_group(cache);
3140 				goto again;
3141 			}
3142 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3143 				break;
3144 			cache = next_block_group(root, cache);
3145 		}
3146 		if (!cache) {
3147 			if (last == 0)
3148 				break;
3149 			last = 0;
3150 			continue;
3151 		}
3152 
3153 		err = btrfs_write_out_cache(root, trans, cache, path);
3154 
3155 		/*
3156 		 * If we didn't have an error then the cache state is still
3157 		 * NEED_WRITE, so we can set it to WRITTEN.
3158 		 */
3159 		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3160 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
3161 		last = cache->key.objectid + cache->key.offset;
3162 		btrfs_put_block_group(cache);
3163 	}
3164 out:
3165 
3166 	btrfs_free_path(path);
3167 	return err;
3168 }
3169 
3170 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3171 {
3172 	struct btrfs_block_group_cache *block_group;
3173 	int readonly = 0;
3174 
3175 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3176 	if (!block_group || block_group->ro)
3177 		readonly = 1;
3178 	if (block_group)
3179 		btrfs_put_block_group(block_group);
3180 	return readonly;
3181 }
3182 
3183 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3184 			     u64 total_bytes, u64 bytes_used,
3185 			     struct btrfs_space_info **space_info)
3186 {
3187 	struct btrfs_space_info *found;
3188 	int i;
3189 	int factor;
3190 
3191 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3192 		     BTRFS_BLOCK_GROUP_RAID10))
3193 		factor = 2;
3194 	else
3195 		factor = 1;
3196 
3197 	found = __find_space_info(info, flags);
3198 	if (found) {
3199 		spin_lock(&found->lock);
3200 		found->total_bytes += total_bytes;
3201 		found->disk_total += total_bytes * factor;
3202 		found->bytes_used += bytes_used;
3203 		found->disk_used += bytes_used * factor;
3204 		found->full = 0;
3205 		spin_unlock(&found->lock);
3206 		*space_info = found;
3207 		return 0;
3208 	}
3209 	found = kzalloc(sizeof(*found), GFP_NOFS);
3210 	if (!found)
3211 		return -ENOMEM;
3212 
3213 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3214 		INIT_LIST_HEAD(&found->block_groups[i]);
3215 	init_rwsem(&found->groups_sem);
3216 	spin_lock_init(&found->lock);
3217 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3218 	found->total_bytes = total_bytes;
3219 	found->disk_total = total_bytes * factor;
3220 	found->bytes_used = bytes_used;
3221 	found->disk_used = bytes_used * factor;
3222 	found->bytes_pinned = 0;
3223 	found->bytes_reserved = 0;
3224 	found->bytes_readonly = 0;
3225 	found->bytes_may_use = 0;
3226 	found->full = 0;
3227 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3228 	found->chunk_alloc = 0;
3229 	found->flush = 0;
3230 	init_waitqueue_head(&found->wait);
3231 	*space_info = found;
3232 	list_add_rcu(&found->list, &info->space_info);
3233 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3234 		info->data_sinfo = found;
3235 	return 0;
3236 }
3237 
3238 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3239 {
3240 	u64 extra_flags = chunk_to_extended(flags) &
3241 				BTRFS_EXTENDED_PROFILE_MASK;
3242 
3243 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3244 		fs_info->avail_data_alloc_bits |= extra_flags;
3245 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
3246 		fs_info->avail_metadata_alloc_bits |= extra_flags;
3247 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3248 		fs_info->avail_system_alloc_bits |= extra_flags;
3249 }
3250 
3251 /*
3252  * returns target flags in extended format or 0 if restripe for this
3253  * chunk_type is not in progress
3254  *
3255  * should be called with either volume_mutex or balance_lock held
3256  */
3257 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3258 {
3259 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3260 	u64 target = 0;
3261 
3262 	if (!bctl)
3263 		return 0;
3264 
3265 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
3266 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3267 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3268 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3269 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3270 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3271 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3272 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3273 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3274 	}
3275 
3276 	return target;
3277 }
3278 
3279 /*
3280  * @flags: available profiles in extended format (see ctree.h)
3281  *
3282  * Returns reduced profile in chunk format.  If profile changing is in
3283  * progress (either running or paused) picks the target profile (if it's
3284  * already available), otherwise falls back to plain reducing.
3285  */
3286 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3287 {
3288 	/*
3289 	 * we add in the count of missing devices because we want
3290 	 * to make sure that any RAID levels on a degraded FS
3291 	 * continue to be honored.
3292 	 */
3293 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3294 		root->fs_info->fs_devices->missing_devices;
3295 	u64 target;
3296 
3297 	/*
3298 	 * see if restripe for this chunk_type is in progress, if so
3299 	 * try to reduce to the target profile
3300 	 */
3301 	spin_lock(&root->fs_info->balance_lock);
3302 	target = get_restripe_target(root->fs_info, flags);
3303 	if (target) {
3304 		/* pick target profile only if it's already available */
3305 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3306 			spin_unlock(&root->fs_info->balance_lock);
3307 			return extended_to_chunk(target);
3308 		}
3309 	}
3310 	spin_unlock(&root->fs_info->balance_lock);
3311 
3312 	if (num_devices == 1)
3313 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3314 	if (num_devices < 4)
3315 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3316 
3317 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3318 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3319 		      BTRFS_BLOCK_GROUP_RAID10))) {
3320 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
3321 	}
3322 
3323 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3324 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3325 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3326 	}
3327 
3328 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3329 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3330 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
3331 	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
3332 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3333 	}
3334 
3335 	return extended_to_chunk(flags);
3336 }
3337 
3338 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3339 {
3340 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3341 		flags |= root->fs_info->avail_data_alloc_bits;
3342 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3343 		flags |= root->fs_info->avail_system_alloc_bits;
3344 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3345 		flags |= root->fs_info->avail_metadata_alloc_bits;
3346 
3347 	return btrfs_reduce_alloc_profile(root, flags);
3348 }
3349 
3350 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3351 {
3352 	u64 flags;
3353 
3354 	if (data)
3355 		flags = BTRFS_BLOCK_GROUP_DATA;
3356 	else if (root == root->fs_info->chunk_root)
3357 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3358 	else
3359 		flags = BTRFS_BLOCK_GROUP_METADATA;
3360 
3361 	return get_alloc_profile(root, flags);
3362 }
3363 
3364 /*
3365  * This will check the space that the inode allocates from to make sure we have
3366  * enough space for bytes.
3367  */
3368 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3369 {
3370 	struct btrfs_space_info *data_sinfo;
3371 	struct btrfs_root *root = BTRFS_I(inode)->root;
3372 	struct btrfs_fs_info *fs_info = root->fs_info;
3373 	u64 used;
3374 	int ret = 0, committed = 0, alloc_chunk = 1;
3375 
3376 	/* make sure bytes are sectorsize aligned */
3377 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3378 
3379 	if (root == root->fs_info->tree_root ||
3380 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3381 		alloc_chunk = 0;
3382 		committed = 1;
3383 	}
3384 
3385 	data_sinfo = fs_info->data_sinfo;
3386 	if (!data_sinfo)
3387 		goto alloc;
3388 
3389 again:
3390 	/* make sure we have enough space to handle the data first */
3391 	spin_lock(&data_sinfo->lock);
3392 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3393 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3394 		data_sinfo->bytes_may_use;
3395 
3396 	if (used + bytes > data_sinfo->total_bytes) {
3397 		struct btrfs_trans_handle *trans;
3398 
3399 		/*
3400 		 * if we don't have enough free bytes in this space then we need
3401 		 * to alloc a new chunk.
3402 		 */
3403 		if (!data_sinfo->full && alloc_chunk) {
3404 			u64 alloc_target;
3405 
3406 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3407 			spin_unlock(&data_sinfo->lock);
3408 alloc:
3409 			alloc_target = btrfs_get_alloc_profile(root, 1);
3410 			trans = btrfs_join_transaction(root);
3411 			if (IS_ERR(trans))
3412 				return PTR_ERR(trans);
3413 
3414 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3415 					     bytes + 2 * 1024 * 1024,
3416 					     alloc_target,
3417 					     CHUNK_ALLOC_NO_FORCE);
3418 			btrfs_end_transaction(trans, root);
3419 			if (ret < 0) {
3420 				if (ret != -ENOSPC)
3421 					return ret;
3422 				else
3423 					goto commit_trans;
3424 			}
3425 
3426 			if (!data_sinfo)
3427 				data_sinfo = fs_info->data_sinfo;
3428 
3429 			goto again;
3430 		}
3431 
3432 		/*
3433 		 * If we have less pinned bytes than we want to allocate then
3434 		 * don't bother committing the transaction, it won't help us.
3435 		 */
3436 		if (data_sinfo->bytes_pinned < bytes)
3437 			committed = 1;
3438 		spin_unlock(&data_sinfo->lock);
3439 
3440 		/* commit the current transaction and try again */
3441 commit_trans:
3442 		if (!committed &&
3443 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3444 			committed = 1;
3445 			trans = btrfs_join_transaction(root);
3446 			if (IS_ERR(trans))
3447 				return PTR_ERR(trans);
3448 			ret = btrfs_commit_transaction(trans, root);
3449 			if (ret)
3450 				return ret;
3451 			goto again;
3452 		}
3453 
3454 		return -ENOSPC;
3455 	}
3456 	data_sinfo->bytes_may_use += bytes;
3457 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3458 				      data_sinfo->flags, bytes, 1);
3459 	spin_unlock(&data_sinfo->lock);
3460 
3461 	return 0;
3462 }
3463 
3464 /*
3465  * Called if we need to clear a data reservation for this inode.
3466  */
3467 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3468 {
3469 	struct btrfs_root *root = BTRFS_I(inode)->root;
3470 	struct btrfs_space_info *data_sinfo;
3471 
3472 	/* make sure bytes are sectorsize aligned */
3473 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3474 
3475 	data_sinfo = root->fs_info->data_sinfo;
3476 	spin_lock(&data_sinfo->lock);
3477 	data_sinfo->bytes_may_use -= bytes;
3478 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3479 				      data_sinfo->flags, bytes, 0);
3480 	spin_unlock(&data_sinfo->lock);
3481 }
3482 
3483 static void force_metadata_allocation(struct btrfs_fs_info *info)
3484 {
3485 	struct list_head *head = &info->space_info;
3486 	struct btrfs_space_info *found;
3487 
3488 	rcu_read_lock();
3489 	list_for_each_entry_rcu(found, head, list) {
3490 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3491 			found->force_alloc = CHUNK_ALLOC_FORCE;
3492 	}
3493 	rcu_read_unlock();
3494 }
3495 
3496 static int should_alloc_chunk(struct btrfs_root *root,
3497 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
3498 			      int force)
3499 {
3500 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3501 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3502 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3503 	u64 thresh;
3504 
3505 	if (force == CHUNK_ALLOC_FORCE)
3506 		return 1;
3507 
3508 	/*
3509 	 * We need to take into account the global rsv because for all intents
3510 	 * and purposes it's used space.  Don't worry about locking the
3511 	 * global_rsv, it doesn't change except when the transaction commits.
3512 	 */
3513 	num_allocated += global_rsv->size;
3514 
3515 	/*
3516 	 * in limited mode, we want to have some free space up to
3517 	 * about 1% of the FS size.
3518 	 */
3519 	if (force == CHUNK_ALLOC_LIMITED) {
3520 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3521 		thresh = max_t(u64, 64 * 1024 * 1024,
3522 			       div_factor_fine(thresh, 1));
3523 
3524 		if (num_bytes - num_allocated < thresh)
3525 			return 1;
3526 	}
3527 	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3528 
3529 	/* 256MB or 2% of the FS */
3530 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3531 	/* system chunks need a much small threshold */
3532 	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3533 		thresh = 32 * 1024 * 1024;
3534 
3535 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3536 		return 0;
3537 	return 1;
3538 }
3539 
3540 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3541 {
3542 	u64 num_dev;
3543 
3544 	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3545 	    type & BTRFS_BLOCK_GROUP_RAID0)
3546 		num_dev = root->fs_info->fs_devices->rw_devices;
3547 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
3548 		num_dev = 2;
3549 	else
3550 		num_dev = 1;	/* DUP or single */
3551 
3552 	/* metadata for updaing devices and chunk tree */
3553 	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3554 }
3555 
3556 static void check_system_chunk(struct btrfs_trans_handle *trans,
3557 			       struct btrfs_root *root, u64 type)
3558 {
3559 	struct btrfs_space_info *info;
3560 	u64 left;
3561 	u64 thresh;
3562 
3563 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3564 	spin_lock(&info->lock);
3565 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3566 		info->bytes_reserved - info->bytes_readonly;
3567 	spin_unlock(&info->lock);
3568 
3569 	thresh = get_system_chunk_thresh(root, type);
3570 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3571 		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3572 		       left, thresh, type);
3573 		dump_space_info(info, 0, 0);
3574 	}
3575 
3576 	if (left < thresh) {
3577 		u64 flags;
3578 
3579 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3580 		btrfs_alloc_chunk(trans, root, flags);
3581 	}
3582 }
3583 
3584 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3585 			  struct btrfs_root *extent_root, u64 alloc_bytes,
3586 			  u64 flags, int force)
3587 {
3588 	struct btrfs_space_info *space_info;
3589 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3590 	int wait_for_alloc = 0;
3591 	int ret = 0;
3592 
3593 	space_info = __find_space_info(extent_root->fs_info, flags);
3594 	if (!space_info) {
3595 		ret = update_space_info(extent_root->fs_info, flags,
3596 					0, 0, &space_info);
3597 		BUG_ON(ret); /* -ENOMEM */
3598 	}
3599 	BUG_ON(!space_info); /* Logic error */
3600 
3601 again:
3602 	spin_lock(&space_info->lock);
3603 	if (force < space_info->force_alloc)
3604 		force = space_info->force_alloc;
3605 	if (space_info->full) {
3606 		spin_unlock(&space_info->lock);
3607 		return 0;
3608 	}
3609 
3610 	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3611 		spin_unlock(&space_info->lock);
3612 		return 0;
3613 	} else if (space_info->chunk_alloc) {
3614 		wait_for_alloc = 1;
3615 	} else {
3616 		space_info->chunk_alloc = 1;
3617 	}
3618 
3619 	spin_unlock(&space_info->lock);
3620 
3621 	mutex_lock(&fs_info->chunk_mutex);
3622 
3623 	/*
3624 	 * The chunk_mutex is held throughout the entirety of a chunk
3625 	 * allocation, so once we've acquired the chunk_mutex we know that the
3626 	 * other guy is done and we need to recheck and see if we should
3627 	 * allocate.
3628 	 */
3629 	if (wait_for_alloc) {
3630 		mutex_unlock(&fs_info->chunk_mutex);
3631 		wait_for_alloc = 0;
3632 		goto again;
3633 	}
3634 
3635 	/*
3636 	 * If we have mixed data/metadata chunks we want to make sure we keep
3637 	 * allocating mixed chunks instead of individual chunks.
3638 	 */
3639 	if (btrfs_mixed_space_info(space_info))
3640 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3641 
3642 	/*
3643 	 * if we're doing a data chunk, go ahead and make sure that
3644 	 * we keep a reasonable number of metadata chunks allocated in the
3645 	 * FS as well.
3646 	 */
3647 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3648 		fs_info->data_chunk_allocations++;
3649 		if (!(fs_info->data_chunk_allocations %
3650 		      fs_info->metadata_ratio))
3651 			force_metadata_allocation(fs_info);
3652 	}
3653 
3654 	/*
3655 	 * Check if we have enough space in SYSTEM chunk because we may need
3656 	 * to update devices.
3657 	 */
3658 	check_system_chunk(trans, extent_root, flags);
3659 
3660 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3661 	if (ret < 0 && ret != -ENOSPC)
3662 		goto out;
3663 
3664 	spin_lock(&space_info->lock);
3665 	if (ret)
3666 		space_info->full = 1;
3667 	else
3668 		ret = 1;
3669 
3670 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3671 	space_info->chunk_alloc = 0;
3672 	spin_unlock(&space_info->lock);
3673 out:
3674 	mutex_unlock(&fs_info->chunk_mutex);
3675 	return ret;
3676 }
3677 
3678 /*
3679  * shrink metadata reservation for delalloc
3680  */
3681 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3682 			    bool wait_ordered)
3683 {
3684 	struct btrfs_block_rsv *block_rsv;
3685 	struct btrfs_space_info *space_info;
3686 	struct btrfs_trans_handle *trans;
3687 	u64 delalloc_bytes;
3688 	u64 max_reclaim;
3689 	long time_left;
3690 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3691 	int loops = 0;
3692 
3693 	trans = (struct btrfs_trans_handle *)current->journal_info;
3694 	block_rsv = &root->fs_info->delalloc_block_rsv;
3695 	space_info = block_rsv->space_info;
3696 
3697 	smp_mb();
3698 	delalloc_bytes = root->fs_info->delalloc_bytes;
3699 	if (delalloc_bytes == 0) {
3700 		if (trans)
3701 			return;
3702 		btrfs_wait_ordered_extents(root, 0, 0);
3703 		return;
3704 	}
3705 
3706 	while (delalloc_bytes && loops < 3) {
3707 		max_reclaim = min(delalloc_bytes, to_reclaim);
3708 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3709 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3710 					       WB_REASON_FS_FREE_SPACE);
3711 
3712 		spin_lock(&space_info->lock);
3713 		if (space_info->bytes_used + space_info->bytes_reserved +
3714 		    space_info->bytes_pinned + space_info->bytes_readonly +
3715 		    space_info->bytes_may_use + orig <=
3716 		    space_info->total_bytes) {
3717 			spin_unlock(&space_info->lock);
3718 			break;
3719 		}
3720 		spin_unlock(&space_info->lock);
3721 
3722 		loops++;
3723 		if (wait_ordered && !trans) {
3724 			btrfs_wait_ordered_extents(root, 0, 0);
3725 		} else {
3726 			time_left = schedule_timeout_killable(1);
3727 			if (time_left)
3728 				break;
3729 		}
3730 		smp_mb();
3731 		delalloc_bytes = root->fs_info->delalloc_bytes;
3732 	}
3733 }
3734 
3735 /**
3736  * maybe_commit_transaction - possibly commit the transaction if its ok to
3737  * @root - the root we're allocating for
3738  * @bytes - the number of bytes we want to reserve
3739  * @force - force the commit
3740  *
3741  * This will check to make sure that committing the transaction will actually
3742  * get us somewhere and then commit the transaction if it does.  Otherwise it
3743  * will return -ENOSPC.
3744  */
3745 static int may_commit_transaction(struct btrfs_root *root,
3746 				  struct btrfs_space_info *space_info,
3747 				  u64 bytes, int force)
3748 {
3749 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3750 	struct btrfs_trans_handle *trans;
3751 
3752 	trans = (struct btrfs_trans_handle *)current->journal_info;
3753 	if (trans)
3754 		return -EAGAIN;
3755 
3756 	if (force)
3757 		goto commit;
3758 
3759 	/* See if there is enough pinned space to make this reservation */
3760 	spin_lock(&space_info->lock);
3761 	if (space_info->bytes_pinned >= bytes) {
3762 		spin_unlock(&space_info->lock);
3763 		goto commit;
3764 	}
3765 	spin_unlock(&space_info->lock);
3766 
3767 	/*
3768 	 * See if there is some space in the delayed insertion reservation for
3769 	 * this reservation.
3770 	 */
3771 	if (space_info != delayed_rsv->space_info)
3772 		return -ENOSPC;
3773 
3774 	spin_lock(&space_info->lock);
3775 	spin_lock(&delayed_rsv->lock);
3776 	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3777 		spin_unlock(&delayed_rsv->lock);
3778 		spin_unlock(&space_info->lock);
3779 		return -ENOSPC;
3780 	}
3781 	spin_unlock(&delayed_rsv->lock);
3782 	spin_unlock(&space_info->lock);
3783 
3784 commit:
3785 	trans = btrfs_join_transaction(root);
3786 	if (IS_ERR(trans))
3787 		return -ENOSPC;
3788 
3789 	return btrfs_commit_transaction(trans, root);
3790 }
3791 
3792 enum flush_state {
3793 	FLUSH_DELALLOC		=	1,
3794 	FLUSH_DELALLOC_WAIT	=	2,
3795 	FLUSH_DELAYED_ITEMS_NR	=	3,
3796 	FLUSH_DELAYED_ITEMS	=	4,
3797 	COMMIT_TRANS		=	5,
3798 };
3799 
3800 static int flush_space(struct btrfs_root *root,
3801 		       struct btrfs_space_info *space_info, u64 num_bytes,
3802 		       u64 orig_bytes, int state)
3803 {
3804 	struct btrfs_trans_handle *trans;
3805 	int nr;
3806 	int ret = 0;
3807 
3808 	switch (state) {
3809 	case FLUSH_DELALLOC:
3810 	case FLUSH_DELALLOC_WAIT:
3811 		shrink_delalloc(root, num_bytes, orig_bytes,
3812 				state == FLUSH_DELALLOC_WAIT);
3813 		break;
3814 	case FLUSH_DELAYED_ITEMS_NR:
3815 	case FLUSH_DELAYED_ITEMS:
3816 		if (state == FLUSH_DELAYED_ITEMS_NR) {
3817 			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3818 
3819 			nr = (int)div64_u64(num_bytes, bytes);
3820 			if (!nr)
3821 				nr = 1;
3822 			nr *= 2;
3823 		} else {
3824 			nr = -1;
3825 		}
3826 		trans = btrfs_join_transaction(root);
3827 		if (IS_ERR(trans)) {
3828 			ret = PTR_ERR(trans);
3829 			break;
3830 		}
3831 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
3832 		btrfs_end_transaction(trans, root);
3833 		break;
3834 	case COMMIT_TRANS:
3835 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3836 		break;
3837 	default:
3838 		ret = -ENOSPC;
3839 		break;
3840 	}
3841 
3842 	return ret;
3843 }
3844 /**
3845  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3846  * @root - the root we're allocating for
3847  * @block_rsv - the block_rsv we're allocating for
3848  * @orig_bytes - the number of bytes we want
3849  * @flush - wether or not we can flush to make our reservation
3850  *
3851  * This will reserve orgi_bytes number of bytes from the space info associated
3852  * with the block_rsv.  If there is not enough space it will make an attempt to
3853  * flush out space to make room.  It will do this by flushing delalloc if
3854  * possible or committing the transaction.  If flush is 0 then no attempts to
3855  * regain reservations will be made and this will fail if there is not enough
3856  * space already.
3857  */
3858 static int reserve_metadata_bytes(struct btrfs_root *root,
3859 				  struct btrfs_block_rsv *block_rsv,
3860 				  u64 orig_bytes, int flush)
3861 {
3862 	struct btrfs_space_info *space_info = block_rsv->space_info;
3863 	u64 used;
3864 	u64 num_bytes = orig_bytes;
3865 	int flush_state = FLUSH_DELALLOC;
3866 	int ret = 0;
3867 	bool flushing = false;
3868 	bool committed = false;
3869 
3870 again:
3871 	ret = 0;
3872 	spin_lock(&space_info->lock);
3873 	/*
3874 	 * We only want to wait if somebody other than us is flushing and we are
3875 	 * actually alloed to flush.
3876 	 */
3877 	while (flush && !flushing && space_info->flush) {
3878 		spin_unlock(&space_info->lock);
3879 		/*
3880 		 * If we have a trans handle we can't wait because the flusher
3881 		 * may have to commit the transaction, which would mean we would
3882 		 * deadlock since we are waiting for the flusher to finish, but
3883 		 * hold the current transaction open.
3884 		 */
3885 		if (current->journal_info)
3886 			return -EAGAIN;
3887 		ret = wait_event_killable(space_info->wait, !space_info->flush);
3888 		/* Must have been killed, return */
3889 		if (ret)
3890 			return -EINTR;
3891 
3892 		spin_lock(&space_info->lock);
3893 	}
3894 
3895 	ret = -ENOSPC;
3896 	used = space_info->bytes_used + space_info->bytes_reserved +
3897 		space_info->bytes_pinned + space_info->bytes_readonly +
3898 		space_info->bytes_may_use;
3899 
3900 	/*
3901 	 * The idea here is that we've not already over-reserved the block group
3902 	 * then we can go ahead and save our reservation first and then start
3903 	 * flushing if we need to.  Otherwise if we've already overcommitted
3904 	 * lets start flushing stuff first and then come back and try to make
3905 	 * our reservation.
3906 	 */
3907 	if (used <= space_info->total_bytes) {
3908 		if (used + orig_bytes <= space_info->total_bytes) {
3909 			space_info->bytes_may_use += orig_bytes;
3910 			trace_btrfs_space_reservation(root->fs_info,
3911 				"space_info", space_info->flags, orig_bytes, 1);
3912 			ret = 0;
3913 		} else {
3914 			/*
3915 			 * Ok set num_bytes to orig_bytes since we aren't
3916 			 * overocmmitted, this way we only try and reclaim what
3917 			 * we need.
3918 			 */
3919 			num_bytes = orig_bytes;
3920 		}
3921 	} else {
3922 		/*
3923 		 * Ok we're over committed, set num_bytes to the overcommitted
3924 		 * amount plus the amount of bytes that we need for this
3925 		 * reservation.
3926 		 */
3927 		num_bytes = used - space_info->total_bytes +
3928 			(orig_bytes * 2);
3929 	}
3930 
3931 	if (ret) {
3932 		u64 profile = btrfs_get_alloc_profile(root, 0);
3933 		u64 avail;
3934 
3935 		/*
3936 		 * If we have a lot of space that's pinned, don't bother doing
3937 		 * the overcommit dance yet and just commit the transaction.
3938 		 */
3939 		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3940 		do_div(avail, 10);
3941 		if (space_info->bytes_pinned >= avail && flush && !committed) {
3942 			space_info->flush = 1;
3943 			flushing = true;
3944 			spin_unlock(&space_info->lock);
3945 			ret = may_commit_transaction(root, space_info,
3946 						     orig_bytes, 1);
3947 			if (ret)
3948 				goto out;
3949 			committed = true;
3950 			goto again;
3951 		}
3952 
3953 		spin_lock(&root->fs_info->free_chunk_lock);
3954 		avail = root->fs_info->free_chunk_space;
3955 
3956 		/*
3957 		 * If we have dup, raid1 or raid10 then only half of the free
3958 		 * space is actually useable.
3959 		 */
3960 		if (profile & (BTRFS_BLOCK_GROUP_DUP |
3961 			       BTRFS_BLOCK_GROUP_RAID1 |
3962 			       BTRFS_BLOCK_GROUP_RAID10))
3963 			avail >>= 1;
3964 
3965 		/*
3966 		 * If we aren't flushing don't let us overcommit too much, say
3967 		 * 1/8th of the space.  If we can flush, let it overcommit up to
3968 		 * 1/2 of the space.
3969 		 */
3970 		if (flush)
3971 			avail >>= 3;
3972 		else
3973 			avail >>= 1;
3974 		 spin_unlock(&root->fs_info->free_chunk_lock);
3975 
3976 		if (used + num_bytes < space_info->total_bytes + avail) {
3977 			space_info->bytes_may_use += orig_bytes;
3978 			trace_btrfs_space_reservation(root->fs_info,
3979 				"space_info", space_info->flags, orig_bytes, 1);
3980 			ret = 0;
3981 		}
3982 	}
3983 
3984 	/*
3985 	 * Couldn't make our reservation, save our place so while we're trying
3986 	 * to reclaim space we can actually use it instead of somebody else
3987 	 * stealing it from us.
3988 	 */
3989 	if (ret && flush) {
3990 		flushing = true;
3991 		space_info->flush = 1;
3992 	}
3993 
3994 	spin_unlock(&space_info->lock);
3995 
3996 	if (!ret || !flush)
3997 		goto out;
3998 
3999 	ret = flush_space(root, space_info, num_bytes, orig_bytes,
4000 			  flush_state);
4001 	flush_state++;
4002 	if (!ret)
4003 		goto again;
4004 	else if (flush_state <= COMMIT_TRANS)
4005 		goto again;
4006 
4007 out:
4008 	if (flushing) {
4009 		spin_lock(&space_info->lock);
4010 		space_info->flush = 0;
4011 		wake_up_all(&space_info->wait);
4012 		spin_unlock(&space_info->lock);
4013 	}
4014 	return ret;
4015 }
4016 
4017 static struct btrfs_block_rsv *get_block_rsv(
4018 					const struct btrfs_trans_handle *trans,
4019 					const struct btrfs_root *root)
4020 {
4021 	struct btrfs_block_rsv *block_rsv = NULL;
4022 
4023 	if (root->ref_cows)
4024 		block_rsv = trans->block_rsv;
4025 
4026 	if (root == root->fs_info->csum_root && trans->adding_csums)
4027 		block_rsv = trans->block_rsv;
4028 
4029 	if (!block_rsv)
4030 		block_rsv = root->block_rsv;
4031 
4032 	if (!block_rsv)
4033 		block_rsv = &root->fs_info->empty_block_rsv;
4034 
4035 	return block_rsv;
4036 }
4037 
4038 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4039 			       u64 num_bytes)
4040 {
4041 	int ret = -ENOSPC;
4042 	spin_lock(&block_rsv->lock);
4043 	if (block_rsv->reserved >= num_bytes) {
4044 		block_rsv->reserved -= num_bytes;
4045 		if (block_rsv->reserved < block_rsv->size)
4046 			block_rsv->full = 0;
4047 		ret = 0;
4048 	}
4049 	spin_unlock(&block_rsv->lock);
4050 	return ret;
4051 }
4052 
4053 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4054 				u64 num_bytes, int update_size)
4055 {
4056 	spin_lock(&block_rsv->lock);
4057 	block_rsv->reserved += num_bytes;
4058 	if (update_size)
4059 		block_rsv->size += num_bytes;
4060 	else if (block_rsv->reserved >= block_rsv->size)
4061 		block_rsv->full = 1;
4062 	spin_unlock(&block_rsv->lock);
4063 }
4064 
4065 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4066 				    struct btrfs_block_rsv *block_rsv,
4067 				    struct btrfs_block_rsv *dest, u64 num_bytes)
4068 {
4069 	struct btrfs_space_info *space_info = block_rsv->space_info;
4070 
4071 	spin_lock(&block_rsv->lock);
4072 	if (num_bytes == (u64)-1)
4073 		num_bytes = block_rsv->size;
4074 	block_rsv->size -= num_bytes;
4075 	if (block_rsv->reserved >= block_rsv->size) {
4076 		num_bytes = block_rsv->reserved - block_rsv->size;
4077 		block_rsv->reserved = block_rsv->size;
4078 		block_rsv->full = 1;
4079 	} else {
4080 		num_bytes = 0;
4081 	}
4082 	spin_unlock(&block_rsv->lock);
4083 
4084 	if (num_bytes > 0) {
4085 		if (dest) {
4086 			spin_lock(&dest->lock);
4087 			if (!dest->full) {
4088 				u64 bytes_to_add;
4089 
4090 				bytes_to_add = dest->size - dest->reserved;
4091 				bytes_to_add = min(num_bytes, bytes_to_add);
4092 				dest->reserved += bytes_to_add;
4093 				if (dest->reserved >= dest->size)
4094 					dest->full = 1;
4095 				num_bytes -= bytes_to_add;
4096 			}
4097 			spin_unlock(&dest->lock);
4098 		}
4099 		if (num_bytes) {
4100 			spin_lock(&space_info->lock);
4101 			space_info->bytes_may_use -= num_bytes;
4102 			trace_btrfs_space_reservation(fs_info, "space_info",
4103 					space_info->flags, num_bytes, 0);
4104 			space_info->reservation_progress++;
4105 			spin_unlock(&space_info->lock);
4106 		}
4107 	}
4108 }
4109 
4110 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4111 				   struct btrfs_block_rsv *dst, u64 num_bytes)
4112 {
4113 	int ret;
4114 
4115 	ret = block_rsv_use_bytes(src, num_bytes);
4116 	if (ret)
4117 		return ret;
4118 
4119 	block_rsv_add_bytes(dst, num_bytes, 1);
4120 	return 0;
4121 }
4122 
4123 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
4124 {
4125 	memset(rsv, 0, sizeof(*rsv));
4126 	spin_lock_init(&rsv->lock);
4127 }
4128 
4129 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
4130 {
4131 	struct btrfs_block_rsv *block_rsv;
4132 	struct btrfs_fs_info *fs_info = root->fs_info;
4133 
4134 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4135 	if (!block_rsv)
4136 		return NULL;
4137 
4138 	btrfs_init_block_rsv(block_rsv);
4139 	block_rsv->space_info = __find_space_info(fs_info,
4140 						  BTRFS_BLOCK_GROUP_METADATA);
4141 	return block_rsv;
4142 }
4143 
4144 void btrfs_free_block_rsv(struct btrfs_root *root,
4145 			  struct btrfs_block_rsv *rsv)
4146 {
4147 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4148 	kfree(rsv);
4149 }
4150 
4151 static inline int __block_rsv_add(struct btrfs_root *root,
4152 				  struct btrfs_block_rsv *block_rsv,
4153 				  u64 num_bytes, int flush)
4154 {
4155 	int ret;
4156 
4157 	if (num_bytes == 0)
4158 		return 0;
4159 
4160 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4161 	if (!ret) {
4162 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
4163 		return 0;
4164 	}
4165 
4166 	return ret;
4167 }
4168 
4169 int btrfs_block_rsv_add(struct btrfs_root *root,
4170 			struct btrfs_block_rsv *block_rsv,
4171 			u64 num_bytes)
4172 {
4173 	return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174 }
4175 
4176 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 				struct btrfs_block_rsv *block_rsv,
4178 				u64 num_bytes)
4179 {
4180 	return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181 }
4182 
4183 int btrfs_block_rsv_check(struct btrfs_root *root,
4184 			  struct btrfs_block_rsv *block_rsv, int min_factor)
4185 {
4186 	u64 num_bytes = 0;
4187 	int ret = -ENOSPC;
4188 
4189 	if (!block_rsv)
4190 		return 0;
4191 
4192 	spin_lock(&block_rsv->lock);
4193 	num_bytes = div_factor(block_rsv->size, min_factor);
4194 	if (block_rsv->reserved >= num_bytes)
4195 		ret = 0;
4196 	spin_unlock(&block_rsv->lock);
4197 
4198 	return ret;
4199 }
4200 
4201 static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4202 					   struct btrfs_block_rsv *block_rsv,
4203 					   u64 min_reserved, int flush)
4204 {
4205 	u64 num_bytes = 0;
4206 	int ret = -ENOSPC;
4207 
4208 	if (!block_rsv)
4209 		return 0;
4210 
4211 	spin_lock(&block_rsv->lock);
4212 	num_bytes = min_reserved;
4213 	if (block_rsv->reserved >= num_bytes)
4214 		ret = 0;
4215 	else
4216 		num_bytes -= block_rsv->reserved;
4217 	spin_unlock(&block_rsv->lock);
4218 
4219 	if (!ret)
4220 		return 0;
4221 
4222 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4223 	if (!ret) {
4224 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
4225 		return 0;
4226 	}
4227 
4228 	return ret;
4229 }
4230 
4231 int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 			   struct btrfs_block_rsv *block_rsv,
4233 			   u64 min_reserved)
4234 {
4235 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236 }
4237 
4238 int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 				   struct btrfs_block_rsv *block_rsv,
4240 				   u64 min_reserved)
4241 {
4242 	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243 }
4244 
4245 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 			    struct btrfs_block_rsv *dst_rsv,
4247 			    u64 num_bytes)
4248 {
4249 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4250 }
4251 
4252 void btrfs_block_rsv_release(struct btrfs_root *root,
4253 			     struct btrfs_block_rsv *block_rsv,
4254 			     u64 num_bytes)
4255 {
4256 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4257 	if (global_rsv->full || global_rsv == block_rsv ||
4258 	    block_rsv->space_info != global_rsv->space_info)
4259 		global_rsv = NULL;
4260 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4261 				num_bytes);
4262 }
4263 
4264 /*
4265  * helper to calculate size of global block reservation.
4266  * the desired value is sum of space used by extent tree,
4267  * checksum tree and root tree
4268  */
4269 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4270 {
4271 	struct btrfs_space_info *sinfo;
4272 	u64 num_bytes;
4273 	u64 meta_used;
4274 	u64 data_used;
4275 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4276 
4277 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4278 	spin_lock(&sinfo->lock);
4279 	data_used = sinfo->bytes_used;
4280 	spin_unlock(&sinfo->lock);
4281 
4282 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4283 	spin_lock(&sinfo->lock);
4284 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4285 		data_used = 0;
4286 	meta_used = sinfo->bytes_used;
4287 	spin_unlock(&sinfo->lock);
4288 
4289 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4290 		    csum_size * 2;
4291 	num_bytes += div64_u64(data_used + meta_used, 50);
4292 
4293 	if (num_bytes * 3 > meta_used)
4294 		num_bytes = div64_u64(meta_used, 3);
4295 
4296 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4297 }
4298 
4299 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4300 {
4301 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4302 	struct btrfs_space_info *sinfo = block_rsv->space_info;
4303 	u64 num_bytes;
4304 
4305 	num_bytes = calc_global_metadata_size(fs_info);
4306 
4307 	spin_lock(&sinfo->lock);
4308 	spin_lock(&block_rsv->lock);
4309 
4310 	block_rsv->size = num_bytes;
4311 
4312 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4313 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
4314 		    sinfo->bytes_may_use;
4315 
4316 	if (sinfo->total_bytes > num_bytes) {
4317 		num_bytes = sinfo->total_bytes - num_bytes;
4318 		block_rsv->reserved += num_bytes;
4319 		sinfo->bytes_may_use += num_bytes;
4320 		trace_btrfs_space_reservation(fs_info, "space_info",
4321 				      sinfo->flags, num_bytes, 1);
4322 	}
4323 
4324 	if (block_rsv->reserved >= block_rsv->size) {
4325 		num_bytes = block_rsv->reserved - block_rsv->size;
4326 		sinfo->bytes_may_use -= num_bytes;
4327 		trace_btrfs_space_reservation(fs_info, "space_info",
4328 				      sinfo->flags, num_bytes, 0);
4329 		sinfo->reservation_progress++;
4330 		block_rsv->reserved = block_rsv->size;
4331 		block_rsv->full = 1;
4332 	}
4333 
4334 	spin_unlock(&block_rsv->lock);
4335 	spin_unlock(&sinfo->lock);
4336 }
4337 
4338 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4339 {
4340 	struct btrfs_space_info *space_info;
4341 
4342 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4343 	fs_info->chunk_block_rsv.space_info = space_info;
4344 
4345 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4346 	fs_info->global_block_rsv.space_info = space_info;
4347 	fs_info->delalloc_block_rsv.space_info = space_info;
4348 	fs_info->trans_block_rsv.space_info = space_info;
4349 	fs_info->empty_block_rsv.space_info = space_info;
4350 	fs_info->delayed_block_rsv.space_info = space_info;
4351 
4352 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4353 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4354 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4355 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4356 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4357 
4358 	update_global_block_rsv(fs_info);
4359 }
4360 
4361 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4362 {
4363 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4364 				(u64)-1);
4365 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4366 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4367 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4368 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4369 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4370 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4371 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4372 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4373 }
4374 
4375 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4376 				  struct btrfs_root *root)
4377 {
4378 	if (!trans->block_rsv)
4379 		return;
4380 
4381 	if (!trans->bytes_reserved)
4382 		return;
4383 
4384 	trace_btrfs_space_reservation(root->fs_info, "transaction",
4385 				      trans->transid, trans->bytes_reserved, 0);
4386 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4387 	trans->bytes_reserved = 0;
4388 }
4389 
4390 /* Can only return 0 or -ENOSPC */
4391 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4392 				  struct inode *inode)
4393 {
4394 	struct btrfs_root *root = BTRFS_I(inode)->root;
4395 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4396 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4397 
4398 	/*
4399 	 * We need to hold space in order to delete our orphan item once we've
4400 	 * added it, so this takes the reservation so we can release it later
4401 	 * when we are truly done with the orphan item.
4402 	 */
4403 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4404 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4405 				      btrfs_ino(inode), num_bytes, 1);
4406 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4407 }
4408 
4409 void btrfs_orphan_release_metadata(struct inode *inode)
4410 {
4411 	struct btrfs_root *root = BTRFS_I(inode)->root;
4412 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4413 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4414 				      btrfs_ino(inode), num_bytes, 0);
4415 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4416 }
4417 
4418 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4419 				struct btrfs_pending_snapshot *pending)
4420 {
4421 	struct btrfs_root *root = pending->root;
4422 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4423 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4424 	/*
4425 	 * two for root back/forward refs, two for directory entries
4426 	 * and one for root of the snapshot.
4427 	 */
4428 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4429 	dst_rsv->space_info = src_rsv->space_info;
4430 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4431 }
4432 
4433 /**
4434  * drop_outstanding_extent - drop an outstanding extent
4435  * @inode: the inode we're dropping the extent for
4436  *
4437  * This is called when we are freeing up an outstanding extent, either called
4438  * after an error or after an extent is written.  This will return the number of
4439  * reserved extents that need to be freed.  This must be called with
4440  * BTRFS_I(inode)->lock held.
4441  */
4442 static unsigned drop_outstanding_extent(struct inode *inode)
4443 {
4444 	unsigned drop_inode_space = 0;
4445 	unsigned dropped_extents = 0;
4446 
4447 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4448 	BTRFS_I(inode)->outstanding_extents--;
4449 
4450 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4451 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4452 			       &BTRFS_I(inode)->runtime_flags))
4453 		drop_inode_space = 1;
4454 
4455 	/*
4456 	 * If we have more or the same amount of outsanding extents than we have
4457 	 * reserved then we need to leave the reserved extents count alone.
4458 	 */
4459 	if (BTRFS_I(inode)->outstanding_extents >=
4460 	    BTRFS_I(inode)->reserved_extents)
4461 		return drop_inode_space;
4462 
4463 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4464 		BTRFS_I(inode)->outstanding_extents;
4465 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4466 	return dropped_extents + drop_inode_space;
4467 }
4468 
4469 /**
4470  * calc_csum_metadata_size - return the amount of metada space that must be
4471  *	reserved/free'd for the given bytes.
4472  * @inode: the inode we're manipulating
4473  * @num_bytes: the number of bytes in question
4474  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4475  *
4476  * This adjusts the number of csum_bytes in the inode and then returns the
4477  * correct amount of metadata that must either be reserved or freed.  We
4478  * calculate how many checksums we can fit into one leaf and then divide the
4479  * number of bytes that will need to be checksumed by this value to figure out
4480  * how many checksums will be required.  If we are adding bytes then the number
4481  * may go up and we will return the number of additional bytes that must be
4482  * reserved.  If it is going down we will return the number of bytes that must
4483  * be freed.
4484  *
4485  * This must be called with BTRFS_I(inode)->lock held.
4486  */
4487 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4488 				   int reserve)
4489 {
4490 	struct btrfs_root *root = BTRFS_I(inode)->root;
4491 	u64 csum_size;
4492 	int num_csums_per_leaf;
4493 	int num_csums;
4494 	int old_csums;
4495 
4496 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4497 	    BTRFS_I(inode)->csum_bytes == 0)
4498 		return 0;
4499 
4500 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4501 	if (reserve)
4502 		BTRFS_I(inode)->csum_bytes += num_bytes;
4503 	else
4504 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4505 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4506 	num_csums_per_leaf = (int)div64_u64(csum_size,
4507 					    sizeof(struct btrfs_csum_item) +
4508 					    sizeof(struct btrfs_disk_key));
4509 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4510 	num_csums = num_csums + num_csums_per_leaf - 1;
4511 	num_csums = num_csums / num_csums_per_leaf;
4512 
4513 	old_csums = old_csums + num_csums_per_leaf - 1;
4514 	old_csums = old_csums / num_csums_per_leaf;
4515 
4516 	/* No change, no need to reserve more */
4517 	if (old_csums == num_csums)
4518 		return 0;
4519 
4520 	if (reserve)
4521 		return btrfs_calc_trans_metadata_size(root,
4522 						      num_csums - old_csums);
4523 
4524 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4525 }
4526 
4527 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4528 {
4529 	struct btrfs_root *root = BTRFS_I(inode)->root;
4530 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4531 	u64 to_reserve = 0;
4532 	u64 csum_bytes;
4533 	unsigned nr_extents = 0;
4534 	int extra_reserve = 0;
4535 	int flush = 1;
4536 	int ret;
4537 
4538 	/* Need to be holding the i_mutex here if we aren't free space cache */
4539 	if (btrfs_is_free_space_inode(inode))
4540 		flush = 0;
4541 
4542 	if (flush && btrfs_transaction_in_commit(root->fs_info))
4543 		schedule_timeout(1);
4544 
4545 	mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4546 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 
4548 	spin_lock(&BTRFS_I(inode)->lock);
4549 	BTRFS_I(inode)->outstanding_extents++;
4550 
4551 	if (BTRFS_I(inode)->outstanding_extents >
4552 	    BTRFS_I(inode)->reserved_extents)
4553 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4554 			BTRFS_I(inode)->reserved_extents;
4555 
4556 	/*
4557 	 * Add an item to reserve for updating the inode when we complete the
4558 	 * delalloc io.
4559 	 */
4560 	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4561 		      &BTRFS_I(inode)->runtime_flags)) {
4562 		nr_extents++;
4563 		extra_reserve = 1;
4564 	}
4565 
4566 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4567 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4568 	csum_bytes = BTRFS_I(inode)->csum_bytes;
4569 	spin_unlock(&BTRFS_I(inode)->lock);
4570 
4571 	if (root->fs_info->quota_enabled) {
4572 		ret = btrfs_qgroup_reserve(root, num_bytes +
4573 					   nr_extents * root->leafsize);
4574 		if (ret)
4575 			return ret;
4576 	}
4577 
4578 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4579 	if (ret) {
4580 		u64 to_free = 0;
4581 		unsigned dropped;
4582 
4583 		spin_lock(&BTRFS_I(inode)->lock);
4584 		dropped = drop_outstanding_extent(inode);
4585 		/*
4586 		 * If the inodes csum_bytes is the same as the original
4587 		 * csum_bytes then we know we haven't raced with any free()ers
4588 		 * so we can just reduce our inodes csum bytes and carry on.
4589 		 * Otherwise we have to do the normal free thing to account for
4590 		 * the case that the free side didn't free up its reserve
4591 		 * because of this outstanding reservation.
4592 		 */
4593 		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4594 			calc_csum_metadata_size(inode, num_bytes, 0);
4595 		else
4596 			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4597 		spin_unlock(&BTRFS_I(inode)->lock);
4598 		if (dropped)
4599 			to_free += btrfs_calc_trans_metadata_size(root, dropped);
4600 
4601 		if (to_free) {
4602 			btrfs_block_rsv_release(root, block_rsv, to_free);
4603 			trace_btrfs_space_reservation(root->fs_info,
4604 						      "delalloc",
4605 						      btrfs_ino(inode),
4606 						      to_free, 0);
4607 		}
4608 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4609 		return ret;
4610 	}
4611 
4612 	spin_lock(&BTRFS_I(inode)->lock);
4613 	if (extra_reserve) {
4614 		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4615 			&BTRFS_I(inode)->runtime_flags);
4616 		nr_extents--;
4617 	}
4618 	BTRFS_I(inode)->reserved_extents += nr_extents;
4619 	spin_unlock(&BTRFS_I(inode)->lock);
4620 	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4621 
4622 	if (to_reserve)
4623 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
4624 					      btrfs_ino(inode), to_reserve, 1);
4625 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4626 
4627 	return 0;
4628 }
4629 
4630 /**
4631  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4632  * @inode: the inode to release the reservation for
4633  * @num_bytes: the number of bytes we're releasing
4634  *
4635  * This will release the metadata reservation for an inode.  This can be called
4636  * once we complete IO for a given set of bytes to release their metadata
4637  * reservations.
4638  */
4639 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4640 {
4641 	struct btrfs_root *root = BTRFS_I(inode)->root;
4642 	u64 to_free = 0;
4643 	unsigned dropped;
4644 
4645 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4646 	spin_lock(&BTRFS_I(inode)->lock);
4647 	dropped = drop_outstanding_extent(inode);
4648 
4649 	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4650 	spin_unlock(&BTRFS_I(inode)->lock);
4651 	if (dropped > 0)
4652 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4653 
4654 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
4655 				      btrfs_ino(inode), to_free, 0);
4656 	if (root->fs_info->quota_enabled) {
4657 		btrfs_qgroup_free(root, num_bytes +
4658 					dropped * root->leafsize);
4659 	}
4660 
4661 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4662 				to_free);
4663 }
4664 
4665 /**
4666  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4667  * @inode: inode we're writing to
4668  * @num_bytes: the number of bytes we want to allocate
4669  *
4670  * This will do the following things
4671  *
4672  * o reserve space in the data space info for num_bytes
4673  * o reserve space in the metadata space info based on number of outstanding
4674  *   extents and how much csums will be needed
4675  * o add to the inodes ->delalloc_bytes
4676  * o add it to the fs_info's delalloc inodes list.
4677  *
4678  * This will return 0 for success and -ENOSPC if there is no space left.
4679  */
4680 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4681 {
4682 	int ret;
4683 
4684 	ret = btrfs_check_data_free_space(inode, num_bytes);
4685 	if (ret)
4686 		return ret;
4687 
4688 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4689 	if (ret) {
4690 		btrfs_free_reserved_data_space(inode, num_bytes);
4691 		return ret;
4692 	}
4693 
4694 	return 0;
4695 }
4696 
4697 /**
4698  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4699  * @inode: inode we're releasing space for
4700  * @num_bytes: the number of bytes we want to free up
4701  *
4702  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4703  * called in the case that we don't need the metadata AND data reservations
4704  * anymore.  So if there is an error or we insert an inline extent.
4705  *
4706  * This function will release the metadata space that was not used and will
4707  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4708  * list if there are no delalloc bytes left.
4709  */
4710 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4711 {
4712 	btrfs_delalloc_release_metadata(inode, num_bytes);
4713 	btrfs_free_reserved_data_space(inode, num_bytes);
4714 }
4715 
4716 static int update_block_group(struct btrfs_trans_handle *trans,
4717 			      struct btrfs_root *root,
4718 			      u64 bytenr, u64 num_bytes, int alloc)
4719 {
4720 	struct btrfs_block_group_cache *cache = NULL;
4721 	struct btrfs_fs_info *info = root->fs_info;
4722 	u64 total = num_bytes;
4723 	u64 old_val;
4724 	u64 byte_in_group;
4725 	int factor;
4726 
4727 	/* block accounting for super block */
4728 	spin_lock(&info->delalloc_lock);
4729 	old_val = btrfs_super_bytes_used(info->super_copy);
4730 	if (alloc)
4731 		old_val += num_bytes;
4732 	else
4733 		old_val -= num_bytes;
4734 	btrfs_set_super_bytes_used(info->super_copy, old_val);
4735 	spin_unlock(&info->delalloc_lock);
4736 
4737 	while (total) {
4738 		cache = btrfs_lookup_block_group(info, bytenr);
4739 		if (!cache)
4740 			return -ENOENT;
4741 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4742 				    BTRFS_BLOCK_GROUP_RAID1 |
4743 				    BTRFS_BLOCK_GROUP_RAID10))
4744 			factor = 2;
4745 		else
4746 			factor = 1;
4747 		/*
4748 		 * If this block group has free space cache written out, we
4749 		 * need to make sure to load it if we are removing space.  This
4750 		 * is because we need the unpinning stage to actually add the
4751 		 * space back to the block group, otherwise we will leak space.
4752 		 */
4753 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
4754 			cache_block_group(cache, trans, NULL, 1);
4755 
4756 		byte_in_group = bytenr - cache->key.objectid;
4757 		WARN_ON(byte_in_group > cache->key.offset);
4758 
4759 		spin_lock(&cache->space_info->lock);
4760 		spin_lock(&cache->lock);
4761 
4762 		if (btrfs_test_opt(root, SPACE_CACHE) &&
4763 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
4764 			cache->disk_cache_state = BTRFS_DC_CLEAR;
4765 
4766 		cache->dirty = 1;
4767 		old_val = btrfs_block_group_used(&cache->item);
4768 		num_bytes = min(total, cache->key.offset - byte_in_group);
4769 		if (alloc) {
4770 			old_val += num_bytes;
4771 			btrfs_set_block_group_used(&cache->item, old_val);
4772 			cache->reserved -= num_bytes;
4773 			cache->space_info->bytes_reserved -= num_bytes;
4774 			cache->space_info->bytes_used += num_bytes;
4775 			cache->space_info->disk_used += num_bytes * factor;
4776 			spin_unlock(&cache->lock);
4777 			spin_unlock(&cache->space_info->lock);
4778 		} else {
4779 			old_val -= num_bytes;
4780 			btrfs_set_block_group_used(&cache->item, old_val);
4781 			cache->pinned += num_bytes;
4782 			cache->space_info->bytes_pinned += num_bytes;
4783 			cache->space_info->bytes_used -= num_bytes;
4784 			cache->space_info->disk_used -= num_bytes * factor;
4785 			spin_unlock(&cache->lock);
4786 			spin_unlock(&cache->space_info->lock);
4787 
4788 			set_extent_dirty(info->pinned_extents,
4789 					 bytenr, bytenr + num_bytes - 1,
4790 					 GFP_NOFS | __GFP_NOFAIL);
4791 		}
4792 		btrfs_put_block_group(cache);
4793 		total -= num_bytes;
4794 		bytenr += num_bytes;
4795 	}
4796 	return 0;
4797 }
4798 
4799 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4800 {
4801 	struct btrfs_block_group_cache *cache;
4802 	u64 bytenr;
4803 
4804 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4805 	if (!cache)
4806 		return 0;
4807 
4808 	bytenr = cache->key.objectid;
4809 	btrfs_put_block_group(cache);
4810 
4811 	return bytenr;
4812 }
4813 
4814 static int pin_down_extent(struct btrfs_root *root,
4815 			   struct btrfs_block_group_cache *cache,
4816 			   u64 bytenr, u64 num_bytes, int reserved)
4817 {
4818 	spin_lock(&cache->space_info->lock);
4819 	spin_lock(&cache->lock);
4820 	cache->pinned += num_bytes;
4821 	cache->space_info->bytes_pinned += num_bytes;
4822 	if (reserved) {
4823 		cache->reserved -= num_bytes;
4824 		cache->space_info->bytes_reserved -= num_bytes;
4825 	}
4826 	spin_unlock(&cache->lock);
4827 	spin_unlock(&cache->space_info->lock);
4828 
4829 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4830 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4831 	return 0;
4832 }
4833 
4834 /*
4835  * this function must be called within transaction
4836  */
4837 int btrfs_pin_extent(struct btrfs_root *root,
4838 		     u64 bytenr, u64 num_bytes, int reserved)
4839 {
4840 	struct btrfs_block_group_cache *cache;
4841 
4842 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4843 	BUG_ON(!cache); /* Logic error */
4844 
4845 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4846 
4847 	btrfs_put_block_group(cache);
4848 	return 0;
4849 }
4850 
4851 /*
4852  * this function must be called within transaction
4853  */
4854 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4855 				    struct btrfs_root *root,
4856 				    u64 bytenr, u64 num_bytes)
4857 {
4858 	struct btrfs_block_group_cache *cache;
4859 
4860 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4861 	BUG_ON(!cache); /* Logic error */
4862 
4863 	/*
4864 	 * pull in the free space cache (if any) so that our pin
4865 	 * removes the free space from the cache.  We have load_only set
4866 	 * to one because the slow code to read in the free extents does check
4867 	 * the pinned extents.
4868 	 */
4869 	cache_block_group(cache, trans, root, 1);
4870 
4871 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
4872 
4873 	/* remove us from the free space cache (if we're there at all) */
4874 	btrfs_remove_free_space(cache, bytenr, num_bytes);
4875 	btrfs_put_block_group(cache);
4876 	return 0;
4877 }
4878 
4879 /**
4880  * btrfs_update_reserved_bytes - update the block_group and space info counters
4881  * @cache:	The cache we are manipulating
4882  * @num_bytes:	The number of bytes in question
4883  * @reserve:	One of the reservation enums
4884  *
4885  * This is called by the allocator when it reserves space, or by somebody who is
4886  * freeing space that was never actually used on disk.  For example if you
4887  * reserve some space for a new leaf in transaction A and before transaction A
4888  * commits you free that leaf, you call this with reserve set to 0 in order to
4889  * clear the reservation.
4890  *
4891  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4892  * ENOSPC accounting.  For data we handle the reservation through clearing the
4893  * delalloc bits in the io_tree.  We have to do this since we could end up
4894  * allocating less disk space for the amount of data we have reserved in the
4895  * case of compression.
4896  *
4897  * If this is a reservation and the block group has become read only we cannot
4898  * make the reservation and return -EAGAIN, otherwise this function always
4899  * succeeds.
4900  */
4901 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4902 				       u64 num_bytes, int reserve)
4903 {
4904 	struct btrfs_space_info *space_info = cache->space_info;
4905 	int ret = 0;
4906 
4907 	spin_lock(&space_info->lock);
4908 	spin_lock(&cache->lock);
4909 	if (reserve != RESERVE_FREE) {
4910 		if (cache->ro) {
4911 			ret = -EAGAIN;
4912 		} else {
4913 			cache->reserved += num_bytes;
4914 			space_info->bytes_reserved += num_bytes;
4915 			if (reserve == RESERVE_ALLOC) {
4916 				trace_btrfs_space_reservation(cache->fs_info,
4917 						"space_info", space_info->flags,
4918 						num_bytes, 0);
4919 				space_info->bytes_may_use -= num_bytes;
4920 			}
4921 		}
4922 	} else {
4923 		if (cache->ro)
4924 			space_info->bytes_readonly += num_bytes;
4925 		cache->reserved -= num_bytes;
4926 		space_info->bytes_reserved -= num_bytes;
4927 		space_info->reservation_progress++;
4928 	}
4929 	spin_unlock(&cache->lock);
4930 	spin_unlock(&space_info->lock);
4931 	return ret;
4932 }
4933 
4934 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4935 				struct btrfs_root *root)
4936 {
4937 	struct btrfs_fs_info *fs_info = root->fs_info;
4938 	struct btrfs_caching_control *next;
4939 	struct btrfs_caching_control *caching_ctl;
4940 	struct btrfs_block_group_cache *cache;
4941 
4942 	down_write(&fs_info->extent_commit_sem);
4943 
4944 	list_for_each_entry_safe(caching_ctl, next,
4945 				 &fs_info->caching_block_groups, list) {
4946 		cache = caching_ctl->block_group;
4947 		if (block_group_cache_done(cache)) {
4948 			cache->last_byte_to_unpin = (u64)-1;
4949 			list_del_init(&caching_ctl->list);
4950 			put_caching_control(caching_ctl);
4951 		} else {
4952 			cache->last_byte_to_unpin = caching_ctl->progress;
4953 		}
4954 	}
4955 
4956 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4957 		fs_info->pinned_extents = &fs_info->freed_extents[1];
4958 	else
4959 		fs_info->pinned_extents = &fs_info->freed_extents[0];
4960 
4961 	up_write(&fs_info->extent_commit_sem);
4962 
4963 	update_global_block_rsv(fs_info);
4964 }
4965 
4966 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4967 {
4968 	struct btrfs_fs_info *fs_info = root->fs_info;
4969 	struct btrfs_block_group_cache *cache = NULL;
4970 	u64 len;
4971 
4972 	while (start <= end) {
4973 		if (!cache ||
4974 		    start >= cache->key.objectid + cache->key.offset) {
4975 			if (cache)
4976 				btrfs_put_block_group(cache);
4977 			cache = btrfs_lookup_block_group(fs_info, start);
4978 			BUG_ON(!cache); /* Logic error */
4979 		}
4980 
4981 		len = cache->key.objectid + cache->key.offset - start;
4982 		len = min(len, end + 1 - start);
4983 
4984 		if (start < cache->last_byte_to_unpin) {
4985 			len = min(len, cache->last_byte_to_unpin - start);
4986 			btrfs_add_free_space(cache, start, len);
4987 		}
4988 
4989 		start += len;
4990 
4991 		spin_lock(&cache->space_info->lock);
4992 		spin_lock(&cache->lock);
4993 		cache->pinned -= len;
4994 		cache->space_info->bytes_pinned -= len;
4995 		if (cache->ro)
4996 			cache->space_info->bytes_readonly += len;
4997 		spin_unlock(&cache->lock);
4998 		spin_unlock(&cache->space_info->lock);
4999 	}
5000 
5001 	if (cache)
5002 		btrfs_put_block_group(cache);
5003 	return 0;
5004 }
5005 
5006 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5007 			       struct btrfs_root *root)
5008 {
5009 	struct btrfs_fs_info *fs_info = root->fs_info;
5010 	struct extent_io_tree *unpin;
5011 	u64 start;
5012 	u64 end;
5013 	int ret;
5014 
5015 	if (trans->aborted)
5016 		return 0;
5017 
5018 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5019 		unpin = &fs_info->freed_extents[1];
5020 	else
5021 		unpin = &fs_info->freed_extents[0];
5022 
5023 	while (1) {
5024 		ret = find_first_extent_bit(unpin, 0, &start, &end,
5025 					    EXTENT_DIRTY);
5026 		if (ret)
5027 			break;
5028 
5029 		if (btrfs_test_opt(root, DISCARD))
5030 			ret = btrfs_discard_extent(root, start,
5031 						   end + 1 - start, NULL);
5032 
5033 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
5034 		unpin_extent_range(root, start, end);
5035 		cond_resched();
5036 	}
5037 
5038 	return 0;
5039 }
5040 
5041 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5042 				struct btrfs_root *root,
5043 				u64 bytenr, u64 num_bytes, u64 parent,
5044 				u64 root_objectid, u64 owner_objectid,
5045 				u64 owner_offset, int refs_to_drop,
5046 				struct btrfs_delayed_extent_op *extent_op)
5047 {
5048 	struct btrfs_key key;
5049 	struct btrfs_path *path;
5050 	struct btrfs_fs_info *info = root->fs_info;
5051 	struct btrfs_root *extent_root = info->extent_root;
5052 	struct extent_buffer *leaf;
5053 	struct btrfs_extent_item *ei;
5054 	struct btrfs_extent_inline_ref *iref;
5055 	int ret;
5056 	int is_data;
5057 	int extent_slot = 0;
5058 	int found_extent = 0;
5059 	int num_to_del = 1;
5060 	u32 item_size;
5061 	u64 refs;
5062 
5063 	path = btrfs_alloc_path();
5064 	if (!path)
5065 		return -ENOMEM;
5066 
5067 	path->reada = 1;
5068 	path->leave_spinning = 1;
5069 
5070 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5071 	BUG_ON(!is_data && refs_to_drop != 1);
5072 
5073 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
5074 				    bytenr, num_bytes, parent,
5075 				    root_objectid, owner_objectid,
5076 				    owner_offset);
5077 	if (ret == 0) {
5078 		extent_slot = path->slots[0];
5079 		while (extent_slot >= 0) {
5080 			btrfs_item_key_to_cpu(path->nodes[0], &key,
5081 					      extent_slot);
5082 			if (key.objectid != bytenr)
5083 				break;
5084 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5085 			    key.offset == num_bytes) {
5086 				found_extent = 1;
5087 				break;
5088 			}
5089 			if (path->slots[0] - extent_slot > 5)
5090 				break;
5091 			extent_slot--;
5092 		}
5093 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5094 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5095 		if (found_extent && item_size < sizeof(*ei))
5096 			found_extent = 0;
5097 #endif
5098 		if (!found_extent) {
5099 			BUG_ON(iref);
5100 			ret = remove_extent_backref(trans, extent_root, path,
5101 						    NULL, refs_to_drop,
5102 						    is_data);
5103 			if (ret)
5104 				goto abort;
5105 			btrfs_release_path(path);
5106 			path->leave_spinning = 1;
5107 
5108 			key.objectid = bytenr;
5109 			key.type = BTRFS_EXTENT_ITEM_KEY;
5110 			key.offset = num_bytes;
5111 
5112 			ret = btrfs_search_slot(trans, extent_root,
5113 						&key, path, -1, 1);
5114 			if (ret) {
5115 				printk(KERN_ERR "umm, got %d back from search"
5116 				       ", was looking for %llu\n", ret,
5117 				       (unsigned long long)bytenr);
5118 				if (ret > 0)
5119 					btrfs_print_leaf(extent_root,
5120 							 path->nodes[0]);
5121 			}
5122 			if (ret < 0)
5123 				goto abort;
5124 			extent_slot = path->slots[0];
5125 		}
5126 	} else if (ret == -ENOENT) {
5127 		btrfs_print_leaf(extent_root, path->nodes[0]);
5128 		WARN_ON(1);
5129 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5130 		       "parent %llu root %llu  owner %llu offset %llu\n",
5131 		       (unsigned long long)bytenr,
5132 		       (unsigned long long)parent,
5133 		       (unsigned long long)root_objectid,
5134 		       (unsigned long long)owner_objectid,
5135 		       (unsigned long long)owner_offset);
5136 	} else {
5137 		goto abort;
5138 	}
5139 
5140 	leaf = path->nodes[0];
5141 	item_size = btrfs_item_size_nr(leaf, extent_slot);
5142 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5143 	if (item_size < sizeof(*ei)) {
5144 		BUG_ON(found_extent || extent_slot != path->slots[0]);
5145 		ret = convert_extent_item_v0(trans, extent_root, path,
5146 					     owner_objectid, 0);
5147 		if (ret < 0)
5148 			goto abort;
5149 
5150 		btrfs_release_path(path);
5151 		path->leave_spinning = 1;
5152 
5153 		key.objectid = bytenr;
5154 		key.type = BTRFS_EXTENT_ITEM_KEY;
5155 		key.offset = num_bytes;
5156 
5157 		ret = btrfs_search_slot(trans, extent_root, &key, path,
5158 					-1, 1);
5159 		if (ret) {
5160 			printk(KERN_ERR "umm, got %d back from search"
5161 			       ", was looking for %llu\n", ret,
5162 			       (unsigned long long)bytenr);
5163 			btrfs_print_leaf(extent_root, path->nodes[0]);
5164 		}
5165 		if (ret < 0)
5166 			goto abort;
5167 		extent_slot = path->slots[0];
5168 		leaf = path->nodes[0];
5169 		item_size = btrfs_item_size_nr(leaf, extent_slot);
5170 	}
5171 #endif
5172 	BUG_ON(item_size < sizeof(*ei));
5173 	ei = btrfs_item_ptr(leaf, extent_slot,
5174 			    struct btrfs_extent_item);
5175 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5176 		struct btrfs_tree_block_info *bi;
5177 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5178 		bi = (struct btrfs_tree_block_info *)(ei + 1);
5179 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5180 	}
5181 
5182 	refs = btrfs_extent_refs(leaf, ei);
5183 	BUG_ON(refs < refs_to_drop);
5184 	refs -= refs_to_drop;
5185 
5186 	if (refs > 0) {
5187 		if (extent_op)
5188 			__run_delayed_extent_op(extent_op, leaf, ei);
5189 		/*
5190 		 * In the case of inline back ref, reference count will
5191 		 * be updated by remove_extent_backref
5192 		 */
5193 		if (iref) {
5194 			BUG_ON(!found_extent);
5195 		} else {
5196 			btrfs_set_extent_refs(leaf, ei, refs);
5197 			btrfs_mark_buffer_dirty(leaf);
5198 		}
5199 		if (found_extent) {
5200 			ret = remove_extent_backref(trans, extent_root, path,
5201 						    iref, refs_to_drop,
5202 						    is_data);
5203 			if (ret)
5204 				goto abort;
5205 		}
5206 	} else {
5207 		if (found_extent) {
5208 			BUG_ON(is_data && refs_to_drop !=
5209 			       extent_data_ref_count(root, path, iref));
5210 			if (iref) {
5211 				BUG_ON(path->slots[0] != extent_slot);
5212 			} else {
5213 				BUG_ON(path->slots[0] != extent_slot + 1);
5214 				path->slots[0] = extent_slot;
5215 				num_to_del = 2;
5216 			}
5217 		}
5218 
5219 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5220 				      num_to_del);
5221 		if (ret)
5222 			goto abort;
5223 		btrfs_release_path(path);
5224 
5225 		if (is_data) {
5226 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5227 			if (ret)
5228 				goto abort;
5229 		}
5230 
5231 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5232 		if (ret)
5233 			goto abort;
5234 	}
5235 out:
5236 	btrfs_free_path(path);
5237 	return ret;
5238 
5239 abort:
5240 	btrfs_abort_transaction(trans, extent_root, ret);
5241 	goto out;
5242 }
5243 
5244 /*
5245  * when we free an block, it is possible (and likely) that we free the last
5246  * delayed ref for that extent as well.  This searches the delayed ref tree for
5247  * a given extent, and if there are no other delayed refs to be processed, it
5248  * removes it from the tree.
5249  */
5250 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5251 				      struct btrfs_root *root, u64 bytenr)
5252 {
5253 	struct btrfs_delayed_ref_head *head;
5254 	struct btrfs_delayed_ref_root *delayed_refs;
5255 	struct btrfs_delayed_ref_node *ref;
5256 	struct rb_node *node;
5257 	int ret = 0;
5258 
5259 	delayed_refs = &trans->transaction->delayed_refs;
5260 	spin_lock(&delayed_refs->lock);
5261 	head = btrfs_find_delayed_ref_head(trans, bytenr);
5262 	if (!head)
5263 		goto out;
5264 
5265 	node = rb_prev(&head->node.rb_node);
5266 	if (!node)
5267 		goto out;
5268 
5269 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5270 
5271 	/* there are still entries for this ref, we can't drop it */
5272 	if (ref->bytenr == bytenr)
5273 		goto out;
5274 
5275 	if (head->extent_op) {
5276 		if (!head->must_insert_reserved)
5277 			goto out;
5278 		kfree(head->extent_op);
5279 		head->extent_op = NULL;
5280 	}
5281 
5282 	/*
5283 	 * waiting for the lock here would deadlock.  If someone else has it
5284 	 * locked they are already in the process of dropping it anyway
5285 	 */
5286 	if (!mutex_trylock(&head->mutex))
5287 		goto out;
5288 
5289 	/*
5290 	 * at this point we have a head with no other entries.  Go
5291 	 * ahead and process it.
5292 	 */
5293 	head->node.in_tree = 0;
5294 	rb_erase(&head->node.rb_node, &delayed_refs->root);
5295 
5296 	delayed_refs->num_entries--;
5297 	smp_mb();
5298 	if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
5299 		wake_up(&root->fs_info->tree_mod_seq_wait);
5300 
5301 	/*
5302 	 * we don't take a ref on the node because we're removing it from the
5303 	 * tree, so we just steal the ref the tree was holding.
5304 	 */
5305 	delayed_refs->num_heads--;
5306 	if (list_empty(&head->cluster))
5307 		delayed_refs->num_heads_ready--;
5308 
5309 	list_del_init(&head->cluster);
5310 	spin_unlock(&delayed_refs->lock);
5311 
5312 	BUG_ON(head->extent_op);
5313 	if (head->must_insert_reserved)
5314 		ret = 1;
5315 
5316 	mutex_unlock(&head->mutex);
5317 	btrfs_put_delayed_ref(&head->node);
5318 	return ret;
5319 out:
5320 	spin_unlock(&delayed_refs->lock);
5321 	return 0;
5322 }
5323 
5324 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5325 			   struct btrfs_root *root,
5326 			   struct extent_buffer *buf,
5327 			   u64 parent, int last_ref)
5328 {
5329 	struct btrfs_block_group_cache *cache = NULL;
5330 	int ret;
5331 
5332 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5333 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5334 					buf->start, buf->len,
5335 					parent, root->root_key.objectid,
5336 					btrfs_header_level(buf),
5337 					BTRFS_DROP_DELAYED_REF, NULL, 0);
5338 		BUG_ON(ret); /* -ENOMEM */
5339 	}
5340 
5341 	if (!last_ref)
5342 		return;
5343 
5344 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5345 
5346 	if (btrfs_header_generation(buf) == trans->transid) {
5347 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5348 			ret = check_ref_cleanup(trans, root, buf->start);
5349 			if (!ret)
5350 				goto out;
5351 		}
5352 
5353 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5354 			pin_down_extent(root, cache, buf->start, buf->len, 1);
5355 			goto out;
5356 		}
5357 
5358 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5359 
5360 		btrfs_add_free_space(cache, buf->start, buf->len);
5361 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5362 	}
5363 out:
5364 	/*
5365 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5366 	 * anymore.
5367 	 */
5368 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5369 	btrfs_put_block_group(cache);
5370 }
5371 
5372 /* Can return -ENOMEM */
5373 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5374 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5375 		      u64 owner, u64 offset, int for_cow)
5376 {
5377 	int ret;
5378 	struct btrfs_fs_info *fs_info = root->fs_info;
5379 
5380 	/*
5381 	 * tree log blocks never actually go into the extent allocation
5382 	 * tree, just update pinning info and exit early.
5383 	 */
5384 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5385 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5386 		/* unlocks the pinned mutex */
5387 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
5388 		ret = 0;
5389 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5390 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5391 					num_bytes,
5392 					parent, root_objectid, (int)owner,
5393 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5394 	} else {
5395 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5396 						num_bytes,
5397 						parent, root_objectid, owner,
5398 						offset, BTRFS_DROP_DELAYED_REF,
5399 						NULL, for_cow);
5400 	}
5401 	return ret;
5402 }
5403 
5404 static u64 stripe_align(struct btrfs_root *root, u64 val)
5405 {
5406 	u64 mask = ((u64)root->stripesize - 1);
5407 	u64 ret = (val + mask) & ~mask;
5408 	return ret;
5409 }
5410 
5411 /*
5412  * when we wait for progress in the block group caching, its because
5413  * our allocation attempt failed at least once.  So, we must sleep
5414  * and let some progress happen before we try again.
5415  *
5416  * This function will sleep at least once waiting for new free space to
5417  * show up, and then it will check the block group free space numbers
5418  * for our min num_bytes.  Another option is to have it go ahead
5419  * and look in the rbtree for a free extent of a given size, but this
5420  * is a good start.
5421  */
5422 static noinline int
5423 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5424 				u64 num_bytes)
5425 {
5426 	struct btrfs_caching_control *caching_ctl;
5427 	DEFINE_WAIT(wait);
5428 
5429 	caching_ctl = get_caching_control(cache);
5430 	if (!caching_ctl)
5431 		return 0;
5432 
5433 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5434 		   (cache->free_space_ctl->free_space >= num_bytes));
5435 
5436 	put_caching_control(caching_ctl);
5437 	return 0;
5438 }
5439 
5440 static noinline int
5441 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5442 {
5443 	struct btrfs_caching_control *caching_ctl;
5444 	DEFINE_WAIT(wait);
5445 
5446 	caching_ctl = get_caching_control(cache);
5447 	if (!caching_ctl)
5448 		return 0;
5449 
5450 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
5451 
5452 	put_caching_control(caching_ctl);
5453 	return 0;
5454 }
5455 
5456 static int __get_block_group_index(u64 flags)
5457 {
5458 	int index;
5459 
5460 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
5461 		index = 0;
5462 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5463 		index = 1;
5464 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
5465 		index = 2;
5466 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5467 		index = 3;
5468 	else
5469 		index = 4;
5470 
5471 	return index;
5472 }
5473 
5474 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5475 {
5476 	return __get_block_group_index(cache->flags);
5477 }
5478 
5479 enum btrfs_loop_type {
5480 	LOOP_CACHING_NOWAIT = 0,
5481 	LOOP_CACHING_WAIT = 1,
5482 	LOOP_ALLOC_CHUNK = 2,
5483 	LOOP_NO_EMPTY_SIZE = 3,
5484 };
5485 
5486 /*
5487  * walks the btree of allocated extents and find a hole of a given size.
5488  * The key ins is changed to record the hole:
5489  * ins->objectid == block start
5490  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5491  * ins->offset == number of blocks
5492  * Any available blocks before search_start are skipped.
5493  */
5494 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5495 				     struct btrfs_root *orig_root,
5496 				     u64 num_bytes, u64 empty_size,
5497 				     u64 hint_byte, struct btrfs_key *ins,
5498 				     u64 data)
5499 {
5500 	int ret = 0;
5501 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5502 	struct btrfs_free_cluster *last_ptr = NULL;
5503 	struct btrfs_block_group_cache *block_group = NULL;
5504 	struct btrfs_block_group_cache *used_block_group;
5505 	u64 search_start = 0;
5506 	int empty_cluster = 2 * 1024 * 1024;
5507 	int allowed_chunk_alloc = 0;
5508 	int done_chunk_alloc = 0;
5509 	struct btrfs_space_info *space_info;
5510 	int loop = 0;
5511 	int index = 0;
5512 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5513 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5514 	bool found_uncached_bg = false;
5515 	bool failed_cluster_refill = false;
5516 	bool failed_alloc = false;
5517 	bool use_cluster = true;
5518 	bool have_caching_bg = false;
5519 
5520 	WARN_ON(num_bytes < root->sectorsize);
5521 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5522 	ins->objectid = 0;
5523 	ins->offset = 0;
5524 
5525 	trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5526 
5527 	space_info = __find_space_info(root->fs_info, data);
5528 	if (!space_info) {
5529 		printk(KERN_ERR "No space info for %llu\n", data);
5530 		return -ENOSPC;
5531 	}
5532 
5533 	/*
5534 	 * If the space info is for both data and metadata it means we have a
5535 	 * small filesystem and we can't use the clustering stuff.
5536 	 */
5537 	if (btrfs_mixed_space_info(space_info))
5538 		use_cluster = false;
5539 
5540 	if (orig_root->ref_cows || empty_size)
5541 		allowed_chunk_alloc = 1;
5542 
5543 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5544 		last_ptr = &root->fs_info->meta_alloc_cluster;
5545 		if (!btrfs_test_opt(root, SSD))
5546 			empty_cluster = 64 * 1024;
5547 	}
5548 
5549 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5550 	    btrfs_test_opt(root, SSD)) {
5551 		last_ptr = &root->fs_info->data_alloc_cluster;
5552 	}
5553 
5554 	if (last_ptr) {
5555 		spin_lock(&last_ptr->lock);
5556 		if (last_ptr->block_group)
5557 			hint_byte = last_ptr->window_start;
5558 		spin_unlock(&last_ptr->lock);
5559 	}
5560 
5561 	search_start = max(search_start, first_logical_byte(root, 0));
5562 	search_start = max(search_start, hint_byte);
5563 
5564 	if (!last_ptr)
5565 		empty_cluster = 0;
5566 
5567 	if (search_start == hint_byte) {
5568 		block_group = btrfs_lookup_block_group(root->fs_info,
5569 						       search_start);
5570 		used_block_group = block_group;
5571 		/*
5572 		 * we don't want to use the block group if it doesn't match our
5573 		 * allocation bits, or if its not cached.
5574 		 *
5575 		 * However if we are re-searching with an ideal block group
5576 		 * picked out then we don't care that the block group is cached.
5577 		 */
5578 		if (block_group && block_group_bits(block_group, data) &&
5579 		    block_group->cached != BTRFS_CACHE_NO) {
5580 			down_read(&space_info->groups_sem);
5581 			if (list_empty(&block_group->list) ||
5582 			    block_group->ro) {
5583 				/*
5584 				 * someone is removing this block group,
5585 				 * we can't jump into the have_block_group
5586 				 * target because our list pointers are not
5587 				 * valid
5588 				 */
5589 				btrfs_put_block_group(block_group);
5590 				up_read(&space_info->groups_sem);
5591 			} else {
5592 				index = get_block_group_index(block_group);
5593 				goto have_block_group;
5594 			}
5595 		} else if (block_group) {
5596 			btrfs_put_block_group(block_group);
5597 		}
5598 	}
5599 search:
5600 	have_caching_bg = false;
5601 	down_read(&space_info->groups_sem);
5602 	list_for_each_entry(block_group, &space_info->block_groups[index],
5603 			    list) {
5604 		u64 offset;
5605 		int cached;
5606 
5607 		used_block_group = block_group;
5608 		btrfs_get_block_group(block_group);
5609 		search_start = block_group->key.objectid;
5610 
5611 		/*
5612 		 * this can happen if we end up cycling through all the
5613 		 * raid types, but we want to make sure we only allocate
5614 		 * for the proper type.
5615 		 */
5616 		if (!block_group_bits(block_group, data)) {
5617 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
5618 				BTRFS_BLOCK_GROUP_RAID1 |
5619 				BTRFS_BLOCK_GROUP_RAID10;
5620 
5621 			/*
5622 			 * if they asked for extra copies and this block group
5623 			 * doesn't provide them, bail.  This does allow us to
5624 			 * fill raid0 from raid1.
5625 			 */
5626 			if ((data & extra) && !(block_group->flags & extra))
5627 				goto loop;
5628 		}
5629 
5630 have_block_group:
5631 		cached = block_group_cache_done(block_group);
5632 		if (unlikely(!cached)) {
5633 			found_uncached_bg = true;
5634 			ret = cache_block_group(block_group, trans,
5635 						orig_root, 0);
5636 			BUG_ON(ret < 0);
5637 			ret = 0;
5638 		}
5639 
5640 		if (unlikely(block_group->ro))
5641 			goto loop;
5642 
5643 		/*
5644 		 * Ok we want to try and use the cluster allocator, so
5645 		 * lets look there
5646 		 */
5647 		if (last_ptr) {
5648 			/*
5649 			 * the refill lock keeps out other
5650 			 * people trying to start a new cluster
5651 			 */
5652 			spin_lock(&last_ptr->refill_lock);
5653 			used_block_group = last_ptr->block_group;
5654 			if (used_block_group != block_group &&
5655 			    (!used_block_group ||
5656 			     used_block_group->ro ||
5657 			     !block_group_bits(used_block_group, data))) {
5658 				used_block_group = block_group;
5659 				goto refill_cluster;
5660 			}
5661 
5662 			if (used_block_group != block_group)
5663 				btrfs_get_block_group(used_block_group);
5664 
5665 			offset = btrfs_alloc_from_cluster(used_block_group,
5666 			  last_ptr, num_bytes, used_block_group->key.objectid);
5667 			if (offset) {
5668 				/* we have a block, we're done */
5669 				spin_unlock(&last_ptr->refill_lock);
5670 				trace_btrfs_reserve_extent_cluster(root,
5671 					block_group, search_start, num_bytes);
5672 				goto checks;
5673 			}
5674 
5675 			WARN_ON(last_ptr->block_group != used_block_group);
5676 			if (used_block_group != block_group) {
5677 				btrfs_put_block_group(used_block_group);
5678 				used_block_group = block_group;
5679 			}
5680 refill_cluster:
5681 			BUG_ON(used_block_group != block_group);
5682 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
5683 			 * set up a new clusters, so lets just skip it
5684 			 * and let the allocator find whatever block
5685 			 * it can find.  If we reach this point, we
5686 			 * will have tried the cluster allocator
5687 			 * plenty of times and not have found
5688 			 * anything, so we are likely way too
5689 			 * fragmented for the clustering stuff to find
5690 			 * anything.
5691 			 *
5692 			 * However, if the cluster is taken from the
5693 			 * current block group, release the cluster
5694 			 * first, so that we stand a better chance of
5695 			 * succeeding in the unclustered
5696 			 * allocation.  */
5697 			if (loop >= LOOP_NO_EMPTY_SIZE &&
5698 			    last_ptr->block_group != block_group) {
5699 				spin_unlock(&last_ptr->refill_lock);
5700 				goto unclustered_alloc;
5701 			}
5702 
5703 			/*
5704 			 * this cluster didn't work out, free it and
5705 			 * start over
5706 			 */
5707 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5708 
5709 			if (loop >= LOOP_NO_EMPTY_SIZE) {
5710 				spin_unlock(&last_ptr->refill_lock);
5711 				goto unclustered_alloc;
5712 			}
5713 
5714 			/* allocate a cluster in this block group */
5715 			ret = btrfs_find_space_cluster(trans, root,
5716 					       block_group, last_ptr,
5717 					       search_start, num_bytes,
5718 					       empty_cluster + empty_size);
5719 			if (ret == 0) {
5720 				/*
5721 				 * now pull our allocation out of this
5722 				 * cluster
5723 				 */
5724 				offset = btrfs_alloc_from_cluster(block_group,
5725 						  last_ptr, num_bytes,
5726 						  search_start);
5727 				if (offset) {
5728 					/* we found one, proceed */
5729 					spin_unlock(&last_ptr->refill_lock);
5730 					trace_btrfs_reserve_extent_cluster(root,
5731 						block_group, search_start,
5732 						num_bytes);
5733 					goto checks;
5734 				}
5735 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
5736 				   && !failed_cluster_refill) {
5737 				spin_unlock(&last_ptr->refill_lock);
5738 
5739 				failed_cluster_refill = true;
5740 				wait_block_group_cache_progress(block_group,
5741 				       num_bytes + empty_cluster + empty_size);
5742 				goto have_block_group;
5743 			}
5744 
5745 			/*
5746 			 * at this point we either didn't find a cluster
5747 			 * or we weren't able to allocate a block from our
5748 			 * cluster.  Free the cluster we've been trying
5749 			 * to use, and go to the next block group
5750 			 */
5751 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5752 			spin_unlock(&last_ptr->refill_lock);
5753 			goto loop;
5754 		}
5755 
5756 unclustered_alloc:
5757 		spin_lock(&block_group->free_space_ctl->tree_lock);
5758 		if (cached &&
5759 		    block_group->free_space_ctl->free_space <
5760 		    num_bytes + empty_cluster + empty_size) {
5761 			spin_unlock(&block_group->free_space_ctl->tree_lock);
5762 			goto loop;
5763 		}
5764 		spin_unlock(&block_group->free_space_ctl->tree_lock);
5765 
5766 		offset = btrfs_find_space_for_alloc(block_group, search_start,
5767 						    num_bytes, empty_size);
5768 		/*
5769 		 * If we didn't find a chunk, and we haven't failed on this
5770 		 * block group before, and this block group is in the middle of
5771 		 * caching and we are ok with waiting, then go ahead and wait
5772 		 * for progress to be made, and set failed_alloc to true.
5773 		 *
5774 		 * If failed_alloc is true then we've already waited on this
5775 		 * block group once and should move on to the next block group.
5776 		 */
5777 		if (!offset && !failed_alloc && !cached &&
5778 		    loop > LOOP_CACHING_NOWAIT) {
5779 			wait_block_group_cache_progress(block_group,
5780 						num_bytes + empty_size);
5781 			failed_alloc = true;
5782 			goto have_block_group;
5783 		} else if (!offset) {
5784 			if (!cached)
5785 				have_caching_bg = true;
5786 			goto loop;
5787 		}
5788 checks:
5789 		search_start = stripe_align(root, offset);
5790 
5791 		/* move on to the next group */
5792 		if (search_start + num_bytes >
5793 		    used_block_group->key.objectid + used_block_group->key.offset) {
5794 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5795 			goto loop;
5796 		}
5797 
5798 		if (offset < search_start)
5799 			btrfs_add_free_space(used_block_group, offset,
5800 					     search_start - offset);
5801 		BUG_ON(offset > search_start);
5802 
5803 		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5804 						  alloc_type);
5805 		if (ret == -EAGAIN) {
5806 			btrfs_add_free_space(used_block_group, offset, num_bytes);
5807 			goto loop;
5808 		}
5809 
5810 		/* we are all good, lets return */
5811 		ins->objectid = search_start;
5812 		ins->offset = num_bytes;
5813 
5814 		trace_btrfs_reserve_extent(orig_root, block_group,
5815 					   search_start, num_bytes);
5816 		if (offset < search_start)
5817 			btrfs_add_free_space(used_block_group, offset,
5818 					     search_start - offset);
5819 		BUG_ON(offset > search_start);
5820 		if (used_block_group != block_group)
5821 			btrfs_put_block_group(used_block_group);
5822 		btrfs_put_block_group(block_group);
5823 		break;
5824 loop:
5825 		failed_cluster_refill = false;
5826 		failed_alloc = false;
5827 		BUG_ON(index != get_block_group_index(block_group));
5828 		if (used_block_group != block_group)
5829 			btrfs_put_block_group(used_block_group);
5830 		btrfs_put_block_group(block_group);
5831 	}
5832 	up_read(&space_info->groups_sem);
5833 
5834 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5835 		goto search;
5836 
5837 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5838 		goto search;
5839 
5840 	/*
5841 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5842 	 *			caching kthreads as we move along
5843 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5844 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5845 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5846 	 *			again
5847 	 */
5848 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5849 		index = 0;
5850 		loop++;
5851 		if (loop == LOOP_ALLOC_CHUNK) {
5852 		       if (allowed_chunk_alloc) {
5853 				ret = do_chunk_alloc(trans, root, num_bytes +
5854 						     2 * 1024 * 1024, data,
5855 						     CHUNK_ALLOC_LIMITED);
5856 				/*
5857 				 * Do not bail out on ENOSPC since we
5858 				 * can do more things.
5859 				 */
5860 				if (ret < 0 && ret != -ENOSPC) {
5861 					btrfs_abort_transaction(trans,
5862 								root, ret);
5863 					goto out;
5864 				}
5865 				allowed_chunk_alloc = 0;
5866 				if (ret == 1)
5867 					done_chunk_alloc = 1;
5868 			} else if (!done_chunk_alloc &&
5869 				   space_info->force_alloc ==
5870 				   CHUNK_ALLOC_NO_FORCE) {
5871 				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5872 			}
5873 
5874 		       /*
5875 			* We didn't allocate a chunk, go ahead and drop the
5876 			* empty size and loop again.
5877 			*/
5878 		       if (!done_chunk_alloc)
5879 			       loop = LOOP_NO_EMPTY_SIZE;
5880 		}
5881 
5882 		if (loop == LOOP_NO_EMPTY_SIZE) {
5883 			empty_size = 0;
5884 			empty_cluster = 0;
5885 		}
5886 
5887 		goto search;
5888 	} else if (!ins->objectid) {
5889 		ret = -ENOSPC;
5890 	} else if (ins->objectid) {
5891 		ret = 0;
5892 	}
5893 out:
5894 
5895 	return ret;
5896 }
5897 
5898 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5899 			    int dump_block_groups)
5900 {
5901 	struct btrfs_block_group_cache *cache;
5902 	int index = 0;
5903 
5904 	spin_lock(&info->lock);
5905 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5906 	       (unsigned long long)info->flags,
5907 	       (unsigned long long)(info->total_bytes - info->bytes_used -
5908 				    info->bytes_pinned - info->bytes_reserved -
5909 				    info->bytes_readonly),
5910 	       (info->full) ? "" : "not ");
5911 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5912 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
5913 	       (unsigned long long)info->total_bytes,
5914 	       (unsigned long long)info->bytes_used,
5915 	       (unsigned long long)info->bytes_pinned,
5916 	       (unsigned long long)info->bytes_reserved,
5917 	       (unsigned long long)info->bytes_may_use,
5918 	       (unsigned long long)info->bytes_readonly);
5919 	spin_unlock(&info->lock);
5920 
5921 	if (!dump_block_groups)
5922 		return;
5923 
5924 	down_read(&info->groups_sem);
5925 again:
5926 	list_for_each_entry(cache, &info->block_groups[index], list) {
5927 		spin_lock(&cache->lock);
5928 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5929 		       (unsigned long long)cache->key.objectid,
5930 		       (unsigned long long)cache->key.offset,
5931 		       (unsigned long long)btrfs_block_group_used(&cache->item),
5932 		       (unsigned long long)cache->pinned,
5933 		       (unsigned long long)cache->reserved,
5934 		       cache->ro ? "[readonly]" : "");
5935 		btrfs_dump_free_space(cache, bytes);
5936 		spin_unlock(&cache->lock);
5937 	}
5938 	if (++index < BTRFS_NR_RAID_TYPES)
5939 		goto again;
5940 	up_read(&info->groups_sem);
5941 }
5942 
5943 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5944 			 struct btrfs_root *root,
5945 			 u64 num_bytes, u64 min_alloc_size,
5946 			 u64 empty_size, u64 hint_byte,
5947 			 struct btrfs_key *ins, u64 data)
5948 {
5949 	bool final_tried = false;
5950 	int ret;
5951 
5952 	data = btrfs_get_alloc_profile(root, data);
5953 again:
5954 	/*
5955 	 * the only place that sets empty_size is btrfs_realloc_node, which
5956 	 * is not called recursively on allocations
5957 	 */
5958 	if (empty_size || root->ref_cows) {
5959 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5960 				     num_bytes + 2 * 1024 * 1024, data,
5961 				     CHUNK_ALLOC_NO_FORCE);
5962 		if (ret < 0 && ret != -ENOSPC) {
5963 			btrfs_abort_transaction(trans, root, ret);
5964 			return ret;
5965 		}
5966 	}
5967 
5968 	WARN_ON(num_bytes < root->sectorsize);
5969 	ret = find_free_extent(trans, root, num_bytes, empty_size,
5970 			       hint_byte, ins, data);
5971 
5972 	if (ret == -ENOSPC) {
5973 		if (!final_tried) {
5974 			num_bytes = num_bytes >> 1;
5975 			num_bytes = num_bytes & ~(root->sectorsize - 1);
5976 			num_bytes = max(num_bytes, min_alloc_size);
5977 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5978 				       num_bytes, data, CHUNK_ALLOC_FORCE);
5979 			if (ret < 0 && ret != -ENOSPC) {
5980 				btrfs_abort_transaction(trans, root, ret);
5981 				return ret;
5982 			}
5983 			if (num_bytes == min_alloc_size)
5984 				final_tried = true;
5985 			goto again;
5986 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
5987 			struct btrfs_space_info *sinfo;
5988 
5989 			sinfo = __find_space_info(root->fs_info, data);
5990 			printk(KERN_ERR "btrfs allocation failed flags %llu, "
5991 			       "wanted %llu\n", (unsigned long long)data,
5992 			       (unsigned long long)num_bytes);
5993 			if (sinfo)
5994 				dump_space_info(sinfo, num_bytes, 1);
5995 		}
5996 	}
5997 
5998 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5999 
6000 	return ret;
6001 }
6002 
6003 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6004 					u64 start, u64 len, int pin)
6005 {
6006 	struct btrfs_block_group_cache *cache;
6007 	int ret = 0;
6008 
6009 	cache = btrfs_lookup_block_group(root->fs_info, start);
6010 	if (!cache) {
6011 		printk(KERN_ERR "Unable to find block group for %llu\n",
6012 		       (unsigned long long)start);
6013 		return -ENOSPC;
6014 	}
6015 
6016 	if (btrfs_test_opt(root, DISCARD))
6017 		ret = btrfs_discard_extent(root, start, len, NULL);
6018 
6019 	if (pin)
6020 		pin_down_extent(root, cache, start, len, 1);
6021 	else {
6022 		btrfs_add_free_space(cache, start, len);
6023 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6024 	}
6025 	btrfs_put_block_group(cache);
6026 
6027 	trace_btrfs_reserved_extent_free(root, start, len);
6028 
6029 	return ret;
6030 }
6031 
6032 int btrfs_free_reserved_extent(struct btrfs_root *root,
6033 					u64 start, u64 len)
6034 {
6035 	return __btrfs_free_reserved_extent(root, start, len, 0);
6036 }
6037 
6038 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6039 				       u64 start, u64 len)
6040 {
6041 	return __btrfs_free_reserved_extent(root, start, len, 1);
6042 }
6043 
6044 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6045 				      struct btrfs_root *root,
6046 				      u64 parent, u64 root_objectid,
6047 				      u64 flags, u64 owner, u64 offset,
6048 				      struct btrfs_key *ins, int ref_mod)
6049 {
6050 	int ret;
6051 	struct btrfs_fs_info *fs_info = root->fs_info;
6052 	struct btrfs_extent_item *extent_item;
6053 	struct btrfs_extent_inline_ref *iref;
6054 	struct btrfs_path *path;
6055 	struct extent_buffer *leaf;
6056 	int type;
6057 	u32 size;
6058 
6059 	if (parent > 0)
6060 		type = BTRFS_SHARED_DATA_REF_KEY;
6061 	else
6062 		type = BTRFS_EXTENT_DATA_REF_KEY;
6063 
6064 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6065 
6066 	path = btrfs_alloc_path();
6067 	if (!path)
6068 		return -ENOMEM;
6069 
6070 	path->leave_spinning = 1;
6071 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6072 				      ins, size);
6073 	if (ret) {
6074 		btrfs_free_path(path);
6075 		return ret;
6076 	}
6077 
6078 	leaf = path->nodes[0];
6079 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6080 				     struct btrfs_extent_item);
6081 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6082 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6083 	btrfs_set_extent_flags(leaf, extent_item,
6084 			       flags | BTRFS_EXTENT_FLAG_DATA);
6085 
6086 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6087 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
6088 	if (parent > 0) {
6089 		struct btrfs_shared_data_ref *ref;
6090 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
6091 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6092 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6093 	} else {
6094 		struct btrfs_extent_data_ref *ref;
6095 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6096 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6097 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6098 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6099 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6100 	}
6101 
6102 	btrfs_mark_buffer_dirty(path->nodes[0]);
6103 	btrfs_free_path(path);
6104 
6105 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6106 	if (ret) { /* -ENOENT, logic error */
6107 		printk(KERN_ERR "btrfs update block group failed for %llu "
6108 		       "%llu\n", (unsigned long long)ins->objectid,
6109 		       (unsigned long long)ins->offset);
6110 		BUG();
6111 	}
6112 	return ret;
6113 }
6114 
6115 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6116 				     struct btrfs_root *root,
6117 				     u64 parent, u64 root_objectid,
6118 				     u64 flags, struct btrfs_disk_key *key,
6119 				     int level, struct btrfs_key *ins)
6120 {
6121 	int ret;
6122 	struct btrfs_fs_info *fs_info = root->fs_info;
6123 	struct btrfs_extent_item *extent_item;
6124 	struct btrfs_tree_block_info *block_info;
6125 	struct btrfs_extent_inline_ref *iref;
6126 	struct btrfs_path *path;
6127 	struct extent_buffer *leaf;
6128 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6129 
6130 	path = btrfs_alloc_path();
6131 	if (!path)
6132 		return -ENOMEM;
6133 
6134 	path->leave_spinning = 1;
6135 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6136 				      ins, size);
6137 	if (ret) {
6138 		btrfs_free_path(path);
6139 		return ret;
6140 	}
6141 
6142 	leaf = path->nodes[0];
6143 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6144 				     struct btrfs_extent_item);
6145 	btrfs_set_extent_refs(leaf, extent_item, 1);
6146 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6147 	btrfs_set_extent_flags(leaf, extent_item,
6148 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6149 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6150 
6151 	btrfs_set_tree_block_key(leaf, block_info, key);
6152 	btrfs_set_tree_block_level(leaf, block_info, level);
6153 
6154 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6155 	if (parent > 0) {
6156 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6157 		btrfs_set_extent_inline_ref_type(leaf, iref,
6158 						 BTRFS_SHARED_BLOCK_REF_KEY);
6159 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6160 	} else {
6161 		btrfs_set_extent_inline_ref_type(leaf, iref,
6162 						 BTRFS_TREE_BLOCK_REF_KEY);
6163 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6164 	}
6165 
6166 	btrfs_mark_buffer_dirty(leaf);
6167 	btrfs_free_path(path);
6168 
6169 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6170 	if (ret) { /* -ENOENT, logic error */
6171 		printk(KERN_ERR "btrfs update block group failed for %llu "
6172 		       "%llu\n", (unsigned long long)ins->objectid,
6173 		       (unsigned long long)ins->offset);
6174 		BUG();
6175 	}
6176 	return ret;
6177 }
6178 
6179 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6180 				     struct btrfs_root *root,
6181 				     u64 root_objectid, u64 owner,
6182 				     u64 offset, struct btrfs_key *ins)
6183 {
6184 	int ret;
6185 
6186 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6187 
6188 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6189 					 ins->offset, 0,
6190 					 root_objectid, owner, offset,
6191 					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6192 	return ret;
6193 }
6194 
6195 /*
6196  * this is used by the tree logging recovery code.  It records that
6197  * an extent has been allocated and makes sure to clear the free
6198  * space cache bits as well
6199  */
6200 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6201 				   struct btrfs_root *root,
6202 				   u64 root_objectid, u64 owner, u64 offset,
6203 				   struct btrfs_key *ins)
6204 {
6205 	int ret;
6206 	struct btrfs_block_group_cache *block_group;
6207 	struct btrfs_caching_control *caching_ctl;
6208 	u64 start = ins->objectid;
6209 	u64 num_bytes = ins->offset;
6210 
6211 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6212 	cache_block_group(block_group, trans, NULL, 0);
6213 	caching_ctl = get_caching_control(block_group);
6214 
6215 	if (!caching_ctl) {
6216 		BUG_ON(!block_group_cache_done(block_group));
6217 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6218 		BUG_ON(ret); /* -ENOMEM */
6219 	} else {
6220 		mutex_lock(&caching_ctl->mutex);
6221 
6222 		if (start >= caching_ctl->progress) {
6223 			ret = add_excluded_extent(root, start, num_bytes);
6224 			BUG_ON(ret); /* -ENOMEM */
6225 		} else if (start + num_bytes <= caching_ctl->progress) {
6226 			ret = btrfs_remove_free_space(block_group,
6227 						      start, num_bytes);
6228 			BUG_ON(ret); /* -ENOMEM */
6229 		} else {
6230 			num_bytes = caching_ctl->progress - start;
6231 			ret = btrfs_remove_free_space(block_group,
6232 						      start, num_bytes);
6233 			BUG_ON(ret); /* -ENOMEM */
6234 
6235 			start = caching_ctl->progress;
6236 			num_bytes = ins->objectid + ins->offset -
6237 				    caching_ctl->progress;
6238 			ret = add_excluded_extent(root, start, num_bytes);
6239 			BUG_ON(ret); /* -ENOMEM */
6240 		}
6241 
6242 		mutex_unlock(&caching_ctl->mutex);
6243 		put_caching_control(caching_ctl);
6244 	}
6245 
6246 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6247 					  RESERVE_ALLOC_NO_ACCOUNT);
6248 	BUG_ON(ret); /* logic error */
6249 	btrfs_put_block_group(block_group);
6250 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6251 					 0, owner, offset, ins, 1);
6252 	return ret;
6253 }
6254 
6255 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6256 					    struct btrfs_root *root,
6257 					    u64 bytenr, u32 blocksize,
6258 					    int level)
6259 {
6260 	struct extent_buffer *buf;
6261 
6262 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6263 	if (!buf)
6264 		return ERR_PTR(-ENOMEM);
6265 	btrfs_set_header_generation(buf, trans->transid);
6266 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6267 	btrfs_tree_lock(buf);
6268 	clean_tree_block(trans, root, buf);
6269 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6270 
6271 	btrfs_set_lock_blocking(buf);
6272 	btrfs_set_buffer_uptodate(buf);
6273 
6274 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6275 		/*
6276 		 * we allow two log transactions at a time, use different
6277 		 * EXENT bit to differentiate dirty pages.
6278 		 */
6279 		if (root->log_transid % 2 == 0)
6280 			set_extent_dirty(&root->dirty_log_pages, buf->start,
6281 					buf->start + buf->len - 1, GFP_NOFS);
6282 		else
6283 			set_extent_new(&root->dirty_log_pages, buf->start,
6284 					buf->start + buf->len - 1, GFP_NOFS);
6285 	} else {
6286 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6287 			 buf->start + buf->len - 1, GFP_NOFS);
6288 	}
6289 	trans->blocks_used++;
6290 	/* this returns a buffer locked for blocking */
6291 	return buf;
6292 }
6293 
6294 static struct btrfs_block_rsv *
6295 use_block_rsv(struct btrfs_trans_handle *trans,
6296 	      struct btrfs_root *root, u32 blocksize)
6297 {
6298 	struct btrfs_block_rsv *block_rsv;
6299 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6300 	int ret;
6301 
6302 	block_rsv = get_block_rsv(trans, root);
6303 
6304 	if (block_rsv->size == 0) {
6305 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6306 		/*
6307 		 * If we couldn't reserve metadata bytes try and use some from
6308 		 * the global reserve.
6309 		 */
6310 		if (ret && block_rsv != global_rsv) {
6311 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6312 			if (!ret)
6313 				return global_rsv;
6314 			return ERR_PTR(ret);
6315 		} else if (ret) {
6316 			return ERR_PTR(ret);
6317 		}
6318 		return block_rsv;
6319 	}
6320 
6321 	ret = block_rsv_use_bytes(block_rsv, blocksize);
6322 	if (!ret)
6323 		return block_rsv;
6324 	if (ret) {
6325 		static DEFINE_RATELIMIT_STATE(_rs,
6326 				DEFAULT_RATELIMIT_INTERVAL,
6327 				/*DEFAULT_RATELIMIT_BURST*/ 2);
6328 		if (__ratelimit(&_rs)) {
6329 			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
6330 			WARN_ON(1);
6331 		}
6332 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
6333 		if (!ret) {
6334 			return block_rsv;
6335 		} else if (ret && block_rsv != global_rsv) {
6336 			ret = block_rsv_use_bytes(global_rsv, blocksize);
6337 			if (!ret)
6338 				return global_rsv;
6339 		}
6340 	}
6341 
6342 	return ERR_PTR(-ENOSPC);
6343 }
6344 
6345 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6346 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
6347 {
6348 	block_rsv_add_bytes(block_rsv, blocksize, 0);
6349 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6350 }
6351 
6352 /*
6353  * finds a free extent and does all the dirty work required for allocation
6354  * returns the key for the extent through ins, and a tree buffer for
6355  * the first block of the extent through buf.
6356  *
6357  * returns the tree buffer or NULL.
6358  */
6359 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6360 					struct btrfs_root *root, u32 blocksize,
6361 					u64 parent, u64 root_objectid,
6362 					struct btrfs_disk_key *key, int level,
6363 					u64 hint, u64 empty_size)
6364 {
6365 	struct btrfs_key ins;
6366 	struct btrfs_block_rsv *block_rsv;
6367 	struct extent_buffer *buf;
6368 	u64 flags = 0;
6369 	int ret;
6370 
6371 
6372 	block_rsv = use_block_rsv(trans, root, blocksize);
6373 	if (IS_ERR(block_rsv))
6374 		return ERR_CAST(block_rsv);
6375 
6376 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6377 				   empty_size, hint, &ins, 0);
6378 	if (ret) {
6379 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6380 		return ERR_PTR(ret);
6381 	}
6382 
6383 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6384 				    blocksize, level);
6385 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6386 
6387 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6388 		if (parent == 0)
6389 			parent = ins.objectid;
6390 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6391 	} else
6392 		BUG_ON(parent > 0);
6393 
6394 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6395 		struct btrfs_delayed_extent_op *extent_op;
6396 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
6397 		BUG_ON(!extent_op); /* -ENOMEM */
6398 		if (key)
6399 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
6400 		else
6401 			memset(&extent_op->key, 0, sizeof(extent_op->key));
6402 		extent_op->flags_to_set = flags;
6403 		extent_op->update_key = 1;
6404 		extent_op->update_flags = 1;
6405 		extent_op->is_data = 0;
6406 
6407 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6408 					ins.objectid,
6409 					ins.offset, parent, root_objectid,
6410 					level, BTRFS_ADD_DELAYED_EXTENT,
6411 					extent_op, 0);
6412 		BUG_ON(ret); /* -ENOMEM */
6413 	}
6414 	return buf;
6415 }
6416 
6417 struct walk_control {
6418 	u64 refs[BTRFS_MAX_LEVEL];
6419 	u64 flags[BTRFS_MAX_LEVEL];
6420 	struct btrfs_key update_progress;
6421 	int stage;
6422 	int level;
6423 	int shared_level;
6424 	int update_ref;
6425 	int keep_locks;
6426 	int reada_slot;
6427 	int reada_count;
6428 	int for_reloc;
6429 };
6430 
6431 #define DROP_REFERENCE	1
6432 #define UPDATE_BACKREF	2
6433 
6434 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6435 				     struct btrfs_root *root,
6436 				     struct walk_control *wc,
6437 				     struct btrfs_path *path)
6438 {
6439 	u64 bytenr;
6440 	u64 generation;
6441 	u64 refs;
6442 	u64 flags;
6443 	u32 nritems;
6444 	u32 blocksize;
6445 	struct btrfs_key key;
6446 	struct extent_buffer *eb;
6447 	int ret;
6448 	int slot;
6449 	int nread = 0;
6450 
6451 	if (path->slots[wc->level] < wc->reada_slot) {
6452 		wc->reada_count = wc->reada_count * 2 / 3;
6453 		wc->reada_count = max(wc->reada_count, 2);
6454 	} else {
6455 		wc->reada_count = wc->reada_count * 3 / 2;
6456 		wc->reada_count = min_t(int, wc->reada_count,
6457 					BTRFS_NODEPTRS_PER_BLOCK(root));
6458 	}
6459 
6460 	eb = path->nodes[wc->level];
6461 	nritems = btrfs_header_nritems(eb);
6462 	blocksize = btrfs_level_size(root, wc->level - 1);
6463 
6464 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6465 		if (nread >= wc->reada_count)
6466 			break;
6467 
6468 		cond_resched();
6469 		bytenr = btrfs_node_blockptr(eb, slot);
6470 		generation = btrfs_node_ptr_generation(eb, slot);
6471 
6472 		if (slot == path->slots[wc->level])
6473 			goto reada;
6474 
6475 		if (wc->stage == UPDATE_BACKREF &&
6476 		    generation <= root->root_key.offset)
6477 			continue;
6478 
6479 		/* We don't lock the tree block, it's OK to be racy here */
6480 		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6481 					       &refs, &flags);
6482 		/* We don't care about errors in readahead. */
6483 		if (ret < 0)
6484 			continue;
6485 		BUG_ON(refs == 0);
6486 
6487 		if (wc->stage == DROP_REFERENCE) {
6488 			if (refs == 1)
6489 				goto reada;
6490 
6491 			if (wc->level == 1 &&
6492 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6493 				continue;
6494 			if (!wc->update_ref ||
6495 			    generation <= root->root_key.offset)
6496 				continue;
6497 			btrfs_node_key_to_cpu(eb, &key, slot);
6498 			ret = btrfs_comp_cpu_keys(&key,
6499 						  &wc->update_progress);
6500 			if (ret < 0)
6501 				continue;
6502 		} else {
6503 			if (wc->level == 1 &&
6504 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6505 				continue;
6506 		}
6507 reada:
6508 		ret = readahead_tree_block(root, bytenr, blocksize,
6509 					   generation);
6510 		if (ret)
6511 			break;
6512 		nread++;
6513 	}
6514 	wc->reada_slot = slot;
6515 }
6516 
6517 /*
6518  * hepler to process tree block while walking down the tree.
6519  *
6520  * when wc->stage == UPDATE_BACKREF, this function updates
6521  * back refs for pointers in the block.
6522  *
6523  * NOTE: return value 1 means we should stop walking down.
6524  */
6525 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6526 				   struct btrfs_root *root,
6527 				   struct btrfs_path *path,
6528 				   struct walk_control *wc, int lookup_info)
6529 {
6530 	int level = wc->level;
6531 	struct extent_buffer *eb = path->nodes[level];
6532 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6533 	int ret;
6534 
6535 	if (wc->stage == UPDATE_BACKREF &&
6536 	    btrfs_header_owner(eb) != root->root_key.objectid)
6537 		return 1;
6538 
6539 	/*
6540 	 * when reference count of tree block is 1, it won't increase
6541 	 * again. once full backref flag is set, we never clear it.
6542 	 */
6543 	if (lookup_info &&
6544 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6545 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6546 		BUG_ON(!path->locks[level]);
6547 		ret = btrfs_lookup_extent_info(trans, root,
6548 					       eb->start, eb->len,
6549 					       &wc->refs[level],
6550 					       &wc->flags[level]);
6551 		BUG_ON(ret == -ENOMEM);
6552 		if (ret)
6553 			return ret;
6554 		BUG_ON(wc->refs[level] == 0);
6555 	}
6556 
6557 	if (wc->stage == DROP_REFERENCE) {
6558 		if (wc->refs[level] > 1)
6559 			return 1;
6560 
6561 		if (path->locks[level] && !wc->keep_locks) {
6562 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6563 			path->locks[level] = 0;
6564 		}
6565 		return 0;
6566 	}
6567 
6568 	/* wc->stage == UPDATE_BACKREF */
6569 	if (!(wc->flags[level] & flag)) {
6570 		BUG_ON(!path->locks[level]);
6571 		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6572 		BUG_ON(ret); /* -ENOMEM */
6573 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6574 		BUG_ON(ret); /* -ENOMEM */
6575 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6576 						  eb->len, flag, 0);
6577 		BUG_ON(ret); /* -ENOMEM */
6578 		wc->flags[level] |= flag;
6579 	}
6580 
6581 	/*
6582 	 * the block is shared by multiple trees, so it's not good to
6583 	 * keep the tree lock
6584 	 */
6585 	if (path->locks[level] && level > 0) {
6586 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6587 		path->locks[level] = 0;
6588 	}
6589 	return 0;
6590 }
6591 
6592 /*
6593  * hepler to process tree block pointer.
6594  *
6595  * when wc->stage == DROP_REFERENCE, this function checks
6596  * reference count of the block pointed to. if the block
6597  * is shared and we need update back refs for the subtree
6598  * rooted at the block, this function changes wc->stage to
6599  * UPDATE_BACKREF. if the block is shared and there is no
6600  * need to update back, this function drops the reference
6601  * to the block.
6602  *
6603  * NOTE: return value 1 means we should stop walking down.
6604  */
6605 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6606 				 struct btrfs_root *root,
6607 				 struct btrfs_path *path,
6608 				 struct walk_control *wc, int *lookup_info)
6609 {
6610 	u64 bytenr;
6611 	u64 generation;
6612 	u64 parent;
6613 	u32 blocksize;
6614 	struct btrfs_key key;
6615 	struct extent_buffer *next;
6616 	int level = wc->level;
6617 	int reada = 0;
6618 	int ret = 0;
6619 
6620 	generation = btrfs_node_ptr_generation(path->nodes[level],
6621 					       path->slots[level]);
6622 	/*
6623 	 * if the lower level block was created before the snapshot
6624 	 * was created, we know there is no need to update back refs
6625 	 * for the subtree
6626 	 */
6627 	if (wc->stage == UPDATE_BACKREF &&
6628 	    generation <= root->root_key.offset) {
6629 		*lookup_info = 1;
6630 		return 1;
6631 	}
6632 
6633 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6634 	blocksize = btrfs_level_size(root, level - 1);
6635 
6636 	next = btrfs_find_tree_block(root, bytenr, blocksize);
6637 	if (!next) {
6638 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6639 		if (!next)
6640 			return -ENOMEM;
6641 		reada = 1;
6642 	}
6643 	btrfs_tree_lock(next);
6644 	btrfs_set_lock_blocking(next);
6645 
6646 	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6647 				       &wc->refs[level - 1],
6648 				       &wc->flags[level - 1]);
6649 	if (ret < 0) {
6650 		btrfs_tree_unlock(next);
6651 		return ret;
6652 	}
6653 
6654 	BUG_ON(wc->refs[level - 1] == 0);
6655 	*lookup_info = 0;
6656 
6657 	if (wc->stage == DROP_REFERENCE) {
6658 		if (wc->refs[level - 1] > 1) {
6659 			if (level == 1 &&
6660 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6661 				goto skip;
6662 
6663 			if (!wc->update_ref ||
6664 			    generation <= root->root_key.offset)
6665 				goto skip;
6666 
6667 			btrfs_node_key_to_cpu(path->nodes[level], &key,
6668 					      path->slots[level]);
6669 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6670 			if (ret < 0)
6671 				goto skip;
6672 
6673 			wc->stage = UPDATE_BACKREF;
6674 			wc->shared_level = level - 1;
6675 		}
6676 	} else {
6677 		if (level == 1 &&
6678 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6679 			goto skip;
6680 	}
6681 
6682 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
6683 		btrfs_tree_unlock(next);
6684 		free_extent_buffer(next);
6685 		next = NULL;
6686 		*lookup_info = 1;
6687 	}
6688 
6689 	if (!next) {
6690 		if (reada && level == 1)
6691 			reada_walk_down(trans, root, wc, path);
6692 		next = read_tree_block(root, bytenr, blocksize, generation);
6693 		if (!next)
6694 			return -EIO;
6695 		btrfs_tree_lock(next);
6696 		btrfs_set_lock_blocking(next);
6697 	}
6698 
6699 	level--;
6700 	BUG_ON(level != btrfs_header_level(next));
6701 	path->nodes[level] = next;
6702 	path->slots[level] = 0;
6703 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6704 	wc->level = level;
6705 	if (wc->level == 1)
6706 		wc->reada_slot = 0;
6707 	return 0;
6708 skip:
6709 	wc->refs[level - 1] = 0;
6710 	wc->flags[level - 1] = 0;
6711 	if (wc->stage == DROP_REFERENCE) {
6712 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6713 			parent = path->nodes[level]->start;
6714 		} else {
6715 			BUG_ON(root->root_key.objectid !=
6716 			       btrfs_header_owner(path->nodes[level]));
6717 			parent = 0;
6718 		}
6719 
6720 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6721 				root->root_key.objectid, level - 1, 0, 0);
6722 		BUG_ON(ret); /* -ENOMEM */
6723 	}
6724 	btrfs_tree_unlock(next);
6725 	free_extent_buffer(next);
6726 	*lookup_info = 1;
6727 	return 1;
6728 }
6729 
6730 /*
6731  * hepler to process tree block while walking up the tree.
6732  *
6733  * when wc->stage == DROP_REFERENCE, this function drops
6734  * reference count on the block.
6735  *
6736  * when wc->stage == UPDATE_BACKREF, this function changes
6737  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6738  * to UPDATE_BACKREF previously while processing the block.
6739  *
6740  * NOTE: return value 1 means we should stop walking up.
6741  */
6742 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6743 				 struct btrfs_root *root,
6744 				 struct btrfs_path *path,
6745 				 struct walk_control *wc)
6746 {
6747 	int ret;
6748 	int level = wc->level;
6749 	struct extent_buffer *eb = path->nodes[level];
6750 	u64 parent = 0;
6751 
6752 	if (wc->stage == UPDATE_BACKREF) {
6753 		BUG_ON(wc->shared_level < level);
6754 		if (level < wc->shared_level)
6755 			goto out;
6756 
6757 		ret = find_next_key(path, level + 1, &wc->update_progress);
6758 		if (ret > 0)
6759 			wc->update_ref = 0;
6760 
6761 		wc->stage = DROP_REFERENCE;
6762 		wc->shared_level = -1;
6763 		path->slots[level] = 0;
6764 
6765 		/*
6766 		 * check reference count again if the block isn't locked.
6767 		 * we should start walking down the tree again if reference
6768 		 * count is one.
6769 		 */
6770 		if (!path->locks[level]) {
6771 			BUG_ON(level == 0);
6772 			btrfs_tree_lock(eb);
6773 			btrfs_set_lock_blocking(eb);
6774 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6775 
6776 			ret = btrfs_lookup_extent_info(trans, root,
6777 						       eb->start, eb->len,
6778 						       &wc->refs[level],
6779 						       &wc->flags[level]);
6780 			if (ret < 0) {
6781 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6782 				return ret;
6783 			}
6784 			BUG_ON(wc->refs[level] == 0);
6785 			if (wc->refs[level] == 1) {
6786 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6787 				return 1;
6788 			}
6789 		}
6790 	}
6791 
6792 	/* wc->stage == DROP_REFERENCE */
6793 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6794 
6795 	if (wc->refs[level] == 1) {
6796 		if (level == 0) {
6797 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6798 				ret = btrfs_dec_ref(trans, root, eb, 1,
6799 						    wc->for_reloc);
6800 			else
6801 				ret = btrfs_dec_ref(trans, root, eb, 0,
6802 						    wc->for_reloc);
6803 			BUG_ON(ret); /* -ENOMEM */
6804 		}
6805 		/* make block locked assertion in clean_tree_block happy */
6806 		if (!path->locks[level] &&
6807 		    btrfs_header_generation(eb) == trans->transid) {
6808 			btrfs_tree_lock(eb);
6809 			btrfs_set_lock_blocking(eb);
6810 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6811 		}
6812 		clean_tree_block(trans, root, eb);
6813 	}
6814 
6815 	if (eb == root->node) {
6816 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6817 			parent = eb->start;
6818 		else
6819 			BUG_ON(root->root_key.objectid !=
6820 			       btrfs_header_owner(eb));
6821 	} else {
6822 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6823 			parent = path->nodes[level + 1]->start;
6824 		else
6825 			BUG_ON(root->root_key.objectid !=
6826 			       btrfs_header_owner(path->nodes[level + 1]));
6827 	}
6828 
6829 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6830 out:
6831 	wc->refs[level] = 0;
6832 	wc->flags[level] = 0;
6833 	return 0;
6834 }
6835 
6836 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6837 				   struct btrfs_root *root,
6838 				   struct btrfs_path *path,
6839 				   struct walk_control *wc)
6840 {
6841 	int level = wc->level;
6842 	int lookup_info = 1;
6843 	int ret;
6844 
6845 	while (level >= 0) {
6846 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
6847 		if (ret > 0)
6848 			break;
6849 
6850 		if (level == 0)
6851 			break;
6852 
6853 		if (path->slots[level] >=
6854 		    btrfs_header_nritems(path->nodes[level]))
6855 			break;
6856 
6857 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
6858 		if (ret > 0) {
6859 			path->slots[level]++;
6860 			continue;
6861 		} else if (ret < 0)
6862 			return ret;
6863 		level = wc->level;
6864 	}
6865 	return 0;
6866 }
6867 
6868 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6869 				 struct btrfs_root *root,
6870 				 struct btrfs_path *path,
6871 				 struct walk_control *wc, int max_level)
6872 {
6873 	int level = wc->level;
6874 	int ret;
6875 
6876 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6877 	while (level < max_level && path->nodes[level]) {
6878 		wc->level = level;
6879 		if (path->slots[level] + 1 <
6880 		    btrfs_header_nritems(path->nodes[level])) {
6881 			path->slots[level]++;
6882 			return 0;
6883 		} else {
6884 			ret = walk_up_proc(trans, root, path, wc);
6885 			if (ret > 0)
6886 				return 0;
6887 
6888 			if (path->locks[level]) {
6889 				btrfs_tree_unlock_rw(path->nodes[level],
6890 						     path->locks[level]);
6891 				path->locks[level] = 0;
6892 			}
6893 			free_extent_buffer(path->nodes[level]);
6894 			path->nodes[level] = NULL;
6895 			level++;
6896 		}
6897 	}
6898 	return 1;
6899 }
6900 
6901 /*
6902  * drop a subvolume tree.
6903  *
6904  * this function traverses the tree freeing any blocks that only
6905  * referenced by the tree.
6906  *
6907  * when a shared tree block is found. this function decreases its
6908  * reference count by one. if update_ref is true, this function
6909  * also make sure backrefs for the shared block and all lower level
6910  * blocks are properly updated.
6911  */
6912 int btrfs_drop_snapshot(struct btrfs_root *root,
6913 			 struct btrfs_block_rsv *block_rsv, int update_ref,
6914 			 int for_reloc)
6915 {
6916 	struct btrfs_path *path;
6917 	struct btrfs_trans_handle *trans;
6918 	struct btrfs_root *tree_root = root->fs_info->tree_root;
6919 	struct btrfs_root_item *root_item = &root->root_item;
6920 	struct walk_control *wc;
6921 	struct btrfs_key key;
6922 	int err = 0;
6923 	int ret;
6924 	int level;
6925 
6926 	path = btrfs_alloc_path();
6927 	if (!path) {
6928 		err = -ENOMEM;
6929 		goto out;
6930 	}
6931 
6932 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6933 	if (!wc) {
6934 		btrfs_free_path(path);
6935 		err = -ENOMEM;
6936 		goto out;
6937 	}
6938 
6939 	trans = btrfs_start_transaction(tree_root, 0);
6940 	if (IS_ERR(trans)) {
6941 		err = PTR_ERR(trans);
6942 		goto out_free;
6943 	}
6944 
6945 	if (block_rsv)
6946 		trans->block_rsv = block_rsv;
6947 
6948 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6949 		level = btrfs_header_level(root->node);
6950 		path->nodes[level] = btrfs_lock_root_node(root);
6951 		btrfs_set_lock_blocking(path->nodes[level]);
6952 		path->slots[level] = 0;
6953 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6954 		memset(&wc->update_progress, 0,
6955 		       sizeof(wc->update_progress));
6956 	} else {
6957 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6958 		memcpy(&wc->update_progress, &key,
6959 		       sizeof(wc->update_progress));
6960 
6961 		level = root_item->drop_level;
6962 		BUG_ON(level == 0);
6963 		path->lowest_level = level;
6964 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6965 		path->lowest_level = 0;
6966 		if (ret < 0) {
6967 			err = ret;
6968 			goto out_end_trans;
6969 		}
6970 		WARN_ON(ret > 0);
6971 
6972 		/*
6973 		 * unlock our path, this is safe because only this
6974 		 * function is allowed to delete this snapshot
6975 		 */
6976 		btrfs_unlock_up_safe(path, 0);
6977 
6978 		level = btrfs_header_level(root->node);
6979 		while (1) {
6980 			btrfs_tree_lock(path->nodes[level]);
6981 			btrfs_set_lock_blocking(path->nodes[level]);
6982 
6983 			ret = btrfs_lookup_extent_info(trans, root,
6984 						path->nodes[level]->start,
6985 						path->nodes[level]->len,
6986 						&wc->refs[level],
6987 						&wc->flags[level]);
6988 			if (ret < 0) {
6989 				err = ret;
6990 				goto out_end_trans;
6991 			}
6992 			BUG_ON(wc->refs[level] == 0);
6993 
6994 			if (level == root_item->drop_level)
6995 				break;
6996 
6997 			btrfs_tree_unlock(path->nodes[level]);
6998 			WARN_ON(wc->refs[level] != 1);
6999 			level--;
7000 		}
7001 	}
7002 
7003 	wc->level = level;
7004 	wc->shared_level = -1;
7005 	wc->stage = DROP_REFERENCE;
7006 	wc->update_ref = update_ref;
7007 	wc->keep_locks = 0;
7008 	wc->for_reloc = for_reloc;
7009 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7010 
7011 	while (1) {
7012 		ret = walk_down_tree(trans, root, path, wc);
7013 		if (ret < 0) {
7014 			err = ret;
7015 			break;
7016 		}
7017 
7018 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7019 		if (ret < 0) {
7020 			err = ret;
7021 			break;
7022 		}
7023 
7024 		if (ret > 0) {
7025 			BUG_ON(wc->stage != DROP_REFERENCE);
7026 			break;
7027 		}
7028 
7029 		if (wc->stage == DROP_REFERENCE) {
7030 			level = wc->level;
7031 			btrfs_node_key(path->nodes[level],
7032 				       &root_item->drop_progress,
7033 				       path->slots[level]);
7034 			root_item->drop_level = level;
7035 		}
7036 
7037 		BUG_ON(wc->level == 0);
7038 		if (btrfs_should_end_transaction(trans, tree_root)) {
7039 			ret = btrfs_update_root(trans, tree_root,
7040 						&root->root_key,
7041 						root_item);
7042 			if (ret) {
7043 				btrfs_abort_transaction(trans, tree_root, ret);
7044 				err = ret;
7045 				goto out_end_trans;
7046 			}
7047 
7048 			btrfs_end_transaction_throttle(trans, tree_root);
7049 			trans = btrfs_start_transaction(tree_root, 0);
7050 			if (IS_ERR(trans)) {
7051 				err = PTR_ERR(trans);
7052 				goto out_free;
7053 			}
7054 			if (block_rsv)
7055 				trans->block_rsv = block_rsv;
7056 		}
7057 	}
7058 	btrfs_release_path(path);
7059 	if (err)
7060 		goto out_end_trans;
7061 
7062 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
7063 	if (ret) {
7064 		btrfs_abort_transaction(trans, tree_root, ret);
7065 		goto out_end_trans;
7066 	}
7067 
7068 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7069 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7070 					   NULL, NULL);
7071 		if (ret < 0) {
7072 			btrfs_abort_transaction(trans, tree_root, ret);
7073 			err = ret;
7074 			goto out_end_trans;
7075 		} else if (ret > 0) {
7076 			/* if we fail to delete the orphan item this time
7077 			 * around, it'll get picked up the next time.
7078 			 *
7079 			 * The most common failure here is just -ENOENT.
7080 			 */
7081 			btrfs_del_orphan_item(trans, tree_root,
7082 					      root->root_key.objectid);
7083 		}
7084 	}
7085 
7086 	if (root->in_radix) {
7087 		btrfs_free_fs_root(tree_root->fs_info, root);
7088 	} else {
7089 		free_extent_buffer(root->node);
7090 		free_extent_buffer(root->commit_root);
7091 		kfree(root);
7092 	}
7093 out_end_trans:
7094 	btrfs_end_transaction_throttle(trans, tree_root);
7095 out_free:
7096 	kfree(wc);
7097 	btrfs_free_path(path);
7098 out:
7099 	if (err)
7100 		btrfs_std_error(root->fs_info, err);
7101 	return err;
7102 }
7103 
7104 /*
7105  * drop subtree rooted at tree block 'node'.
7106  *
7107  * NOTE: this function will unlock and release tree block 'node'
7108  * only used by relocation code
7109  */
7110 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7111 			struct btrfs_root *root,
7112 			struct extent_buffer *node,
7113 			struct extent_buffer *parent)
7114 {
7115 	struct btrfs_path *path;
7116 	struct walk_control *wc;
7117 	int level;
7118 	int parent_level;
7119 	int ret = 0;
7120 	int wret;
7121 
7122 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7123 
7124 	path = btrfs_alloc_path();
7125 	if (!path)
7126 		return -ENOMEM;
7127 
7128 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7129 	if (!wc) {
7130 		btrfs_free_path(path);
7131 		return -ENOMEM;
7132 	}
7133 
7134 	btrfs_assert_tree_locked(parent);
7135 	parent_level = btrfs_header_level(parent);
7136 	extent_buffer_get(parent);
7137 	path->nodes[parent_level] = parent;
7138 	path->slots[parent_level] = btrfs_header_nritems(parent);
7139 
7140 	btrfs_assert_tree_locked(node);
7141 	level = btrfs_header_level(node);
7142 	path->nodes[level] = node;
7143 	path->slots[level] = 0;
7144 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7145 
7146 	wc->refs[parent_level] = 1;
7147 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7148 	wc->level = level;
7149 	wc->shared_level = -1;
7150 	wc->stage = DROP_REFERENCE;
7151 	wc->update_ref = 0;
7152 	wc->keep_locks = 1;
7153 	wc->for_reloc = 1;
7154 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7155 
7156 	while (1) {
7157 		wret = walk_down_tree(trans, root, path, wc);
7158 		if (wret < 0) {
7159 			ret = wret;
7160 			break;
7161 		}
7162 
7163 		wret = walk_up_tree(trans, root, path, wc, parent_level);
7164 		if (wret < 0)
7165 			ret = wret;
7166 		if (wret != 0)
7167 			break;
7168 	}
7169 
7170 	kfree(wc);
7171 	btrfs_free_path(path);
7172 	return ret;
7173 }
7174 
7175 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7176 {
7177 	u64 num_devices;
7178 	u64 stripped;
7179 
7180 	/*
7181 	 * if restripe for this chunk_type is on pick target profile and
7182 	 * return, otherwise do the usual balance
7183 	 */
7184 	stripped = get_restripe_target(root->fs_info, flags);
7185 	if (stripped)
7186 		return extended_to_chunk(stripped);
7187 
7188 	/*
7189 	 * we add in the count of missing devices because we want
7190 	 * to make sure that any RAID levels on a degraded FS
7191 	 * continue to be honored.
7192 	 */
7193 	num_devices = root->fs_info->fs_devices->rw_devices +
7194 		root->fs_info->fs_devices->missing_devices;
7195 
7196 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
7197 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7198 
7199 	if (num_devices == 1) {
7200 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7201 		stripped = flags & ~stripped;
7202 
7203 		/* turn raid0 into single device chunks */
7204 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
7205 			return stripped;
7206 
7207 		/* turn mirroring into duplication */
7208 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7209 			     BTRFS_BLOCK_GROUP_RAID10))
7210 			return stripped | BTRFS_BLOCK_GROUP_DUP;
7211 	} else {
7212 		/* they already had raid on here, just return */
7213 		if (flags & stripped)
7214 			return flags;
7215 
7216 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7217 		stripped = flags & ~stripped;
7218 
7219 		/* switch duplicated blocks with raid1 */
7220 		if (flags & BTRFS_BLOCK_GROUP_DUP)
7221 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
7222 
7223 		/* this is drive concat, leave it alone */
7224 	}
7225 
7226 	return flags;
7227 }
7228 
7229 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7230 {
7231 	struct btrfs_space_info *sinfo = cache->space_info;
7232 	u64 num_bytes;
7233 	u64 min_allocable_bytes;
7234 	int ret = -ENOSPC;
7235 
7236 
7237 	/*
7238 	 * We need some metadata space and system metadata space for
7239 	 * allocating chunks in some corner cases until we force to set
7240 	 * it to be readonly.
7241 	 */
7242 	if ((sinfo->flags &
7243 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7244 	    !force)
7245 		min_allocable_bytes = 1 * 1024 * 1024;
7246 	else
7247 		min_allocable_bytes = 0;
7248 
7249 	spin_lock(&sinfo->lock);
7250 	spin_lock(&cache->lock);
7251 
7252 	if (cache->ro) {
7253 		ret = 0;
7254 		goto out;
7255 	}
7256 
7257 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7258 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7259 
7260 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7261 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7262 	    min_allocable_bytes <= sinfo->total_bytes) {
7263 		sinfo->bytes_readonly += num_bytes;
7264 		cache->ro = 1;
7265 		ret = 0;
7266 	}
7267 out:
7268 	spin_unlock(&cache->lock);
7269 	spin_unlock(&sinfo->lock);
7270 	return ret;
7271 }
7272 
7273 int btrfs_set_block_group_ro(struct btrfs_root *root,
7274 			     struct btrfs_block_group_cache *cache)
7275 
7276 {
7277 	struct btrfs_trans_handle *trans;
7278 	u64 alloc_flags;
7279 	int ret;
7280 
7281 	BUG_ON(cache->ro);
7282 
7283 	trans = btrfs_join_transaction(root);
7284 	if (IS_ERR(trans))
7285 		return PTR_ERR(trans);
7286 
7287 	alloc_flags = update_block_group_flags(root, cache->flags);
7288 	if (alloc_flags != cache->flags) {
7289 		ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7290 				     CHUNK_ALLOC_FORCE);
7291 		if (ret < 0)
7292 			goto out;
7293 	}
7294 
7295 	ret = set_block_group_ro(cache, 0);
7296 	if (!ret)
7297 		goto out;
7298 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7299 	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7300 			     CHUNK_ALLOC_FORCE);
7301 	if (ret < 0)
7302 		goto out;
7303 	ret = set_block_group_ro(cache, 0);
7304 out:
7305 	btrfs_end_transaction(trans, root);
7306 	return ret;
7307 }
7308 
7309 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7310 			    struct btrfs_root *root, u64 type)
7311 {
7312 	u64 alloc_flags = get_alloc_profile(root, type);
7313 	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
7314 			      CHUNK_ALLOC_FORCE);
7315 }
7316 
7317 /*
7318  * helper to account the unused space of all the readonly block group in the
7319  * list. takes mirrors into account.
7320  */
7321 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7322 {
7323 	struct btrfs_block_group_cache *block_group;
7324 	u64 free_bytes = 0;
7325 	int factor;
7326 
7327 	list_for_each_entry(block_group, groups_list, list) {
7328 		spin_lock(&block_group->lock);
7329 
7330 		if (!block_group->ro) {
7331 			spin_unlock(&block_group->lock);
7332 			continue;
7333 		}
7334 
7335 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7336 					  BTRFS_BLOCK_GROUP_RAID10 |
7337 					  BTRFS_BLOCK_GROUP_DUP))
7338 			factor = 2;
7339 		else
7340 			factor = 1;
7341 
7342 		free_bytes += (block_group->key.offset -
7343 			       btrfs_block_group_used(&block_group->item)) *
7344 			       factor;
7345 
7346 		spin_unlock(&block_group->lock);
7347 	}
7348 
7349 	return free_bytes;
7350 }
7351 
7352 /*
7353  * helper to account the unused space of all the readonly block group in the
7354  * space_info. takes mirrors into account.
7355  */
7356 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7357 {
7358 	int i;
7359 	u64 free_bytes = 0;
7360 
7361 	spin_lock(&sinfo->lock);
7362 
7363 	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7364 		if (!list_empty(&sinfo->block_groups[i]))
7365 			free_bytes += __btrfs_get_ro_block_group_free_space(
7366 						&sinfo->block_groups[i]);
7367 
7368 	spin_unlock(&sinfo->lock);
7369 
7370 	return free_bytes;
7371 }
7372 
7373 void btrfs_set_block_group_rw(struct btrfs_root *root,
7374 			      struct btrfs_block_group_cache *cache)
7375 {
7376 	struct btrfs_space_info *sinfo = cache->space_info;
7377 	u64 num_bytes;
7378 
7379 	BUG_ON(!cache->ro);
7380 
7381 	spin_lock(&sinfo->lock);
7382 	spin_lock(&cache->lock);
7383 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7384 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7385 	sinfo->bytes_readonly -= num_bytes;
7386 	cache->ro = 0;
7387 	spin_unlock(&cache->lock);
7388 	spin_unlock(&sinfo->lock);
7389 }
7390 
7391 /*
7392  * checks to see if its even possible to relocate this block group.
7393  *
7394  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7395  * ok to go ahead and try.
7396  */
7397 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7398 {
7399 	struct btrfs_block_group_cache *block_group;
7400 	struct btrfs_space_info *space_info;
7401 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7402 	struct btrfs_device *device;
7403 	u64 min_free;
7404 	u64 dev_min = 1;
7405 	u64 dev_nr = 0;
7406 	u64 target;
7407 	int index;
7408 	int full = 0;
7409 	int ret = 0;
7410 
7411 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7412 
7413 	/* odd, couldn't find the block group, leave it alone */
7414 	if (!block_group)
7415 		return -1;
7416 
7417 	min_free = btrfs_block_group_used(&block_group->item);
7418 
7419 	/* no bytes used, we're good */
7420 	if (!min_free)
7421 		goto out;
7422 
7423 	space_info = block_group->space_info;
7424 	spin_lock(&space_info->lock);
7425 
7426 	full = space_info->full;
7427 
7428 	/*
7429 	 * if this is the last block group we have in this space, we can't
7430 	 * relocate it unless we're able to allocate a new chunk below.
7431 	 *
7432 	 * Otherwise, we need to make sure we have room in the space to handle
7433 	 * all of the extents from this block group.  If we can, we're good
7434 	 */
7435 	if ((space_info->total_bytes != block_group->key.offset) &&
7436 	    (space_info->bytes_used + space_info->bytes_reserved +
7437 	     space_info->bytes_pinned + space_info->bytes_readonly +
7438 	     min_free < space_info->total_bytes)) {
7439 		spin_unlock(&space_info->lock);
7440 		goto out;
7441 	}
7442 	spin_unlock(&space_info->lock);
7443 
7444 	/*
7445 	 * ok we don't have enough space, but maybe we have free space on our
7446 	 * devices to allocate new chunks for relocation, so loop through our
7447 	 * alloc devices and guess if we have enough space.  if this block
7448 	 * group is going to be restriped, run checks against the target
7449 	 * profile instead of the current one.
7450 	 */
7451 	ret = -1;
7452 
7453 	/*
7454 	 * index:
7455 	 *      0: raid10
7456 	 *      1: raid1
7457 	 *      2: dup
7458 	 *      3: raid0
7459 	 *      4: single
7460 	 */
7461 	target = get_restripe_target(root->fs_info, block_group->flags);
7462 	if (target) {
7463 		index = __get_block_group_index(extended_to_chunk(target));
7464 	} else {
7465 		/*
7466 		 * this is just a balance, so if we were marked as full
7467 		 * we know there is no space for a new chunk
7468 		 */
7469 		if (full)
7470 			goto out;
7471 
7472 		index = get_block_group_index(block_group);
7473 	}
7474 
7475 	if (index == 0) {
7476 		dev_min = 4;
7477 		/* Divide by 2 */
7478 		min_free >>= 1;
7479 	} else if (index == 1) {
7480 		dev_min = 2;
7481 	} else if (index == 2) {
7482 		/* Multiply by 2 */
7483 		min_free <<= 1;
7484 	} else if (index == 3) {
7485 		dev_min = fs_devices->rw_devices;
7486 		do_div(min_free, dev_min);
7487 	}
7488 
7489 	mutex_lock(&root->fs_info->chunk_mutex);
7490 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7491 		u64 dev_offset;
7492 
7493 		/*
7494 		 * check to make sure we can actually find a chunk with enough
7495 		 * space to fit our block group in.
7496 		 */
7497 		if (device->total_bytes > device->bytes_used + min_free) {
7498 			ret = find_free_dev_extent(device, min_free,
7499 						   &dev_offset, NULL);
7500 			if (!ret)
7501 				dev_nr++;
7502 
7503 			if (dev_nr >= dev_min)
7504 				break;
7505 
7506 			ret = -1;
7507 		}
7508 	}
7509 	mutex_unlock(&root->fs_info->chunk_mutex);
7510 out:
7511 	btrfs_put_block_group(block_group);
7512 	return ret;
7513 }
7514 
7515 static int find_first_block_group(struct btrfs_root *root,
7516 		struct btrfs_path *path, struct btrfs_key *key)
7517 {
7518 	int ret = 0;
7519 	struct btrfs_key found_key;
7520 	struct extent_buffer *leaf;
7521 	int slot;
7522 
7523 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7524 	if (ret < 0)
7525 		goto out;
7526 
7527 	while (1) {
7528 		slot = path->slots[0];
7529 		leaf = path->nodes[0];
7530 		if (slot >= btrfs_header_nritems(leaf)) {
7531 			ret = btrfs_next_leaf(root, path);
7532 			if (ret == 0)
7533 				continue;
7534 			if (ret < 0)
7535 				goto out;
7536 			break;
7537 		}
7538 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7539 
7540 		if (found_key.objectid >= key->objectid &&
7541 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7542 			ret = 0;
7543 			goto out;
7544 		}
7545 		path->slots[0]++;
7546 	}
7547 out:
7548 	return ret;
7549 }
7550 
7551 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7552 {
7553 	struct btrfs_block_group_cache *block_group;
7554 	u64 last = 0;
7555 
7556 	while (1) {
7557 		struct inode *inode;
7558 
7559 		block_group = btrfs_lookup_first_block_group(info, last);
7560 		while (block_group) {
7561 			spin_lock(&block_group->lock);
7562 			if (block_group->iref)
7563 				break;
7564 			spin_unlock(&block_group->lock);
7565 			block_group = next_block_group(info->tree_root,
7566 						       block_group);
7567 		}
7568 		if (!block_group) {
7569 			if (last == 0)
7570 				break;
7571 			last = 0;
7572 			continue;
7573 		}
7574 
7575 		inode = block_group->inode;
7576 		block_group->iref = 0;
7577 		block_group->inode = NULL;
7578 		spin_unlock(&block_group->lock);
7579 		iput(inode);
7580 		last = block_group->key.objectid + block_group->key.offset;
7581 		btrfs_put_block_group(block_group);
7582 	}
7583 }
7584 
7585 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7586 {
7587 	struct btrfs_block_group_cache *block_group;
7588 	struct btrfs_space_info *space_info;
7589 	struct btrfs_caching_control *caching_ctl;
7590 	struct rb_node *n;
7591 
7592 	down_write(&info->extent_commit_sem);
7593 	while (!list_empty(&info->caching_block_groups)) {
7594 		caching_ctl = list_entry(info->caching_block_groups.next,
7595 					 struct btrfs_caching_control, list);
7596 		list_del(&caching_ctl->list);
7597 		put_caching_control(caching_ctl);
7598 	}
7599 	up_write(&info->extent_commit_sem);
7600 
7601 	spin_lock(&info->block_group_cache_lock);
7602 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7603 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7604 				       cache_node);
7605 		rb_erase(&block_group->cache_node,
7606 			 &info->block_group_cache_tree);
7607 		spin_unlock(&info->block_group_cache_lock);
7608 
7609 		down_write(&block_group->space_info->groups_sem);
7610 		list_del(&block_group->list);
7611 		up_write(&block_group->space_info->groups_sem);
7612 
7613 		if (block_group->cached == BTRFS_CACHE_STARTED)
7614 			wait_block_group_cache_done(block_group);
7615 
7616 		/*
7617 		 * We haven't cached this block group, which means we could
7618 		 * possibly have excluded extents on this block group.
7619 		 */
7620 		if (block_group->cached == BTRFS_CACHE_NO)
7621 			free_excluded_extents(info->extent_root, block_group);
7622 
7623 		btrfs_remove_free_space_cache(block_group);
7624 		btrfs_put_block_group(block_group);
7625 
7626 		spin_lock(&info->block_group_cache_lock);
7627 	}
7628 	spin_unlock(&info->block_group_cache_lock);
7629 
7630 	/* now that all the block groups are freed, go through and
7631 	 * free all the space_info structs.  This is only called during
7632 	 * the final stages of unmount, and so we know nobody is
7633 	 * using them.  We call synchronize_rcu() once before we start,
7634 	 * just to be on the safe side.
7635 	 */
7636 	synchronize_rcu();
7637 
7638 	release_global_block_rsv(info);
7639 
7640 	while(!list_empty(&info->space_info)) {
7641 		space_info = list_entry(info->space_info.next,
7642 					struct btrfs_space_info,
7643 					list);
7644 		if (space_info->bytes_pinned > 0 ||
7645 		    space_info->bytes_reserved > 0 ||
7646 		    space_info->bytes_may_use > 0) {
7647 			WARN_ON(1);
7648 			dump_space_info(space_info, 0, 0);
7649 		}
7650 		list_del(&space_info->list);
7651 		kfree(space_info);
7652 	}
7653 	return 0;
7654 }
7655 
7656 static void __link_block_group(struct btrfs_space_info *space_info,
7657 			       struct btrfs_block_group_cache *cache)
7658 {
7659 	int index = get_block_group_index(cache);
7660 
7661 	down_write(&space_info->groups_sem);
7662 	list_add_tail(&cache->list, &space_info->block_groups[index]);
7663 	up_write(&space_info->groups_sem);
7664 }
7665 
7666 int btrfs_read_block_groups(struct btrfs_root *root)
7667 {
7668 	struct btrfs_path *path;
7669 	int ret;
7670 	struct btrfs_block_group_cache *cache;
7671 	struct btrfs_fs_info *info = root->fs_info;
7672 	struct btrfs_space_info *space_info;
7673 	struct btrfs_key key;
7674 	struct btrfs_key found_key;
7675 	struct extent_buffer *leaf;
7676 	int need_clear = 0;
7677 	u64 cache_gen;
7678 
7679 	root = info->extent_root;
7680 	key.objectid = 0;
7681 	key.offset = 0;
7682 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7683 	path = btrfs_alloc_path();
7684 	if (!path)
7685 		return -ENOMEM;
7686 	path->reada = 1;
7687 
7688 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7689 	if (btrfs_test_opt(root, SPACE_CACHE) &&
7690 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7691 		need_clear = 1;
7692 	if (btrfs_test_opt(root, CLEAR_CACHE))
7693 		need_clear = 1;
7694 
7695 	while (1) {
7696 		ret = find_first_block_group(root, path, &key);
7697 		if (ret > 0)
7698 			break;
7699 		if (ret != 0)
7700 			goto error;
7701 		leaf = path->nodes[0];
7702 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7703 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7704 		if (!cache) {
7705 			ret = -ENOMEM;
7706 			goto error;
7707 		}
7708 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7709 						GFP_NOFS);
7710 		if (!cache->free_space_ctl) {
7711 			kfree(cache);
7712 			ret = -ENOMEM;
7713 			goto error;
7714 		}
7715 
7716 		atomic_set(&cache->count, 1);
7717 		spin_lock_init(&cache->lock);
7718 		cache->fs_info = info;
7719 		INIT_LIST_HEAD(&cache->list);
7720 		INIT_LIST_HEAD(&cache->cluster_list);
7721 
7722 		if (need_clear) {
7723 			/*
7724 			 * When we mount with old space cache, we need to
7725 			 * set BTRFS_DC_CLEAR and set dirty flag.
7726 			 *
7727 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7728 			 *    truncate the old free space cache inode and
7729 			 *    setup a new one.
7730 			 * b) Setting 'dirty flag' makes sure that we flush
7731 			 *    the new space cache info onto disk.
7732 			 */
7733 			cache->disk_cache_state = BTRFS_DC_CLEAR;
7734 			if (btrfs_test_opt(root, SPACE_CACHE))
7735 				cache->dirty = 1;
7736 		}
7737 
7738 		read_extent_buffer(leaf, &cache->item,
7739 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7740 				   sizeof(cache->item));
7741 		memcpy(&cache->key, &found_key, sizeof(found_key));
7742 
7743 		key.objectid = found_key.objectid + found_key.offset;
7744 		btrfs_release_path(path);
7745 		cache->flags = btrfs_block_group_flags(&cache->item);
7746 		cache->sectorsize = root->sectorsize;
7747 
7748 		btrfs_init_free_space_ctl(cache);
7749 
7750 		/*
7751 		 * We need to exclude the super stripes now so that the space
7752 		 * info has super bytes accounted for, otherwise we'll think
7753 		 * we have more space than we actually do.
7754 		 */
7755 		exclude_super_stripes(root, cache);
7756 
7757 		/*
7758 		 * check for two cases, either we are full, and therefore
7759 		 * don't need to bother with the caching work since we won't
7760 		 * find any space, or we are empty, and we can just add all
7761 		 * the space in and be done with it.  This saves us _alot_ of
7762 		 * time, particularly in the full case.
7763 		 */
7764 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7765 			cache->last_byte_to_unpin = (u64)-1;
7766 			cache->cached = BTRFS_CACHE_FINISHED;
7767 			free_excluded_extents(root, cache);
7768 		} else if (btrfs_block_group_used(&cache->item) == 0) {
7769 			cache->last_byte_to_unpin = (u64)-1;
7770 			cache->cached = BTRFS_CACHE_FINISHED;
7771 			add_new_free_space(cache, root->fs_info,
7772 					   found_key.objectid,
7773 					   found_key.objectid +
7774 					   found_key.offset);
7775 			free_excluded_extents(root, cache);
7776 		}
7777 
7778 		ret = update_space_info(info, cache->flags, found_key.offset,
7779 					btrfs_block_group_used(&cache->item),
7780 					&space_info);
7781 		BUG_ON(ret); /* -ENOMEM */
7782 		cache->space_info = space_info;
7783 		spin_lock(&cache->space_info->lock);
7784 		cache->space_info->bytes_readonly += cache->bytes_super;
7785 		spin_unlock(&cache->space_info->lock);
7786 
7787 		__link_block_group(space_info, cache);
7788 
7789 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7790 		BUG_ON(ret); /* Logic error */
7791 
7792 		set_avail_alloc_bits(root->fs_info, cache->flags);
7793 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7794 			set_block_group_ro(cache, 1);
7795 	}
7796 
7797 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7798 		if (!(get_alloc_profile(root, space_info->flags) &
7799 		      (BTRFS_BLOCK_GROUP_RAID10 |
7800 		       BTRFS_BLOCK_GROUP_RAID1 |
7801 		       BTRFS_BLOCK_GROUP_DUP)))
7802 			continue;
7803 		/*
7804 		 * avoid allocating from un-mirrored block group if there are
7805 		 * mirrored block groups.
7806 		 */
7807 		list_for_each_entry(cache, &space_info->block_groups[3], list)
7808 			set_block_group_ro(cache, 1);
7809 		list_for_each_entry(cache, &space_info->block_groups[4], list)
7810 			set_block_group_ro(cache, 1);
7811 	}
7812 
7813 	init_global_block_rsv(info);
7814 	ret = 0;
7815 error:
7816 	btrfs_free_path(path);
7817 	return ret;
7818 }
7819 
7820 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7821 			   struct btrfs_root *root, u64 bytes_used,
7822 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7823 			   u64 size)
7824 {
7825 	int ret;
7826 	struct btrfs_root *extent_root;
7827 	struct btrfs_block_group_cache *cache;
7828 
7829 	extent_root = root->fs_info->extent_root;
7830 
7831 	root->fs_info->last_trans_log_full_commit = trans->transid;
7832 
7833 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7834 	if (!cache)
7835 		return -ENOMEM;
7836 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7837 					GFP_NOFS);
7838 	if (!cache->free_space_ctl) {
7839 		kfree(cache);
7840 		return -ENOMEM;
7841 	}
7842 
7843 	cache->key.objectid = chunk_offset;
7844 	cache->key.offset = size;
7845 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7846 	cache->sectorsize = root->sectorsize;
7847 	cache->fs_info = root->fs_info;
7848 
7849 	atomic_set(&cache->count, 1);
7850 	spin_lock_init(&cache->lock);
7851 	INIT_LIST_HEAD(&cache->list);
7852 	INIT_LIST_HEAD(&cache->cluster_list);
7853 
7854 	btrfs_init_free_space_ctl(cache);
7855 
7856 	btrfs_set_block_group_used(&cache->item, bytes_used);
7857 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7858 	cache->flags = type;
7859 	btrfs_set_block_group_flags(&cache->item, type);
7860 
7861 	cache->last_byte_to_unpin = (u64)-1;
7862 	cache->cached = BTRFS_CACHE_FINISHED;
7863 	exclude_super_stripes(root, cache);
7864 
7865 	add_new_free_space(cache, root->fs_info, chunk_offset,
7866 			   chunk_offset + size);
7867 
7868 	free_excluded_extents(root, cache);
7869 
7870 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7871 				&cache->space_info);
7872 	BUG_ON(ret); /* -ENOMEM */
7873 	update_global_block_rsv(root->fs_info);
7874 
7875 	spin_lock(&cache->space_info->lock);
7876 	cache->space_info->bytes_readonly += cache->bytes_super;
7877 	spin_unlock(&cache->space_info->lock);
7878 
7879 	__link_block_group(cache->space_info, cache);
7880 
7881 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7882 	BUG_ON(ret); /* Logic error */
7883 
7884 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7885 				sizeof(cache->item));
7886 	if (ret) {
7887 		btrfs_abort_transaction(trans, extent_root, ret);
7888 		return ret;
7889 	}
7890 
7891 	set_avail_alloc_bits(extent_root->fs_info, type);
7892 
7893 	return 0;
7894 }
7895 
7896 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7897 {
7898 	u64 extra_flags = chunk_to_extended(flags) &
7899 				BTRFS_EXTENDED_PROFILE_MASK;
7900 
7901 	if (flags & BTRFS_BLOCK_GROUP_DATA)
7902 		fs_info->avail_data_alloc_bits &= ~extra_flags;
7903 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
7904 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7905 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7906 		fs_info->avail_system_alloc_bits &= ~extra_flags;
7907 }
7908 
7909 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7910 			     struct btrfs_root *root, u64 group_start)
7911 {
7912 	struct btrfs_path *path;
7913 	struct btrfs_block_group_cache *block_group;
7914 	struct btrfs_free_cluster *cluster;
7915 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7916 	struct btrfs_key key;
7917 	struct inode *inode;
7918 	int ret;
7919 	int index;
7920 	int factor;
7921 
7922 	root = root->fs_info->extent_root;
7923 
7924 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7925 	BUG_ON(!block_group);
7926 	BUG_ON(!block_group->ro);
7927 
7928 	/*
7929 	 * Free the reserved super bytes from this block group before
7930 	 * remove it.
7931 	 */
7932 	free_excluded_extents(root, block_group);
7933 
7934 	memcpy(&key, &block_group->key, sizeof(key));
7935 	index = get_block_group_index(block_group);
7936 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7937 				  BTRFS_BLOCK_GROUP_RAID1 |
7938 				  BTRFS_BLOCK_GROUP_RAID10))
7939 		factor = 2;
7940 	else
7941 		factor = 1;
7942 
7943 	/* make sure this block group isn't part of an allocation cluster */
7944 	cluster = &root->fs_info->data_alloc_cluster;
7945 	spin_lock(&cluster->refill_lock);
7946 	btrfs_return_cluster_to_free_space(block_group, cluster);
7947 	spin_unlock(&cluster->refill_lock);
7948 
7949 	/*
7950 	 * make sure this block group isn't part of a metadata
7951 	 * allocation cluster
7952 	 */
7953 	cluster = &root->fs_info->meta_alloc_cluster;
7954 	spin_lock(&cluster->refill_lock);
7955 	btrfs_return_cluster_to_free_space(block_group, cluster);
7956 	spin_unlock(&cluster->refill_lock);
7957 
7958 	path = btrfs_alloc_path();
7959 	if (!path) {
7960 		ret = -ENOMEM;
7961 		goto out;
7962 	}
7963 
7964 	inode = lookup_free_space_inode(tree_root, block_group, path);
7965 	if (!IS_ERR(inode)) {
7966 		ret = btrfs_orphan_add(trans, inode);
7967 		if (ret) {
7968 			btrfs_add_delayed_iput(inode);
7969 			goto out;
7970 		}
7971 		clear_nlink(inode);
7972 		/* One for the block groups ref */
7973 		spin_lock(&block_group->lock);
7974 		if (block_group->iref) {
7975 			block_group->iref = 0;
7976 			block_group->inode = NULL;
7977 			spin_unlock(&block_group->lock);
7978 			iput(inode);
7979 		} else {
7980 			spin_unlock(&block_group->lock);
7981 		}
7982 		/* One for our lookup ref */
7983 		btrfs_add_delayed_iput(inode);
7984 	}
7985 
7986 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7987 	key.offset = block_group->key.objectid;
7988 	key.type = 0;
7989 
7990 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7991 	if (ret < 0)
7992 		goto out;
7993 	if (ret > 0)
7994 		btrfs_release_path(path);
7995 	if (ret == 0) {
7996 		ret = btrfs_del_item(trans, tree_root, path);
7997 		if (ret)
7998 			goto out;
7999 		btrfs_release_path(path);
8000 	}
8001 
8002 	spin_lock(&root->fs_info->block_group_cache_lock);
8003 	rb_erase(&block_group->cache_node,
8004 		 &root->fs_info->block_group_cache_tree);
8005 	spin_unlock(&root->fs_info->block_group_cache_lock);
8006 
8007 	down_write(&block_group->space_info->groups_sem);
8008 	/*
8009 	 * we must use list_del_init so people can check to see if they
8010 	 * are still on the list after taking the semaphore
8011 	 */
8012 	list_del_init(&block_group->list);
8013 	if (list_empty(&block_group->space_info->block_groups[index]))
8014 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
8015 	up_write(&block_group->space_info->groups_sem);
8016 
8017 	if (block_group->cached == BTRFS_CACHE_STARTED)
8018 		wait_block_group_cache_done(block_group);
8019 
8020 	btrfs_remove_free_space_cache(block_group);
8021 
8022 	spin_lock(&block_group->space_info->lock);
8023 	block_group->space_info->total_bytes -= block_group->key.offset;
8024 	block_group->space_info->bytes_readonly -= block_group->key.offset;
8025 	block_group->space_info->disk_total -= block_group->key.offset * factor;
8026 	spin_unlock(&block_group->space_info->lock);
8027 
8028 	memcpy(&key, &block_group->key, sizeof(key));
8029 
8030 	btrfs_clear_space_info_full(root->fs_info);
8031 
8032 	btrfs_put_block_group(block_group);
8033 	btrfs_put_block_group(block_group);
8034 
8035 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8036 	if (ret > 0)
8037 		ret = -EIO;
8038 	if (ret < 0)
8039 		goto out;
8040 
8041 	ret = btrfs_del_item(trans, root, path);
8042 out:
8043 	btrfs_free_path(path);
8044 	return ret;
8045 }
8046 
8047 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8048 {
8049 	struct btrfs_space_info *space_info;
8050 	struct btrfs_super_block *disk_super;
8051 	u64 features;
8052 	u64 flags;
8053 	int mixed = 0;
8054 	int ret;
8055 
8056 	disk_super = fs_info->super_copy;
8057 	if (!btrfs_super_root(disk_super))
8058 		return 1;
8059 
8060 	features = btrfs_super_incompat_flags(disk_super);
8061 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8062 		mixed = 1;
8063 
8064 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
8065 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8066 	if (ret)
8067 		goto out;
8068 
8069 	if (mixed) {
8070 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8071 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8072 	} else {
8073 		flags = BTRFS_BLOCK_GROUP_METADATA;
8074 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8075 		if (ret)
8076 			goto out;
8077 
8078 		flags = BTRFS_BLOCK_GROUP_DATA;
8079 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8080 	}
8081 out:
8082 	return ret;
8083 }
8084 
8085 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8086 {
8087 	return unpin_extent_range(root, start, end);
8088 }
8089 
8090 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8091 			       u64 num_bytes, u64 *actual_bytes)
8092 {
8093 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8094 }
8095 
8096 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8097 {
8098 	struct btrfs_fs_info *fs_info = root->fs_info;
8099 	struct btrfs_block_group_cache *cache = NULL;
8100 	u64 group_trimmed;
8101 	u64 start;
8102 	u64 end;
8103 	u64 trimmed = 0;
8104 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8105 	int ret = 0;
8106 
8107 	/*
8108 	 * try to trim all FS space, our block group may start from non-zero.
8109 	 */
8110 	if (range->len == total_bytes)
8111 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
8112 	else
8113 		cache = btrfs_lookup_block_group(fs_info, range->start);
8114 
8115 	while (cache) {
8116 		if (cache->key.objectid >= (range->start + range->len)) {
8117 			btrfs_put_block_group(cache);
8118 			break;
8119 		}
8120 
8121 		start = max(range->start, cache->key.objectid);
8122 		end = min(range->start + range->len,
8123 				cache->key.objectid + cache->key.offset);
8124 
8125 		if (end - start >= range->minlen) {
8126 			if (!block_group_cache_done(cache)) {
8127 				ret = cache_block_group(cache, NULL, root, 0);
8128 				if (!ret)
8129 					wait_block_group_cache_done(cache);
8130 			}
8131 			ret = btrfs_trim_block_group(cache,
8132 						     &group_trimmed,
8133 						     start,
8134 						     end,
8135 						     range->minlen);
8136 
8137 			trimmed += group_trimmed;
8138 			if (ret) {
8139 				btrfs_put_block_group(cache);
8140 				break;
8141 			}
8142 		}
8143 
8144 		cache = next_block_group(fs_info->tree_root, cache);
8145 	}
8146 
8147 	range->len = trimmed;
8148 	return ret;
8149 }
8150