xref: /linux/fs/btrfs/extent-tree.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 
37 /* control flags for do_chunk_alloc's force field
38  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
39  * if we really need one.
40  *
41  * CHUNK_ALLOC_FORCE means it must try to allocate one
42  *
43  * CHUNK_ALLOC_LIMITED means to only try to allocate one
44  * if we have very few chunks already allocated.  This is
45  * used as part of the clustering code to help make sure
46  * we have a good pool of storage to cluster in, without
47  * filling the FS with empty chunks
48  *
49  */
50 enum {
51 	CHUNK_ALLOC_NO_FORCE = 0,
52 	CHUNK_ALLOC_FORCE = 1,
53 	CHUNK_ALLOC_LIMITED = 2,
54 };
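/*
 * Minimal illustrative sketch (not a call site from this file) of how the
 * force levels above are passed to do_chunk_alloc(), whose prototype is
 * declared further down.  The transaction handle, byte count and block
 * group flags here are hypothetical placeholders.
 */
#if 0	/* sketch only, not compiled */
static void example_chunk_alloc_force(struct btrfs_trans_handle *trans,
				      struct btrfs_root *extent_root)
{
	/* opportunistic: only allocate a chunk if we really need one */
	do_chunk_alloc(trans, extent_root, 2 * 1024 * 1024,
		       BTRFS_BLOCK_GROUP_METADATA, CHUNK_ALLOC_NO_FORCE);

	/* allocate early so the clustering code has a pool to work with */
	do_chunk_alloc(trans, extent_root, 2 * 1024 * 1024,
		       BTRFS_BLOCK_GROUP_METADATA, CHUNK_ALLOC_LIMITED);

	/* unconditionally try to allocate a new chunk */
	do_chunk_alloc(trans, extent_root, 2 * 1024 * 1024,
		       BTRFS_BLOCK_GROUP_METADATA, CHUNK_ALLOC_FORCE);
}
#endif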
55 
56 /*
57  * Control how reservations are dealt with.
58  *
59  * RESERVE_FREE - freeing a reservation.
60  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61  *   ENOSPC accounting
62  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63  *   bytes_may_use as the ENOSPC accounting is done elsewhere
64  */
65 enum {
66 	RESERVE_FREE = 0,
67 	RESERVE_ALLOC = 1,
68 	RESERVE_ALLOC_NO_ACCOUNT = 2,
69 };
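/*
 * Illustrative sketch (hypothetical caller, not from this file): the
 * reserve argument of btrfs_update_reserved_bytes(), declared below,
 * takes one of the RESERVE_* values above.
 */
#if 0	/* sketch only, not compiled */
static void example_reserve(struct btrfs_block_group_cache *cache)
{
	/* reserve 4k and update bytes_may_use for ENOSPC accounting */
	btrfs_update_reserved_bytes(cache, 4096, RESERVE_ALLOC);

	/* reserve 4k, ENOSPC accounting already handled elsewhere */
	btrfs_update_reserved_bytes(cache, 4096, RESERVE_ALLOC_NO_ACCOUNT);

	/* free the reservation again */
	btrfs_update_reserved_bytes(cache, 4096, RESERVE_FREE);
}
#endif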
70 
71 static int update_block_group(struct btrfs_trans_handle *trans,
72 			      struct btrfs_root *root,
73 			      u64 bytenr, u64 num_bytes, int alloc);
74 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
75 				struct btrfs_root *root,
76 				u64 bytenr, u64 num_bytes, u64 parent,
77 				u64 root_objectid, u64 owner_objectid,
78 				u64 owner_offset, int refs_to_drop,
79 				struct btrfs_delayed_extent_op *extra_op);
80 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
81 				    struct extent_buffer *leaf,
82 				    struct btrfs_extent_item *ei);
83 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84 				      struct btrfs_root *root,
85 				      u64 parent, u64 root_objectid,
86 				      u64 flags, u64 owner, u64 offset,
87 				      struct btrfs_key *ins, int ref_mod);
88 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
89 				     struct btrfs_root *root,
90 				     u64 parent, u64 root_objectid,
91 				     u64 flags, struct btrfs_disk_key *key,
92 				     int level, struct btrfs_key *ins);
93 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
94 			  struct btrfs_root *extent_root, u64 alloc_bytes,
95 			  u64 flags, int force);
96 static int find_next_key(struct btrfs_path *path, int level,
97 			 struct btrfs_key *key);
98 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
99 			    int dump_block_groups);
100 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 				       u64 num_bytes, int reserve);
102 
103 static noinline int
104 block_group_cache_done(struct btrfs_block_group_cache *cache)
105 {
106 	smp_mb();
107 	return cache->cached == BTRFS_CACHE_FINISHED;
108 }
109 
110 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
111 {
112 	return (cache->flags & bits) == bits;
113 }
114 
115 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
116 {
117 	atomic_inc(&cache->count);
118 }
119 
120 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
121 {
122 	if (atomic_dec_and_test(&cache->count)) {
123 		WARN_ON(cache->pinned > 0);
124 		WARN_ON(cache->reserved > 0);
125 		kfree(cache->free_space_ctl);
126 		kfree(cache);
127 	}
128 }
129 
130 /*
131  * this adds the block group to the fs_info rb tree for the block group
132  * cache
133  */
134 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
135 				struct btrfs_block_group_cache *block_group)
136 {
137 	struct rb_node **p;
138 	struct rb_node *parent = NULL;
139 	struct btrfs_block_group_cache *cache;
140 
141 	spin_lock(&info->block_group_cache_lock);
142 	p = &info->block_group_cache_tree.rb_node;
143 
144 	while (*p) {
145 		parent = *p;
146 		cache = rb_entry(parent, struct btrfs_block_group_cache,
147 				 cache_node);
148 		if (block_group->key.objectid < cache->key.objectid) {
149 			p = &(*p)->rb_left;
150 		} else if (block_group->key.objectid > cache->key.objectid) {
151 			p = &(*p)->rb_right;
152 		} else {
153 			spin_unlock(&info->block_group_cache_lock);
154 			return -EEXIST;
155 		}
156 	}
157 
158 	rb_link_node(&block_group->cache_node, parent, p);
159 	rb_insert_color(&block_group->cache_node,
160 			&info->block_group_cache_tree);
161 	spin_unlock(&info->block_group_cache_lock);
162 
163 	return 0;
164 }
165 
166 /*
167  * This will return the block group at or after bytenr if contains is 0, else
168  * it will return the block group that contains the bytenr
169  */
170 static struct btrfs_block_group_cache *
171 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
172 			      int contains)
173 {
174 	struct btrfs_block_group_cache *cache, *ret = NULL;
175 	struct rb_node *n;
176 	u64 end, start;
177 
178 	spin_lock(&info->block_group_cache_lock);
179 	n = info->block_group_cache_tree.rb_node;
180 
181 	while (n) {
182 		cache = rb_entry(n, struct btrfs_block_group_cache,
183 				 cache_node);
184 		end = cache->key.objectid + cache->key.offset - 1;
185 		start = cache->key.objectid;
186 
187 		if (bytenr < start) {
188 			if (!contains && (!ret || start < ret->key.objectid))
189 				ret = cache;
190 			n = n->rb_left;
191 		} else if (bytenr > start) {
192 			if (contains && bytenr <= end) {
193 				ret = cache;
194 				break;
195 			}
196 			n = n->rb_right;
197 		} else {
198 			ret = cache;
199 			break;
200 		}
201 	}
202 	if (ret)
203 		btrfs_get_block_group(ret);
204 	spin_unlock(&info->block_group_cache_lock);
205 
206 	return ret;
207 }
208 
209 static int add_excluded_extent(struct btrfs_root *root,
210 			       u64 start, u64 num_bytes)
211 {
212 	u64 end = start + num_bytes - 1;
213 	set_extent_bits(&root->fs_info->freed_extents[0],
214 			start, end, EXTENT_UPTODATE, GFP_NOFS);
215 	set_extent_bits(&root->fs_info->freed_extents[1],
216 			start, end, EXTENT_UPTODATE, GFP_NOFS);
217 	return 0;
218 }
219 
220 static void free_excluded_extents(struct btrfs_root *root,
221 				  struct btrfs_block_group_cache *cache)
222 {
223 	u64 start, end;
224 
225 	start = cache->key.objectid;
226 	end = start + cache->key.offset - 1;
227 
228 	clear_extent_bits(&root->fs_info->freed_extents[0],
229 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
230 	clear_extent_bits(&root->fs_info->freed_extents[1],
231 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
232 }
233 
234 static int exclude_super_stripes(struct btrfs_root *root,
235 				 struct btrfs_block_group_cache *cache)
236 {
237 	u64 bytenr;
238 	u64 *logical;
239 	int stripe_len;
240 	int i, nr, ret;
241 
242 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
243 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
244 		cache->bytes_super += stripe_len;
245 		ret = add_excluded_extent(root, cache->key.objectid,
246 					  stripe_len);
247 		BUG_ON(ret);
248 	}
249 
250 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
251 		bytenr = btrfs_sb_offset(i);
252 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
253 				       cache->key.objectid, bytenr,
254 				       0, &logical, &nr, &stripe_len);
255 		BUG_ON(ret);
256 
257 		while (nr--) {
258 			cache->bytes_super += stripe_len;
259 			ret = add_excluded_extent(root, logical[nr],
260 						  stripe_len);
261 			BUG_ON(ret);
262 		}
263 
264 		kfree(logical);
265 	}
266 	return 0;
267 }
268 
269 static struct btrfs_caching_control *
270 get_caching_control(struct btrfs_block_group_cache *cache)
271 {
272 	struct btrfs_caching_control *ctl;
273 
274 	spin_lock(&cache->lock);
275 	if (cache->cached != BTRFS_CACHE_STARTED) {
276 		spin_unlock(&cache->lock);
277 		return NULL;
278 	}
279 
280 	/* We're loading it the fast way, so we don't have a caching_ctl. */
281 	if (!cache->caching_ctl) {
282 		spin_unlock(&cache->lock);
283 		return NULL;
284 	}
285 
286 	ctl = cache->caching_ctl;
287 	atomic_inc(&ctl->count);
288 	spin_unlock(&cache->lock);
289 	return ctl;
290 }
291 
292 static void put_caching_control(struct btrfs_caching_control *ctl)
293 {
294 	if (atomic_dec_and_test(&ctl->count))
295 		kfree(ctl);
296 }
297 
298 /*
299  * This is only called by cache_block_group.  Since we could have freed
300  * extents, we need to check the pinned_extents for any extents that can't
301  * be used yet; their free space is only released once the transaction commits.
302  */
303 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
304 			      struct btrfs_fs_info *info, u64 start, u64 end)
305 {
306 	u64 extent_start, extent_end, size, total_added = 0;
307 	int ret;
308 
309 	while (start < end) {
310 		ret = find_first_extent_bit(info->pinned_extents, start,
311 					    &extent_start, &extent_end,
312 					    EXTENT_DIRTY | EXTENT_UPTODATE);
313 		if (ret)
314 			break;
315 
316 		if (extent_start <= start) {
317 			start = extent_end + 1;
318 		} else if (extent_start > start && extent_start < end) {
319 			size = extent_start - start;
320 			total_added += size;
321 			ret = btrfs_add_free_space(block_group, start,
322 						   size);
323 			BUG_ON(ret);
324 			start = extent_end + 1;
325 		} else {
326 			break;
327 		}
328 	}
329 
330 	if (start < end) {
331 		size = end - start;
332 		total_added += size;
333 		ret = btrfs_add_free_space(block_group, start, size);
334 		BUG_ON(ret);
335 	}
336 
337 	return total_added;
338 }
339 
340 static noinline void caching_thread(struct btrfs_work *work)
341 {
342 	struct btrfs_block_group_cache *block_group;
343 	struct btrfs_fs_info *fs_info;
344 	struct btrfs_caching_control *caching_ctl;
345 	struct btrfs_root *extent_root;
346 	struct btrfs_path *path;
347 	struct extent_buffer *leaf;
348 	struct btrfs_key key;
349 	u64 total_found = 0;
350 	u64 last = 0;
351 	u32 nritems;
352 	int ret = 0;
353 
354 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
355 	block_group = caching_ctl->block_group;
356 	fs_info = block_group->fs_info;
357 	extent_root = fs_info->extent_root;
358 
359 	path = btrfs_alloc_path();
360 	if (!path)
361 		goto out;
362 
363 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
364 
365 	/*
366 	 * We don't want to deadlock with somebody trying to allocate a new
367 	 * extent for the extent root while also trying to search the extent
368 	 * root to add free space.  So we skip locking and search the commit
369 	 * root, since it's read-only.
370 	 */
371 	path->skip_locking = 1;
372 	path->search_commit_root = 1;
373 	path->reada = 1;
374 
375 	key.objectid = last;
376 	key.offset = 0;
377 	key.type = BTRFS_EXTENT_ITEM_KEY;
378 again:
379 	mutex_lock(&caching_ctl->mutex);
380 	/* need to make sure the commit_root doesn't disappear */
381 	down_read(&fs_info->extent_commit_sem);
382 
383 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
384 	if (ret < 0)
385 		goto err;
386 
387 	leaf = path->nodes[0];
388 	nritems = btrfs_header_nritems(leaf);
389 
390 	while (1) {
391 		if (btrfs_fs_closing(fs_info) > 1) {
392 			last = (u64)-1;
393 			break;
394 		}
395 
396 		if (path->slots[0] < nritems) {
397 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
398 		} else {
399 			ret = find_next_key(path, 0, &key);
400 			if (ret)
401 				break;
402 
403 			if (need_resched() ||
404 			    btrfs_next_leaf(extent_root, path)) {
405 				caching_ctl->progress = last;
406 				btrfs_release_path(path);
407 				up_read(&fs_info->extent_commit_sem);
408 				mutex_unlock(&caching_ctl->mutex);
409 				cond_resched();
410 				goto again;
411 			}
412 			leaf = path->nodes[0];
413 			nritems = btrfs_header_nritems(leaf);
414 			continue;
415 		}
416 
417 		if (key.objectid < block_group->key.objectid) {
418 			path->slots[0]++;
419 			continue;
420 		}
421 
422 		if (key.objectid >= block_group->key.objectid +
423 		    block_group->key.offset)
424 			break;
425 
426 		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
427 			total_found += add_new_free_space(block_group,
428 							  fs_info, last,
429 							  key.objectid);
430 			last = key.objectid + key.offset;
431 
432 			if (total_found > (1024 * 1024 * 2)) {
433 				total_found = 0;
434 				wake_up(&caching_ctl->wait);
435 			}
436 		}
437 		path->slots[0]++;
438 	}
439 	ret = 0;
440 
441 	total_found += add_new_free_space(block_group, fs_info, last,
442 					  block_group->key.objectid +
443 					  block_group->key.offset);
444 	caching_ctl->progress = (u64)-1;
445 
446 	spin_lock(&block_group->lock);
447 	block_group->caching_ctl = NULL;
448 	block_group->cached = BTRFS_CACHE_FINISHED;
449 	spin_unlock(&block_group->lock);
450 
451 err:
452 	btrfs_free_path(path);
453 	up_read(&fs_info->extent_commit_sem);
454 
455 	free_excluded_extents(extent_root, block_group);
456 
457 	mutex_unlock(&caching_ctl->mutex);
458 out:
459 	wake_up(&caching_ctl->wait);
460 
461 	put_caching_control(caching_ctl);
462 	btrfs_put_block_group(block_group);
463 }
464 
465 static int cache_block_group(struct btrfs_block_group_cache *cache,
466 			     struct btrfs_trans_handle *trans,
467 			     struct btrfs_root *root,
468 			     int load_cache_only)
469 {
470 	struct btrfs_fs_info *fs_info = cache->fs_info;
471 	struct btrfs_caching_control *caching_ctl;
472 	int ret = 0;
473 
474 	smp_mb();
475 	if (cache->cached != BTRFS_CACHE_NO)
476 		return 0;
477 
478 	/*
479 	 * We can't do the read from on-disk cache during a commit since we need
480 	 * to have the normal tree locking.  Also, if we are currently trying to
481 	 * allocate blocks for the tree root, we can't do the fast caching since
482 	 * we likely hold important locks.
483 	 */
484 	if (trans && (!trans->transaction->in_commit) &&
485 	    (root && root != root->fs_info->tree_root) &&
486 	    btrfs_test_opt(root, SPACE_CACHE)) {
487 		spin_lock(&cache->lock);
488 		if (cache->cached != BTRFS_CACHE_NO) {
489 			spin_unlock(&cache->lock);
490 			return 0;
491 		}
492 		cache->cached = BTRFS_CACHE_STARTED;
493 		spin_unlock(&cache->lock);
494 
495 		ret = load_free_space_cache(fs_info, cache);
496 
497 		spin_lock(&cache->lock);
498 		if (ret == 1) {
499 			cache->cached = BTRFS_CACHE_FINISHED;
500 			cache->last_byte_to_unpin = (u64)-1;
501 		} else {
502 			cache->cached = BTRFS_CACHE_NO;
503 		}
504 		spin_unlock(&cache->lock);
505 		if (ret == 1) {
506 			free_excluded_extents(fs_info->extent_root, cache);
507 			return 0;
508 		}
509 	}
510 
511 	if (load_cache_only)
512 		return 0;
513 
514 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
515 	BUG_ON(!caching_ctl);
516 
517 	INIT_LIST_HEAD(&caching_ctl->list);
518 	mutex_init(&caching_ctl->mutex);
519 	init_waitqueue_head(&caching_ctl->wait);
520 	caching_ctl->block_group = cache;
521 	caching_ctl->progress = cache->key.objectid;
522 	/* one for caching kthread, one for caching block group list */
523 	atomic_set(&caching_ctl->count, 2);
524 	caching_ctl->work.func = caching_thread;
525 
526 	spin_lock(&cache->lock);
527 	if (cache->cached != BTRFS_CACHE_NO) {
528 		spin_unlock(&cache->lock);
529 		kfree(caching_ctl);
530 		return 0;
531 	}
532 	cache->caching_ctl = caching_ctl;
533 	cache->cached = BTRFS_CACHE_STARTED;
534 	spin_unlock(&cache->lock);
535 
536 	down_write(&fs_info->extent_commit_sem);
537 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
538 	up_write(&fs_info->extent_commit_sem);
539 
540 	btrfs_get_block_group(cache);
541 
542 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
543 
544 	return ret;
545 }
546 
547 /*
548  * return the block group that starts at or after bytenr
549  */
550 static struct btrfs_block_group_cache *
551 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
552 {
553 	struct btrfs_block_group_cache *cache;
554 
555 	cache = block_group_cache_tree_search(info, bytenr, 0);
556 
557 	return cache;
558 }
559 
560 /*
561  * return the block group that contains the given bytenr
562  */
563 struct btrfs_block_group_cache *btrfs_lookup_block_group(
564 						 struct btrfs_fs_info *info,
565 						 u64 bytenr)
566 {
567 	struct btrfs_block_group_cache *cache;
568 
569 	cache = block_group_cache_tree_search(info, bytenr, 1);
570 
571 	return cache;
572 }
573 
574 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
575 						  u64 flags)
576 {
577 	struct list_head *head = &info->space_info;
578 	struct btrfs_space_info *found;
579 
580 	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
581 		 BTRFS_BLOCK_GROUP_METADATA;
582 
583 	rcu_read_lock();
584 	list_for_each_entry_rcu(found, head, list) {
585 		if (found->flags & flags) {
586 			rcu_read_unlock();
587 			return found;
588 		}
589 	}
590 	rcu_read_unlock();
591 	return NULL;
592 }
593 
594 /*
595  * after adding space to the filesystem, we need to clear the full flags
596  * on all the space infos.
597  */
598 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
599 {
600 	struct list_head *head = &info->space_info;
601 	struct btrfs_space_info *found;
602 
603 	rcu_read_lock();
604 	list_for_each_entry_rcu(found, head, list)
605 		found->full = 0;
606 	rcu_read_unlock();
607 }
608 
609 static u64 div_factor(u64 num, int factor)
610 {
611 	if (factor == 10)
612 		return num;
613 	num *= factor;
614 	do_div(num, 10);
615 	return num;
616 }
617 
618 static u64 div_factor_fine(u64 num, int factor)
619 {
620 	if (factor == 100)
621 		return num;
622 	num *= factor;
623 	do_div(num, 100);
624 	return num;
625 }
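/*
 * Worked example (values chosen only for illustration): div_factor()
 * scales by tenths and div_factor_fine() by hundredths, so
 *
 *	div_factor(1024 * 1024, 9)       == 943718	(~90% of 1MB)
 *	div_factor_fine(1024 * 1024, 75) == 786432	(75% of 1MB)
 *
 * btrfs_find_block_group() below uses div_factor(cache->key.offset, factor)
 * as a "mostly full" threshold for a block group.
 */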
626 
627 u64 btrfs_find_block_group(struct btrfs_root *root,
628 			   u64 search_start, u64 search_hint, int owner)
629 {
630 	struct btrfs_block_group_cache *cache;
631 	u64 used;
632 	u64 last = max(search_hint, search_start);
633 	u64 group_start = 0;
634 	int full_search = 0;
635 	int factor = 9;
636 	int wrapped = 0;
637 again:
638 	while (1) {
639 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
640 		if (!cache)
641 			break;
642 
643 		spin_lock(&cache->lock);
644 		last = cache->key.objectid + cache->key.offset;
645 		used = btrfs_block_group_used(&cache->item);
646 
647 		if ((full_search || !cache->ro) &&
648 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
649 			if (used + cache->pinned + cache->reserved <
650 			    div_factor(cache->key.offset, factor)) {
651 				group_start = cache->key.objectid;
652 				spin_unlock(&cache->lock);
653 				btrfs_put_block_group(cache);
654 				goto found;
655 			}
656 		}
657 		spin_unlock(&cache->lock);
658 		btrfs_put_block_group(cache);
659 		cond_resched();
660 	}
661 	if (!wrapped) {
662 		last = search_start;
663 		wrapped = 1;
664 		goto again;
665 	}
666 	if (!full_search && factor < 10) {
667 		last = search_start;
668 		full_search = 1;
669 		factor = 10;
670 		goto again;
671 	}
672 found:
673 	return group_start;
674 }
675 
676 /* simple helper to search for an existing extent at a given offset */
677 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
678 {
679 	int ret;
680 	struct btrfs_key key;
681 	struct btrfs_path *path;
682 
683 	path = btrfs_alloc_path();
684 	if (!path)
685 		return -ENOMEM;
686 
687 	key.objectid = start;
688 	key.offset = len;
689 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
690 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
691 				0, 0);
692 	btrfs_free_path(path);
693 	return ret;
694 }
695 
696 /*
697  * helper function to look up the reference count and flags of an extent.
698  *
699  * the head node for a delayed ref is used to store the sum of all the
700  * reference count modifications queued up in the rbtree. the head
701  * node may also store the extent flags to set. This way you can check
702  * what the reference count and extent flags will be once all of the
703  * delayed refs have been processed, without running them.
704  */
705 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
706 			     struct btrfs_root *root, u64 bytenr,
707 			     u64 num_bytes, u64 *refs, u64 *flags)
708 {
709 	struct btrfs_delayed_ref_head *head;
710 	struct btrfs_delayed_ref_root *delayed_refs;
711 	struct btrfs_path *path;
712 	struct btrfs_extent_item *ei;
713 	struct extent_buffer *leaf;
714 	struct btrfs_key key;
715 	u32 item_size;
716 	u64 num_refs;
717 	u64 extent_flags;
718 	int ret;
719 
720 	path = btrfs_alloc_path();
721 	if (!path)
722 		return -ENOMEM;
723 
724 	key.objectid = bytenr;
725 	key.type = BTRFS_EXTENT_ITEM_KEY;
726 	key.offset = num_bytes;
727 	if (!trans) {
728 		path->skip_locking = 1;
729 		path->search_commit_root = 1;
730 	}
731 again:
732 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
733 				&key, path, 0, 0);
734 	if (ret < 0)
735 		goto out_free;
736 
737 	if (ret == 0) {
738 		leaf = path->nodes[0];
739 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
740 		if (item_size >= sizeof(*ei)) {
741 			ei = btrfs_item_ptr(leaf, path->slots[0],
742 					    struct btrfs_extent_item);
743 			num_refs = btrfs_extent_refs(leaf, ei);
744 			extent_flags = btrfs_extent_flags(leaf, ei);
745 		} else {
746 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
747 			struct btrfs_extent_item_v0 *ei0;
748 			BUG_ON(item_size != sizeof(*ei0));
749 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
750 					     struct btrfs_extent_item_v0);
751 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
752 			/* FIXME: this isn't correct for data */
753 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
754 #else
755 			BUG();
756 #endif
757 		}
758 		BUG_ON(num_refs == 0);
759 	} else {
760 		num_refs = 0;
761 		extent_flags = 0;
762 		ret = 0;
763 	}
764 
765 	if (!trans)
766 		goto out;
767 
768 	delayed_refs = &trans->transaction->delayed_refs;
769 	spin_lock(&delayed_refs->lock);
770 	head = btrfs_find_delayed_ref_head(trans, bytenr);
771 	if (head) {
772 		if (!mutex_trylock(&head->mutex)) {
773 			atomic_inc(&head->node.refs);
774 			spin_unlock(&delayed_refs->lock);
775 
776 			btrfs_release_path(path);
777 
778 			/*
779 			 * Mutex was contended, block until it's released and try
780 			 * again
781 			 */
782 			mutex_lock(&head->mutex);
783 			mutex_unlock(&head->mutex);
784 			btrfs_put_delayed_ref(&head->node);
785 			goto again;
786 		}
787 		if (head->extent_op && head->extent_op->update_flags)
788 			extent_flags |= head->extent_op->flags_to_set;
789 		else
790 			BUG_ON(num_refs == 0);
791 
792 		num_refs += head->node.ref_mod;
793 		mutex_unlock(&head->mutex);
794 	}
795 	spin_unlock(&delayed_refs->lock);
796 out:
797 	WARN_ON(num_refs == 0);
798 	if (refs)
799 		*refs = num_refs;
800 	if (flags)
801 		*flags = extent_flags;
802 out_free:
803 	btrfs_free_path(path);
804 	return ret;
805 }
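/*
 * Illustrative sketch of calling btrfs_lookup_extent_info() above; the
 * bytenr and size are made-up values.  With a NULL trans the lookup reads
 * the commit root and skips the delayed ref head entirely.
 */
#if 0	/* sketch only, not compiled */
static void example_lookup_extent_info(struct btrfs_root *root)
{
	u64 bytenr = 12582912;		/* hypothetical extent start */
	u64 num_bytes = 4096;		/* hypothetical extent size */
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(NULL, root, bytenr, num_bytes,
				       &refs, &flags);
	if (!ret && (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
		/* the extent is a tree block with "refs" references */
	}
}
#endif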
806 
807 /*
808  * Back reference rules.  Back refs have three main goals:
809  *
810  * 1) differentiate between all holders of references to an extent so that
811  *    when a reference is dropped we can make sure it was a valid reference
812  *    before freeing the extent.
813  *
814  * 2) Provide enough information to quickly find the holders of an extent
815  *    if we notice a given block is corrupted or bad.
816  *
817  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
818  *    maintenance.  This is actually the same as #2, but with a slightly
819  *    different use case.
820  *
821  * There are two kinds of back refs. Implicit back refs are optimized
822  * for pointers in non-shared tree blocks. For a given pointer in a block,
823  * back refs of this kind provide information about the block's owner tree
824  * and the pointer's key. This information allows us to find the block by
825  * b-tree searching. Full back refs are for pointers in tree blocks not
826  * referenced by their owner trees. The location of the tree block is
827  * recorded in the back ref. Full back refs are actually generic and can be
828  * used wherever implicit back refs are used. Their major shortcoming is
829  * overhead: every time a tree block gets COWed, the back ref entries for
830  * all pointers in it have to be updated.
831  *
832  * For a newly allocated tree block, we use implicit back refs for the
833  * pointers in it. This means most tree-related operations only involve
834  * implicit back refs. For a tree block created in an old transaction, the
835  * only way to drop a reference to it is to COW it, so we can detect the
836  * moment a tree block loses its owner tree's reference and do the
837  * back ref conversion.
838  *
839  * When a tree block is COW'd through a tree, there are four cases:
840  *
841  * The reference count of the block is one and the tree is the block's
842  * owner tree. Nothing to do in this case.
843  *
844  * The reference count of the block is one and the tree is not the
845  * block's owner tree. In this case, full back refs are used for pointers
846  * in the block. Remove these full back refs and add implicit back refs for
847  * every pointer in the new block.
848  *
849  * The reference count of the block is greater than one and the tree is
850  * the block's owner tree. In this case, implicit back refs are used for
851  * pointers in the block. Add full back refs for every pointer in the
852  * block and increase the lower level extents' reference counts. The
853  * original implicit back refs are inherited by the new block.
854  *
855  * The reference count of the block is greater than one and the tree is
856  * not the block's owner tree. Add implicit back refs for every pointer in
857  * the new block and increase the lower level extents' reference counts.
858  *
859  * Back Reference Key composing:
860  * Back Reference Key composition:
861  * The key objectid corresponds to the first byte in the extent,
862  * The key type is used to differentiate between types of back refs.
863  * There are different meanings of the key offset for different types
864  * of back refs.
865  *
866  * File extents can be referenced by:
867  *
868  * - multiple snapshots, subvolumes, or different generations in one subvol
869  * - different files inside a single subvolume
870  * - different offsets inside a file (bookend extents in file.c)
871  *
872  * The extent ref structure for the implicit back refs has fields for:
873  *
874  * - Objectid of the subvolume root
875  * - objectid of the file holding the reference
876  * - original offset in the file
877  * - how many bookend extents
878  *
879  * The key offset for the implicit back refs is the hash of the first
880  * three fields.
881  *
882  * The extent ref structure for the full back refs has a field for:
883  *
884  * - number of pointers in the tree leaf
885  *
886  * The key offset for the full back refs is the first byte of
887  * the tree leaf.
888  *
889  * When a file extent is allocated, the implicit back refs are used
890  * and the fields are filled in:
891  *
892  *     (root_key.objectid, inode objectid, offset in file, 1)
893  *
894  * When a file extent is removed during file truncation, we find the
895  * corresponding implicit back refs and check the following fields:
896  *
897  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
898  *
899  * Btree extents can be referenced by:
900  *
901  * - Different subvolumes
902  *
903  * Both the implicit back refs and the full back refs for tree blocks
904  * consist only of a key. The key offset for the implicit back refs is the
905  * objectid of the block's owner tree. The key offset for the full back
906  * refs is the first byte of the parent block.
907  *
908  * When implicit back refs are used, information about the lowest key and
909  * the level of the tree block is required. This information is stored in
910  * the tree block info structure.
911  */
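/*
 * A made-up worked example of the key composition described above: a data
 * extent at bytenr 12582912, referenced by inode 257 at file offset 0 in
 * subvolume 5, gets an implicit back ref keyed by the hash of those three
 * values; a reference held through a shared tree leaf at bytenr 30408704
 * gets a full back ref keyed by that leaf's bytenr.  lookup_extent_data_ref()
 * below builds keys in exactly this way.
 */
#if 0	/* sketch only, not compiled */
static void example_data_backref_keys(void)
{
	struct btrfs_key key;

	/* implicit back ref: hash of (root objectid, inode objectid, offset) */
	key.objectid = 12582912;			/* extent bytenr */
	key.type = BTRFS_EXTENT_DATA_REF_KEY;
	key.offset = hash_extent_data_ref(5, 257, 0);

	/* full back ref: keyed by the referencing leaf's bytenr */
	key.objectid = 12582912;
	key.type = BTRFS_SHARED_DATA_REF_KEY;
	key.offset = 30408704;				/* parent leaf bytenr */
}
#endif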
912 
913 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
914 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
915 				  struct btrfs_root *root,
916 				  struct btrfs_path *path,
917 				  u64 owner, u32 extra_size)
918 {
919 	struct btrfs_extent_item *item;
920 	struct btrfs_extent_item_v0 *ei0;
921 	struct btrfs_extent_ref_v0 *ref0;
922 	struct btrfs_tree_block_info *bi;
923 	struct extent_buffer *leaf;
924 	struct btrfs_key key;
925 	struct btrfs_key found_key;
926 	u32 new_size = sizeof(*item);
927 	u64 refs;
928 	int ret;
929 
930 	leaf = path->nodes[0];
931 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
932 
933 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
934 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
935 			     struct btrfs_extent_item_v0);
936 	refs = btrfs_extent_refs_v0(leaf, ei0);
937 
938 	if (owner == (u64)-1) {
939 		while (1) {
940 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
941 				ret = btrfs_next_leaf(root, path);
942 				if (ret < 0)
943 					return ret;
944 				BUG_ON(ret > 0);
945 				leaf = path->nodes[0];
946 			}
947 			btrfs_item_key_to_cpu(leaf, &found_key,
948 					      path->slots[0]);
949 			BUG_ON(key.objectid != found_key.objectid);
950 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
951 				path->slots[0]++;
952 				continue;
953 			}
954 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
955 					      struct btrfs_extent_ref_v0);
956 			owner = btrfs_ref_objectid_v0(leaf, ref0);
957 			break;
958 		}
959 	}
960 	btrfs_release_path(path);
961 
962 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
963 		new_size += sizeof(*bi);
964 
965 	new_size -= sizeof(*ei0);
966 	ret = btrfs_search_slot(trans, root, &key, path,
967 				new_size + extra_size, 1);
968 	if (ret < 0)
969 		return ret;
970 	BUG_ON(ret);
971 
972 	ret = btrfs_extend_item(trans, root, path, new_size);
973 
974 	leaf = path->nodes[0];
975 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
976 	btrfs_set_extent_refs(leaf, item, refs);
977 	/* FIXME: get real generation */
978 	btrfs_set_extent_generation(leaf, item, 0);
979 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
980 		btrfs_set_extent_flags(leaf, item,
981 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
982 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
983 		bi = (struct btrfs_tree_block_info *)(item + 1);
984 		/* FIXME: get first key of the block */
985 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
986 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
987 	} else {
988 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
989 	}
990 	btrfs_mark_buffer_dirty(leaf);
991 	return 0;
992 }
993 #endif
994 
995 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
996 {
997 	u32 high_crc = ~(u32)0;
998 	u32 low_crc = ~(u32)0;
999 	__le64 lenum;
1000 
1001 	lenum = cpu_to_le64(root_objectid);
1002 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1003 	lenum = cpu_to_le64(owner);
1004 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1005 	lenum = cpu_to_le64(offset);
1006 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1007 
1008 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1009 }
1010 
1011 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1012 				     struct btrfs_extent_data_ref *ref)
1013 {
1014 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1015 				    btrfs_extent_data_ref_objectid(leaf, ref),
1016 				    btrfs_extent_data_ref_offset(leaf, ref));
1017 }
1018 
1019 static int match_extent_data_ref(struct extent_buffer *leaf,
1020 				 struct btrfs_extent_data_ref *ref,
1021 				 u64 root_objectid, u64 owner, u64 offset)
1022 {
1023 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1024 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1025 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1026 		return 0;
1027 	return 1;
1028 }
1029 
1030 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1031 					   struct btrfs_root *root,
1032 					   struct btrfs_path *path,
1033 					   u64 bytenr, u64 parent,
1034 					   u64 root_objectid,
1035 					   u64 owner, u64 offset)
1036 {
1037 	struct btrfs_key key;
1038 	struct btrfs_extent_data_ref *ref;
1039 	struct extent_buffer *leaf;
1040 	u32 nritems;
1041 	int ret;
1042 	int recow;
1043 	int err = -ENOENT;
1044 
1045 	key.objectid = bytenr;
1046 	if (parent) {
1047 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1048 		key.offset = parent;
1049 	} else {
1050 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1051 		key.offset = hash_extent_data_ref(root_objectid,
1052 						  owner, offset);
1053 	}
1054 again:
1055 	recow = 0;
1056 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1057 	if (ret < 0) {
1058 		err = ret;
1059 		goto fail;
1060 	}
1061 
1062 	if (parent) {
1063 		if (!ret)
1064 			return 0;
1065 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1066 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1067 		btrfs_release_path(path);
1068 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1069 		if (ret < 0) {
1070 			err = ret;
1071 			goto fail;
1072 		}
1073 		if (!ret)
1074 			return 0;
1075 #endif
1076 		goto fail;
1077 	}
1078 
1079 	leaf = path->nodes[0];
1080 	nritems = btrfs_header_nritems(leaf);
1081 	while (1) {
1082 		if (path->slots[0] >= nritems) {
1083 			ret = btrfs_next_leaf(root, path);
1084 			if (ret < 0)
1085 				err = ret;
1086 			if (ret)
1087 				goto fail;
1088 
1089 			leaf = path->nodes[0];
1090 			nritems = btrfs_header_nritems(leaf);
1091 			recow = 1;
1092 		}
1093 
1094 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1095 		if (key.objectid != bytenr ||
1096 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1097 			goto fail;
1098 
1099 		ref = btrfs_item_ptr(leaf, path->slots[0],
1100 				     struct btrfs_extent_data_ref);
1101 
1102 		if (match_extent_data_ref(leaf, ref, root_objectid,
1103 					  owner, offset)) {
1104 			if (recow) {
1105 				btrfs_release_path(path);
1106 				goto again;
1107 			}
1108 			err = 0;
1109 			break;
1110 		}
1111 		path->slots[0]++;
1112 	}
1113 fail:
1114 	return err;
1115 }
1116 
1117 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1118 					   struct btrfs_root *root,
1119 					   struct btrfs_path *path,
1120 					   u64 bytenr, u64 parent,
1121 					   u64 root_objectid, u64 owner,
1122 					   u64 offset, int refs_to_add)
1123 {
1124 	struct btrfs_key key;
1125 	struct extent_buffer *leaf;
1126 	u32 size;
1127 	u32 num_refs;
1128 	int ret;
1129 
1130 	key.objectid = bytenr;
1131 	if (parent) {
1132 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1133 		key.offset = parent;
1134 		size = sizeof(struct btrfs_shared_data_ref);
1135 	} else {
1136 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1137 		key.offset = hash_extent_data_ref(root_objectid,
1138 						  owner, offset);
1139 		size = sizeof(struct btrfs_extent_data_ref);
1140 	}
1141 
1142 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1143 	if (ret && ret != -EEXIST)
1144 		goto fail;
1145 
1146 	leaf = path->nodes[0];
1147 	if (parent) {
1148 		struct btrfs_shared_data_ref *ref;
1149 		ref = btrfs_item_ptr(leaf, path->slots[0],
1150 				     struct btrfs_shared_data_ref);
1151 		if (ret == 0) {
1152 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1153 		} else {
1154 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1155 			num_refs += refs_to_add;
1156 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1157 		}
1158 	} else {
1159 		struct btrfs_extent_data_ref *ref;
1160 		while (ret == -EEXIST) {
1161 			ref = btrfs_item_ptr(leaf, path->slots[0],
1162 					     struct btrfs_extent_data_ref);
1163 			if (match_extent_data_ref(leaf, ref, root_objectid,
1164 						  owner, offset))
1165 				break;
1166 			btrfs_release_path(path);
1167 			key.offset++;
1168 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1169 						      size);
1170 			if (ret && ret != -EEXIST)
1171 				goto fail;
1172 
1173 			leaf = path->nodes[0];
1174 		}
1175 		ref = btrfs_item_ptr(leaf, path->slots[0],
1176 				     struct btrfs_extent_data_ref);
1177 		if (ret == 0) {
1178 			btrfs_set_extent_data_ref_root(leaf, ref,
1179 						       root_objectid);
1180 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1181 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1182 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1183 		} else {
1184 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1185 			num_refs += refs_to_add;
1186 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1187 		}
1188 	}
1189 	btrfs_mark_buffer_dirty(leaf);
1190 	ret = 0;
1191 fail:
1192 	btrfs_release_path(path);
1193 	return ret;
1194 }
1195 
1196 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1197 					   struct btrfs_root *root,
1198 					   struct btrfs_path *path,
1199 					   int refs_to_drop)
1200 {
1201 	struct btrfs_key key;
1202 	struct btrfs_extent_data_ref *ref1 = NULL;
1203 	struct btrfs_shared_data_ref *ref2 = NULL;
1204 	struct extent_buffer *leaf;
1205 	u32 num_refs = 0;
1206 	int ret = 0;
1207 
1208 	leaf = path->nodes[0];
1209 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1210 
1211 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1212 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1213 				      struct btrfs_extent_data_ref);
1214 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1215 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1216 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1217 				      struct btrfs_shared_data_ref);
1218 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1219 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1220 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1221 		struct btrfs_extent_ref_v0 *ref0;
1222 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1223 				      struct btrfs_extent_ref_v0);
1224 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1225 #endif
1226 	} else {
1227 		BUG();
1228 	}
1229 
1230 	BUG_ON(num_refs < refs_to_drop);
1231 	num_refs -= refs_to_drop;
1232 
1233 	if (num_refs == 0) {
1234 		ret = btrfs_del_item(trans, root, path);
1235 	} else {
1236 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1237 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1238 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1239 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1240 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1241 		else {
1242 			struct btrfs_extent_ref_v0 *ref0;
1243 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1244 					struct btrfs_extent_ref_v0);
1245 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1246 		}
1247 #endif
1248 		btrfs_mark_buffer_dirty(leaf);
1249 	}
1250 	return ret;
1251 }
1252 
1253 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1254 					  struct btrfs_path *path,
1255 					  struct btrfs_extent_inline_ref *iref)
1256 {
1257 	struct btrfs_key key;
1258 	struct extent_buffer *leaf;
1259 	struct btrfs_extent_data_ref *ref1;
1260 	struct btrfs_shared_data_ref *ref2;
1261 	u32 num_refs = 0;
1262 
1263 	leaf = path->nodes[0];
1264 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1265 	if (iref) {
1266 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1267 		    BTRFS_EXTENT_DATA_REF_KEY) {
1268 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1269 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1270 		} else {
1271 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1272 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1273 		}
1274 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1275 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1276 				      struct btrfs_extent_data_ref);
1277 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1278 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1279 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1280 				      struct btrfs_shared_data_ref);
1281 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1282 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1283 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1284 		struct btrfs_extent_ref_v0 *ref0;
1285 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1286 				      struct btrfs_extent_ref_v0);
1287 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1288 #endif
1289 	} else {
1290 		WARN_ON(1);
1291 	}
1292 	return num_refs;
1293 }
1294 
1295 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1296 					  struct btrfs_root *root,
1297 					  struct btrfs_path *path,
1298 					  u64 bytenr, u64 parent,
1299 					  u64 root_objectid)
1300 {
1301 	struct btrfs_key key;
1302 	int ret;
1303 
1304 	key.objectid = bytenr;
1305 	if (parent) {
1306 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1307 		key.offset = parent;
1308 	} else {
1309 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1310 		key.offset = root_objectid;
1311 	}
1312 
1313 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1314 	if (ret > 0)
1315 		ret = -ENOENT;
1316 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1317 	if (ret == -ENOENT && parent) {
1318 		btrfs_release_path(path);
1319 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1320 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1321 		if (ret > 0)
1322 			ret = -ENOENT;
1323 	}
1324 #endif
1325 	return ret;
1326 }
1327 
1328 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1329 					  struct btrfs_root *root,
1330 					  struct btrfs_path *path,
1331 					  u64 bytenr, u64 parent,
1332 					  u64 root_objectid)
1333 {
1334 	struct btrfs_key key;
1335 	int ret;
1336 
1337 	key.objectid = bytenr;
1338 	if (parent) {
1339 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1340 		key.offset = parent;
1341 	} else {
1342 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1343 		key.offset = root_objectid;
1344 	}
1345 
1346 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1347 	btrfs_release_path(path);
1348 	return ret;
1349 }
1350 
1351 static inline int extent_ref_type(u64 parent, u64 owner)
1352 {
1353 	int type;
1354 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1355 		if (parent > 0)
1356 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1357 		else
1358 			type = BTRFS_TREE_BLOCK_REF_KEY;
1359 	} else {
1360 		if (parent > 0)
1361 			type = BTRFS_SHARED_DATA_REF_KEY;
1362 		else
1363 			type = BTRFS_EXTENT_DATA_REF_KEY;
1364 	}
1365 	return type;
1366 }
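/*
 * Sketch of the mapping extent_ref_type() above implements (the owner
 * values are hypothetical): tree blocks have an owner below
 * BTRFS_FIRST_FREE_OBJECTID, file data is owned by an inode number at or
 * above it, and a non-zero parent selects the shared variants:
 *
 *	extent_ref_type(0, 2)        == BTRFS_TREE_BLOCK_REF_KEY
 *	extent_ref_type(parent, 2)   == BTRFS_SHARED_BLOCK_REF_KEY
 *	extent_ref_type(0, 257)      == BTRFS_EXTENT_DATA_REF_KEY
 *	extent_ref_type(parent, 257) == BTRFS_SHARED_DATA_REF_KEY
 */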
1367 
1368 static int find_next_key(struct btrfs_path *path, int level,
1369 			 struct btrfs_key *key)
1370 
1371 {
1372 	for (; level < BTRFS_MAX_LEVEL; level++) {
1373 		if (!path->nodes[level])
1374 			break;
1375 		if (path->slots[level] + 1 >=
1376 		    btrfs_header_nritems(path->nodes[level]))
1377 			continue;
1378 		if (level == 0)
1379 			btrfs_item_key_to_cpu(path->nodes[level], key,
1380 					      path->slots[level] + 1);
1381 		else
1382 			btrfs_node_key_to_cpu(path->nodes[level], key,
1383 					      path->slots[level] + 1);
1384 		return 0;
1385 	}
1386 	return 1;
1387 }
1388 
1389 /*
1390  * look for inline back ref. if back ref is found, *ref_ret is set
1391  * to the address of inline back ref, and 0 is returned.
1392  *
1393  * if back ref isn't found, *ref_ret is set to the address where it
1394  * should be inserted, and -ENOENT is returned.
1395  *
1396  * if insert is true and there are too many inline back refs, the path
1397  * points to the extent item, and -EAGAIN is returned.
1398  *
1399  * NOTE: inline back refs are ordered in the same way that back ref
1400  *	 items in the tree are ordered.
1401  */
1402 static noinline_for_stack
1403 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1404 				 struct btrfs_root *root,
1405 				 struct btrfs_path *path,
1406 				 struct btrfs_extent_inline_ref **ref_ret,
1407 				 u64 bytenr, u64 num_bytes,
1408 				 u64 parent, u64 root_objectid,
1409 				 u64 owner, u64 offset, int insert)
1410 {
1411 	struct btrfs_key key;
1412 	struct extent_buffer *leaf;
1413 	struct btrfs_extent_item *ei;
1414 	struct btrfs_extent_inline_ref *iref;
1415 	u64 flags;
1416 	u64 item_size;
1417 	unsigned long ptr;
1418 	unsigned long end;
1419 	int extra_size;
1420 	int type;
1421 	int want;
1422 	int ret;
1423 	int err = 0;
1424 
1425 	key.objectid = bytenr;
1426 	key.type = BTRFS_EXTENT_ITEM_KEY;
1427 	key.offset = num_bytes;
1428 
1429 	want = extent_ref_type(parent, owner);
1430 	if (insert) {
1431 		extra_size = btrfs_extent_inline_ref_size(want);
1432 		path->keep_locks = 1;
1433 	} else
1434 		extra_size = -1;
1435 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1436 	if (ret < 0) {
1437 		err = ret;
1438 		goto out;
1439 	}
1440 	BUG_ON(ret);
1441 
1442 	leaf = path->nodes[0];
1443 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1444 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1445 	if (item_size < sizeof(*ei)) {
1446 		if (!insert) {
1447 			err = -ENOENT;
1448 			goto out;
1449 		}
1450 		ret = convert_extent_item_v0(trans, root, path, owner,
1451 					     extra_size);
1452 		if (ret < 0) {
1453 			err = ret;
1454 			goto out;
1455 		}
1456 		leaf = path->nodes[0];
1457 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1458 	}
1459 #endif
1460 	BUG_ON(item_size < sizeof(*ei));
1461 
1462 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1463 	flags = btrfs_extent_flags(leaf, ei);
1464 
1465 	ptr = (unsigned long)(ei + 1);
1466 	end = (unsigned long)ei + item_size;
1467 
1468 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1469 		ptr += sizeof(struct btrfs_tree_block_info);
1470 		BUG_ON(ptr > end);
1471 	} else {
1472 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1473 	}
1474 
1475 	err = -ENOENT;
1476 	while (1) {
1477 		if (ptr >= end) {
1478 			WARN_ON(ptr > end);
1479 			break;
1480 		}
1481 		iref = (struct btrfs_extent_inline_ref *)ptr;
1482 		type = btrfs_extent_inline_ref_type(leaf, iref);
1483 		if (want < type)
1484 			break;
1485 		if (want > type) {
1486 			ptr += btrfs_extent_inline_ref_size(type);
1487 			continue;
1488 		}
1489 
1490 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1491 			struct btrfs_extent_data_ref *dref;
1492 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1493 			if (match_extent_data_ref(leaf, dref, root_objectid,
1494 						  owner, offset)) {
1495 				err = 0;
1496 				break;
1497 			}
1498 			if (hash_extent_data_ref_item(leaf, dref) <
1499 			    hash_extent_data_ref(root_objectid, owner, offset))
1500 				break;
1501 		} else {
1502 			u64 ref_offset;
1503 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1504 			if (parent > 0) {
1505 				if (parent == ref_offset) {
1506 					err = 0;
1507 					break;
1508 				}
1509 				if (ref_offset < parent)
1510 					break;
1511 			} else {
1512 				if (root_objectid == ref_offset) {
1513 					err = 0;
1514 					break;
1515 				}
1516 				if (ref_offset < root_objectid)
1517 					break;
1518 			}
1519 		}
1520 		ptr += btrfs_extent_inline_ref_size(type);
1521 	}
1522 	if (err == -ENOENT && insert) {
1523 		if (item_size + extra_size >=
1524 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1525 			err = -EAGAIN;
1526 			goto out;
1527 		}
1528 		/*
1529 		 * To add a new inline back ref, we have to make sure
1530 		 * there is no corresponding back ref item.
1531 		 * For simplicity, we just do not add a new inline back
1532 		 * ref if there is any kind of item for this block.
1533 		 */
1534 		if (find_next_key(path, 0, &key) == 0 &&
1535 		    key.objectid == bytenr &&
1536 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1537 			err = -EAGAIN;
1538 			goto out;
1539 		}
1540 	}
1541 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1542 out:
1543 	if (insert) {
1544 		path->keep_locks = 0;
1545 		btrfs_unlock_up_safe(path, 1);
1546 	}
1547 	return err;
1548 }
1549 
1550 /*
1551  * helper to add new inline back ref
1552  */
1553 static noinline_for_stack
1554 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1555 				struct btrfs_root *root,
1556 				struct btrfs_path *path,
1557 				struct btrfs_extent_inline_ref *iref,
1558 				u64 parent, u64 root_objectid,
1559 				u64 owner, u64 offset, int refs_to_add,
1560 				struct btrfs_delayed_extent_op *extent_op)
1561 {
1562 	struct extent_buffer *leaf;
1563 	struct btrfs_extent_item *ei;
1564 	unsigned long ptr;
1565 	unsigned long end;
1566 	unsigned long item_offset;
1567 	u64 refs;
1568 	int size;
1569 	int type;
1570 	int ret;
1571 
1572 	leaf = path->nodes[0];
1573 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1574 	item_offset = (unsigned long)iref - (unsigned long)ei;
1575 
1576 	type = extent_ref_type(parent, owner);
1577 	size = btrfs_extent_inline_ref_size(type);
1578 
1579 	ret = btrfs_extend_item(trans, root, path, size);
1580 
1581 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1582 	refs = btrfs_extent_refs(leaf, ei);
1583 	refs += refs_to_add;
1584 	btrfs_set_extent_refs(leaf, ei, refs);
1585 	if (extent_op)
1586 		__run_delayed_extent_op(extent_op, leaf, ei);
1587 
1588 	ptr = (unsigned long)ei + item_offset;
1589 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1590 	if (ptr < end - size)
1591 		memmove_extent_buffer(leaf, ptr + size, ptr,
1592 				      end - size - ptr);
1593 
1594 	iref = (struct btrfs_extent_inline_ref *)ptr;
1595 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1596 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1597 		struct btrfs_extent_data_ref *dref;
1598 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1599 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1600 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1601 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1602 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1603 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1604 		struct btrfs_shared_data_ref *sref;
1605 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1606 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1607 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1608 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1609 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1610 	} else {
1611 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1612 	}
1613 	btrfs_mark_buffer_dirty(leaf);
1614 	return 0;
1615 }
1616 
1617 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1618 				 struct btrfs_root *root,
1619 				 struct btrfs_path *path,
1620 				 struct btrfs_extent_inline_ref **ref_ret,
1621 				 u64 bytenr, u64 num_bytes, u64 parent,
1622 				 u64 root_objectid, u64 owner, u64 offset)
1623 {
1624 	int ret;
1625 
1626 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1627 					   bytenr, num_bytes, parent,
1628 					   root_objectid, owner, offset, 0);
1629 	if (ret != -ENOENT)
1630 		return ret;
1631 
1632 	btrfs_release_path(path);
1633 	*ref_ret = NULL;
1634 
1635 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1636 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1637 					    root_objectid);
1638 	} else {
1639 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1640 					     root_objectid, owner, offset);
1641 	}
1642 	return ret;
1643 }
1644 
1645 /*
1646  * helper to update/remove inline back ref
1647  */
1648 static noinline_for_stack
1649 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1650 				 struct btrfs_root *root,
1651 				 struct btrfs_path *path,
1652 				 struct btrfs_extent_inline_ref *iref,
1653 				 int refs_to_mod,
1654 				 struct btrfs_delayed_extent_op *extent_op)
1655 {
1656 	struct extent_buffer *leaf;
1657 	struct btrfs_extent_item *ei;
1658 	struct btrfs_extent_data_ref *dref = NULL;
1659 	struct btrfs_shared_data_ref *sref = NULL;
1660 	unsigned long ptr;
1661 	unsigned long end;
1662 	u32 item_size;
1663 	int size;
1664 	int type;
1665 	int ret;
1666 	u64 refs;
1667 
1668 	leaf = path->nodes[0];
1669 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1670 	refs = btrfs_extent_refs(leaf, ei);
1671 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1672 	refs += refs_to_mod;
1673 	btrfs_set_extent_refs(leaf, ei, refs);
1674 	if (extent_op)
1675 		__run_delayed_extent_op(extent_op, leaf, ei);
1676 
1677 	type = btrfs_extent_inline_ref_type(leaf, iref);
1678 
1679 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1680 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1681 		refs = btrfs_extent_data_ref_count(leaf, dref);
1682 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1683 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1684 		refs = btrfs_shared_data_ref_count(leaf, sref);
1685 	} else {
1686 		refs = 1;
1687 		BUG_ON(refs_to_mod != -1);
1688 	}
1689 
1690 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1691 	refs += refs_to_mod;
1692 
1693 	if (refs > 0) {
1694 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1695 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1696 		else
1697 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1698 	} else {
1699 		size =  btrfs_extent_inline_ref_size(type);
1700 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1701 		ptr = (unsigned long)iref;
1702 		end = (unsigned long)ei + item_size;
1703 		if (ptr + size < end)
1704 			memmove_extent_buffer(leaf, ptr, ptr + size,
1705 					      end - ptr - size);
1706 		item_size -= size;
1707 		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1708 	}
1709 	btrfs_mark_buffer_dirty(leaf);
1710 	return 0;
1711 }
1712 
1713 static noinline_for_stack
1714 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1715 				 struct btrfs_root *root,
1716 				 struct btrfs_path *path,
1717 				 u64 bytenr, u64 num_bytes, u64 parent,
1718 				 u64 root_objectid, u64 owner,
1719 				 u64 offset, int refs_to_add,
1720 				 struct btrfs_delayed_extent_op *extent_op)
1721 {
1722 	struct btrfs_extent_inline_ref *iref;
1723 	int ret;
1724 
1725 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1726 					   bytenr, num_bytes, parent,
1727 					   root_objectid, owner, offset, 1);
1728 	if (ret == 0) {
1729 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1730 		ret = update_inline_extent_backref(trans, root, path, iref,
1731 						   refs_to_add, extent_op);
1732 	} else if (ret == -ENOENT) {
1733 		ret = setup_inline_extent_backref(trans, root, path, iref,
1734 						  parent, root_objectid,
1735 						  owner, offset, refs_to_add,
1736 						  extent_op);
1737 	}
1738 	return ret;
1739 }
1740 
1741 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1742 				 struct btrfs_root *root,
1743 				 struct btrfs_path *path,
1744 				 u64 bytenr, u64 parent, u64 root_objectid,
1745 				 u64 owner, u64 offset, int refs_to_add)
1746 {
1747 	int ret;
1748 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1749 		BUG_ON(refs_to_add != 1);
1750 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1751 					    parent, root_objectid);
1752 	} else {
1753 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1754 					     parent, root_objectid,
1755 					     owner, offset, refs_to_add);
1756 	}
1757 	return ret;
1758 }
1759 
1760 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1761 				 struct btrfs_root *root,
1762 				 struct btrfs_path *path,
1763 				 struct btrfs_extent_inline_ref *iref,
1764 				 int refs_to_drop, int is_data)
1765 {
1766 	int ret;
1767 
1768 	BUG_ON(!is_data && refs_to_drop != 1);
1769 	if (iref) {
1770 		ret = update_inline_extent_backref(trans, root, path, iref,
1771 						   -refs_to_drop, NULL);
1772 	} else if (is_data) {
1773 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1774 	} else {
1775 		ret = btrfs_del_item(trans, root, path);
1776 	}
1777 	return ret;
1778 }
1779 
1780 static int btrfs_issue_discard(struct block_device *bdev,
1781 				u64 start, u64 len)
1782 {
1783 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1784 }
1785 
1786 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1787 				u64 num_bytes, u64 *actual_bytes)
1788 {
1789 	int ret;
1790 	u64 discarded_bytes = 0;
1791 	struct btrfs_bio *bbio = NULL;
1792 
1794 	/* Tell the block device(s) that the sectors can be discarded */
1795 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1796 			      bytenr, &num_bytes, &bbio, 0);
1797 	if (!ret) {
1798 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1799 		int i;
1800 
1802 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1803 			if (!stripe->dev->can_discard)
1804 				continue;
1805 
1806 			ret = btrfs_issue_discard(stripe->dev->bdev,
1807 						  stripe->physical,
1808 						  stripe->length);
1809 			if (!ret)
1810 				discarded_bytes += stripe->length;
1811 			else if (ret != -EOPNOTSUPP)
1812 				break;
1813 
1814 			/*
1815 			 * Just in case we get back EOPNOTSUPP for some reason,
1816 			 * ignore the return value so we don't break anyone
1817 			 * calling btrfs_discard_extent().
1818 			 */
1819 			ret = 0;
1820 		}
1821 		kfree(bbio);
1822 	}
1823 
1824 	if (actual_bytes)
1825 		*actual_bytes = discarded_bytes;
1826 
1828 	return ret;
1829 }
1830 
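/*
 * Queue a delayed ref that adds one reference to an extent.  Tree blocks
 * pass their level in @owner; data extents pass the owning inode objectid
 * and file offset.
 */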
1831 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1832 			 struct btrfs_root *root,
1833 			 u64 bytenr, u64 num_bytes, u64 parent,
1834 			 u64 root_objectid, u64 owner, u64 offset)
1835 {
1836 	int ret;
1837 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1838 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1839 
1840 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1841 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1842 					parent, root_objectid, (int)owner,
1843 					BTRFS_ADD_DELAYED_REF, NULL);
1844 	} else {
1845 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1846 					parent, root_objectid, owner, offset,
1847 					BTRFS_ADD_DELAYED_REF, NULL);
1848 	}
1849 	return ret;
1850 }
1851 
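/*
 * Add @refs_to_add references to an existing extent.  -EAGAIN from the
 * inline path means the extent item was found but the new ref could not be
 * added inline, so the ref count is bumped on the extent item here and a
 * separate keyed backref is inserted below.
 */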
1852 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1853 				  struct btrfs_root *root,
1854 				  u64 bytenr, u64 num_bytes,
1855 				  u64 parent, u64 root_objectid,
1856 				  u64 owner, u64 offset, int refs_to_add,
1857 				  struct btrfs_delayed_extent_op *extent_op)
1858 {
1859 	struct btrfs_path *path;
1860 	struct extent_buffer *leaf;
1861 	struct btrfs_extent_item *item;
1862 	u64 refs;
1863 	int ret;
1864 	int err = 0;
1865 
1866 	path = btrfs_alloc_path();
1867 	if (!path)
1868 		return -ENOMEM;
1869 
1870 	path->reada = 1;
1871 	path->leave_spinning = 1;
1872 	/* this will set up the path even if it fails to insert the back ref */
1873 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1874 					   path, bytenr, num_bytes, parent,
1875 					   root_objectid, owner, offset,
1876 					   refs_to_add, extent_op);
1877 	if (ret == 0)
1878 		goto out;
1879 
1880 	if (ret != -EAGAIN) {
1881 		err = ret;
1882 		goto out;
1883 	}
1884 
1885 	leaf = path->nodes[0];
1886 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1887 	refs = btrfs_extent_refs(leaf, item);
1888 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1889 	if (extent_op)
1890 		__run_delayed_extent_op(extent_op, leaf, item);
1891 
1892 	btrfs_mark_buffer_dirty(leaf);
1893 	btrfs_release_path(path);
1894 
1895 	path->reada = 1;
1896 	path->leave_spinning = 1;
1897 
1898 	/* now insert the actual backref */
1899 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1900 				    path, bytenr, parent, root_objectid,
1901 				    owner, offset, refs_to_add);
1902 	BUG_ON(ret);
1903 out:
1904 	btrfs_free_path(path);
1905 	return err;
1906 }
1907 
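/*
 * Apply a single delayed data ref.  The reserved-insert case writes the
 * newly allocated file extent item; otherwise one set of references is
 * added to or dropped from the existing extent.
 */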
1908 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1909 				struct btrfs_root *root,
1910 				struct btrfs_delayed_ref_node *node,
1911 				struct btrfs_delayed_extent_op *extent_op,
1912 				int insert_reserved)
1913 {
1914 	int ret = 0;
1915 	struct btrfs_delayed_data_ref *ref;
1916 	struct btrfs_key ins;
1917 	u64 parent = 0;
1918 	u64 ref_root = 0;
1919 	u64 flags = 0;
1920 
1921 	ins.objectid = node->bytenr;
1922 	ins.offset = node->num_bytes;
1923 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1924 
1925 	ref = btrfs_delayed_node_to_data_ref(node);
1926 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1927 		parent = ref->parent;
1928 	else
1929 		ref_root = ref->root;
1930 
1931 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1932 		if (extent_op) {
1933 			BUG_ON(extent_op->update_key);
1934 			flags |= extent_op->flags_to_set;
1935 		}
1936 		ret = alloc_reserved_file_extent(trans, root,
1937 						 parent, ref_root, flags,
1938 						 ref->objectid, ref->offset,
1939 						 &ins, node->ref_mod);
1940 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1941 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1942 					     node->num_bytes, parent,
1943 					     ref_root, ref->objectid,
1944 					     ref->offset, node->ref_mod,
1945 					     extent_op);
1946 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1947 		ret = __btrfs_free_extent(trans, root, node->bytenr,
1948 					  node->num_bytes, parent,
1949 					  ref_root, ref->objectid,
1950 					  ref->offset, node->ref_mod,
1951 					  extent_op);
1952 	} else {
1953 		BUG();
1954 	}
1955 	return ret;
1956 }
1957 
1958 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1959 				    struct extent_buffer *leaf,
1960 				    struct btrfs_extent_item *ei)
1961 {
1962 	u64 flags = btrfs_extent_flags(leaf, ei);
1963 	if (extent_op->update_flags) {
1964 		flags |= extent_op->flags_to_set;
1965 		btrfs_set_extent_flags(leaf, ei, flags);
1966 	}
1967 
1968 	if (extent_op->update_key) {
1969 		struct btrfs_tree_block_info *bi;
1970 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1971 		bi = (struct btrfs_tree_block_info *)(ei + 1);
1972 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1973 	}
1974 }
1975 
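/*
 * Apply a delayed extent op (flags and/or tree block key update) directly
 * to the extent item in the extent tree.
 */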
1976 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1977 				 struct btrfs_root *root,
1978 				 struct btrfs_delayed_ref_node *node,
1979 				 struct btrfs_delayed_extent_op *extent_op)
1980 {
1981 	struct btrfs_key key;
1982 	struct btrfs_path *path;
1983 	struct btrfs_extent_item *ei;
1984 	struct extent_buffer *leaf;
1985 	u32 item_size;
1986 	int ret;
1987 	int err = 0;
1988 
1989 	path = btrfs_alloc_path();
1990 	if (!path)
1991 		return -ENOMEM;
1992 
1993 	key.objectid = node->bytenr;
1994 	key.type = BTRFS_EXTENT_ITEM_KEY;
1995 	key.offset = node->num_bytes;
1996 
1997 	path->reada = 1;
1998 	path->leave_spinning = 1;
1999 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2000 				path, 0, 1);
2001 	if (ret < 0) {
2002 		err = ret;
2003 		goto out;
2004 	}
2005 	if (ret > 0) {
2006 		err = -EIO;
2007 		goto out;
2008 	}
2009 
2010 	leaf = path->nodes[0];
2011 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2012 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2013 	if (item_size < sizeof(*ei)) {
2014 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2015 					     path, (u64)-1, 0);
2016 		if (ret < 0) {
2017 			err = ret;
2018 			goto out;
2019 		}
2020 		leaf = path->nodes[0];
2021 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2022 	}
2023 #endif
2024 	BUG_ON(item_size < sizeof(*ei));
2025 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2026 	__run_delayed_extent_op(extent_op, leaf, ei);
2027 
2028 	btrfs_mark_buffer_dirty(leaf);
2029 out:
2030 	btrfs_free_path(path);
2031 	return err;
2032 }
2033 
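/*
 * Apply a single delayed tree block ref.  The reserved-insert case writes
 * the new block's key and flags from the attached extent op; otherwise a
 * reference is added to or dropped from the existing block.
 */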
2034 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2035 				struct btrfs_root *root,
2036 				struct btrfs_delayed_ref_node *node,
2037 				struct btrfs_delayed_extent_op *extent_op,
2038 				int insert_reserved)
2039 {
2040 	int ret = 0;
2041 	struct btrfs_delayed_tree_ref *ref;
2042 	struct btrfs_key ins;
2043 	u64 parent = 0;
2044 	u64 ref_root = 0;
2045 
2046 	ins.objectid = node->bytenr;
2047 	ins.offset = node->num_bytes;
2048 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2049 
2050 	ref = btrfs_delayed_node_to_tree_ref(node);
2051 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2052 		parent = ref->parent;
2053 	else
2054 		ref_root = ref->root;
2055 
2056 	BUG_ON(node->ref_mod != 1);
2057 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2058 		BUG_ON(!extent_op || !extent_op->update_flags ||
2059 		       !extent_op->update_key);
2060 		ret = alloc_reserved_tree_block(trans, root,
2061 						parent, ref_root,
2062 						extent_op->flags_to_set,
2063 						&extent_op->key,
2064 						ref->level, &ins);
2065 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2066 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2067 					     node->num_bytes, parent, ref_root,
2068 					     ref->level, 0, 1, extent_op);
2069 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2070 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2071 					  node->num_bytes, parent, ref_root,
2072 					  ref->level, 0, 1, extent_op);
2073 	} else {
2074 		BUG();
2075 	}
2076 	return ret;
2077 }
2078 
2079 /* helper function to actually process a single delayed ref entry */
2080 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2081 			       struct btrfs_root *root,
2082 			       struct btrfs_delayed_ref_node *node,
2083 			       struct btrfs_delayed_extent_op *extent_op,
2084 			       int insert_reserved)
2085 {
2086 	int ret;
2087 	if (btrfs_delayed_ref_is_head(node)) {
2088 		struct btrfs_delayed_ref_head *head;
2089 		/*
2090 		 * we've hit the end of the chain and we were supposed
2091 		 * to insert this extent into the tree.  But, it got
2092 		 * deleted before we ever needed to insert it, so all
2093 		 * we have to do is clean up the accounting
2094 		 */
2095 		BUG_ON(extent_op);
2096 		head = btrfs_delayed_node_to_head(node);
2097 		if (insert_reserved) {
2098 			btrfs_pin_extent(root, node->bytenr,
2099 					 node->num_bytes, 1);
2100 			if (head->is_data) {
2101 				ret = btrfs_del_csums(trans, root,
2102 						      node->bytenr,
2103 						      node->num_bytes);
2104 				BUG_ON(ret);
2105 			}
2106 		}
2107 		mutex_unlock(&head->mutex);
2108 		return 0;
2109 	}
2110 
2111 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2112 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2113 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2114 					   insert_reserved);
2115 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2116 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2117 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2118 					   insert_reserved);
2119 	else
2120 		BUG();
2121 	return ret;
2122 }
2123 
2124 static noinline struct btrfs_delayed_ref_node *
2125 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2126 {
2127 	struct rb_node *node;
2128 	struct btrfs_delayed_ref_node *ref;
2129 	int action = BTRFS_ADD_DELAYED_REF;
2130 again:
2131 	/*
2132 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2133 	 * This prevents the ref count from going down to zero while
2134 	 * there are still pending delayed refs.
2135 	 */
2136 	node = rb_prev(&head->node.rb_node);
2137 	while (1) {
2138 		if (!node)
2139 			break;
2140 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2141 				rb_node);
2142 		if (ref->bytenr != head->node.bytenr)
2143 			break;
2144 		if (ref->action == action)
2145 			return ref;
2146 		node = rb_prev(node);
2147 	}
2148 	if (action == BTRFS_ADD_DELAYED_REF) {
2149 		action = BTRFS_DROP_DELAYED_REF;
2150 		goto again;
2151 	}
2152 	return NULL;
2153 }
2154 
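/*
 * Run the delayed refs for every head on @cluster.  Called with
 * delayed_refs->lock held; the lock is dropped while each ref is processed
 * and re-taken before returning.  Returns the number of refs handled.
 */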
2155 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2156 				       struct btrfs_root *root,
2157 				       struct list_head *cluster)
2158 {
2159 	struct btrfs_delayed_ref_root *delayed_refs;
2160 	struct btrfs_delayed_ref_node *ref;
2161 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2162 	struct btrfs_delayed_extent_op *extent_op;
2163 	int ret;
2164 	int count = 0;
2165 	int must_insert_reserved = 0;
2166 
2167 	delayed_refs = &trans->transaction->delayed_refs;
2168 	while (1) {
2169 		if (!locked_ref) {
2170 			/* pick a new head ref from the cluster list */
2171 			if (list_empty(cluster))
2172 				break;
2173 
2174 			locked_ref = list_entry(cluster->next,
2175 				     struct btrfs_delayed_ref_head, cluster);
2176 
2177 			/* grab the lock that says we are going to process
2178 			 * all the refs for this head */
2179 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2180 
2181 			/*
2182 			 * we may have dropped the spin lock to get the head
2183 			 * mutex lock, and that might have given someone else
2184 			 * time to free the head.  If that's true, it has been
2185 			 * removed from our list and we can move on.
2186 			 */
2187 			if (ret == -EAGAIN) {
2188 				locked_ref = NULL;
2189 				count++;
2190 				continue;
2191 			}
2192 		}
2193 
2194 		/*
2195 		 * record the must insert reserved flag before we
2196 		 * drop the spin lock.
2197 		 */
2198 		must_insert_reserved = locked_ref->must_insert_reserved;
2199 		locked_ref->must_insert_reserved = 0;
2200 
2201 		extent_op = locked_ref->extent_op;
2202 		locked_ref->extent_op = NULL;
2203 
2204 		/*
2205 		 * locked_ref is the head node, so we have to go one
2206 		 * node back for any delayed ref updates
2207 		 */
2208 		ref = select_delayed_ref(locked_ref);
2209 		if (!ref) {
2210 			/* All delayed refs have been processed; go ahead
2211 			 * and send the head node to run_one_delayed_ref
2212 			 * so that any accounting fixes can happen.
2213 			 */
2214 			ref = &locked_ref->node;
2215 
2216 			if (extent_op && must_insert_reserved) {
2217 				kfree(extent_op);
2218 				extent_op = NULL;
2219 			}
2220 
2221 			if (extent_op) {
2222 				spin_unlock(&delayed_refs->lock);
2223 
2224 				ret = run_delayed_extent_op(trans, root,
2225 							    ref, extent_op);
2226 				BUG_ON(ret);
2227 				kfree(extent_op);
2228 
2229 				cond_resched();
2230 				spin_lock(&delayed_refs->lock);
2231 				continue;
2232 			}
2233 
2234 			list_del_init(&locked_ref->cluster);
2235 			locked_ref = NULL;
2236 		}
2237 
2238 		ref->in_tree = 0;
2239 		rb_erase(&ref->rb_node, &delayed_refs->root);
2240 		delayed_refs->num_entries--;
2241 
2242 		spin_unlock(&delayed_refs->lock);
2243 
2244 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2245 					  must_insert_reserved);
2246 		BUG_ON(ret);
2247 
2248 		btrfs_put_delayed_ref(ref);
2249 		kfree(extent_op);
2250 		count++;
2251 
2252 		cond_resched();
2253 		spin_lock(&delayed_refs->lock);
2254 	}
2255 	return count;
2256 }
2257 
2258 /*
2259  * this starts processing the delayed reference count updates and
2260  * extent insertions we have queued up so far.  count can be
2261  * 0, which means to process everything in the tree at the start
2262  * of the run (but not newly added entries), or it can be some target
2263  * number you'd like to process.
2264  */
2265 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2266 			   struct btrfs_root *root, unsigned long count)
2267 {
2268 	struct rb_node *node;
2269 	struct btrfs_delayed_ref_root *delayed_refs;
2270 	struct btrfs_delayed_ref_node *ref;
2271 	struct list_head cluster;
2272 	int ret;
2273 	int run_all = count == (unsigned long)-1;
2274 	int run_most = 0;
2275 
2276 	if (root == root->fs_info->extent_root)
2277 		root = root->fs_info->tree_root;
2278 
2279 	delayed_refs = &trans->transaction->delayed_refs;
2280 	INIT_LIST_HEAD(&cluster);
2281 again:
2282 	spin_lock(&delayed_refs->lock);
2283 	if (count == 0) {
2284 		count = delayed_refs->num_entries * 2;
2285 		run_most = 1;
2286 	}
2287 	while (1) {
2288 		if (!(run_all || run_most) &&
2289 		    delayed_refs->num_heads_ready < 64)
2290 			break;
2291 
2292 		/*
2293 		 * go find something we can process in the rbtree.  We start at
2294 		 * the beginning of the tree, and then build a cluster
2295 		 * of refs to process starting at the first one we are able to
2296 		 * lock
2297 		 */
2298 		ret = btrfs_find_ref_cluster(trans, &cluster,
2299 					     delayed_refs->run_delayed_start);
2300 		if (ret)
2301 			break;
2302 
2303 		ret = run_clustered_refs(trans, root, &cluster);
2304 		BUG_ON(ret < 0);
2305 
2306 		count -= min_t(unsigned long, ret, count);
2307 
2308 		if (count == 0)
2309 			break;
2310 	}
2311 
2312 	if (run_all) {
2313 		node = rb_first(&delayed_refs->root);
2314 		if (!node)
2315 			goto out;
2316 		count = (unsigned long)-1;
2317 
2318 		while (node) {
2319 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2320 				       rb_node);
2321 			if (btrfs_delayed_ref_is_head(ref)) {
2322 				struct btrfs_delayed_ref_head *head;
2323 
2324 				head = btrfs_delayed_node_to_head(ref);
2325 				atomic_inc(&ref->refs);
2326 
2327 				spin_unlock(&delayed_refs->lock);
2328 				/*
2329 				 * Mutex was contended, block until it's
2330 				 * released and try again
2331 				 */
2332 				mutex_lock(&head->mutex);
2333 				mutex_unlock(&head->mutex);
2334 
2335 				btrfs_put_delayed_ref(ref);
2336 				cond_resched();
2337 				goto again;
2338 			}
2339 			node = rb_next(node);
2340 		}
2341 		spin_unlock(&delayed_refs->lock);
2342 		schedule_timeout(1);
2343 		goto again;
2344 	}
2345 out:
2346 	spin_unlock(&delayed_refs->lock);
2347 	return 0;
2348 }
2349 
2350 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2351 				struct btrfs_root *root,
2352 				u64 bytenr, u64 num_bytes, u64 flags,
2353 				int is_data)
2354 {
2355 	struct btrfs_delayed_extent_op *extent_op;
2356 	int ret;
2357 
2358 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2359 	if (!extent_op)
2360 		return -ENOMEM;
2361 
2362 	extent_op->flags_to_set = flags;
2363 	extent_op->update_flags = 1;
2364 	extent_op->update_key = 0;
2365 	extent_op->is_data = is_data ? 1 : 0;
2366 
2367 	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2368 	if (ret)
2369 		kfree(extent_op);
2370 	return ret;
2371 }
2372 
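/*
 * Look through the delayed refs for @bytenr.  Returns 0 if the only
 * pending data ref belongs to (@root, @objectid, @offset), 1 if some other
 * reference may exist, -ENOENT if there is no delayed head for this
 * extent, and -EAGAIN if the head mutex was contended and the caller
 * should retry.
 */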
2373 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2374 				      struct btrfs_root *root,
2375 				      struct btrfs_path *path,
2376 				      u64 objectid, u64 offset, u64 bytenr)
2377 {
2378 	struct btrfs_delayed_ref_head *head;
2379 	struct btrfs_delayed_ref_node *ref;
2380 	struct btrfs_delayed_data_ref *data_ref;
2381 	struct btrfs_delayed_ref_root *delayed_refs;
2382 	struct rb_node *node;
2383 	int ret = 0;
2384 
2385 	ret = -ENOENT;
2386 	delayed_refs = &trans->transaction->delayed_refs;
2387 	spin_lock(&delayed_refs->lock);
2388 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2389 	if (!head)
2390 		goto out;
2391 
2392 	if (!mutex_trylock(&head->mutex)) {
2393 		atomic_inc(&head->node.refs);
2394 		spin_unlock(&delayed_refs->lock);
2395 
2396 		btrfs_release_path(path);
2397 
2398 		/*
2399 		 * Mutex was contended, block until it's released and let
2400 		 * caller try again
2401 		 */
2402 		mutex_lock(&head->mutex);
2403 		mutex_unlock(&head->mutex);
2404 		btrfs_put_delayed_ref(&head->node);
2405 		return -EAGAIN;
2406 	}
2407 
2408 	node = rb_prev(&head->node.rb_node);
2409 	if (!node)
2410 		goto out_unlock;
2411 
2412 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2413 
2414 	if (ref->bytenr != bytenr)
2415 		goto out_unlock;
2416 
2417 	ret = 1;
2418 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2419 		goto out_unlock;
2420 
2421 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2422 
2423 	node = rb_prev(node);
2424 	if (node) {
2425 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2426 		if (ref->bytenr == bytenr)
2427 			goto out_unlock;
2428 	}
2429 
2430 	if (data_ref->root != root->root_key.objectid ||
2431 	    data_ref->objectid != objectid || data_ref->offset != offset)
2432 		goto out_unlock;
2433 
2434 	ret = 0;
2435 out_unlock:
2436 	mutex_unlock(&head->mutex);
2437 out:
2438 	spin_unlock(&delayed_refs->lock);
2439 	return ret;
2440 }
2441 
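/*
 * Check the committed extent tree.  Returns 0 only if the extent item has
 * a single inline EXTENT_DATA_REF owned by this root/inode/offset and was
 * created after the last snapshot; otherwise the extent may be shared and
 * 1 (or -ENOENT) is returned.
 */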
2442 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2443 					struct btrfs_root *root,
2444 					struct btrfs_path *path,
2445 					u64 objectid, u64 offset, u64 bytenr)
2446 {
2447 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2448 	struct extent_buffer *leaf;
2449 	struct btrfs_extent_data_ref *ref;
2450 	struct btrfs_extent_inline_ref *iref;
2451 	struct btrfs_extent_item *ei;
2452 	struct btrfs_key key;
2453 	u32 item_size;
2454 	int ret;
2455 
2456 	key.objectid = bytenr;
2457 	key.offset = (u64)-1;
2458 	key.type = BTRFS_EXTENT_ITEM_KEY;
2459 
2460 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2461 	if (ret < 0)
2462 		goto out;
2463 	BUG_ON(ret == 0);
2464 
2465 	ret = -ENOENT;
2466 	if (path->slots[0] == 0)
2467 		goto out;
2468 
2469 	path->slots[0]--;
2470 	leaf = path->nodes[0];
2471 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2472 
2473 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2474 		goto out;
2475 
2476 	ret = 1;
2477 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2478 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2479 	if (item_size < sizeof(*ei)) {
2480 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2481 		goto out;
2482 	}
2483 #endif
2484 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2485 
2486 	if (item_size != sizeof(*ei) +
2487 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2488 		goto out;
2489 
2490 	if (btrfs_extent_generation(leaf, ei) <=
2491 	    btrfs_root_last_snapshot(&root->root_item))
2492 		goto out;
2493 
2494 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2495 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2496 	    BTRFS_EXTENT_DATA_REF_KEY)
2497 		goto out;
2498 
2499 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2500 	if (btrfs_extent_refs(leaf, ei) !=
2501 	    btrfs_extent_data_ref_count(leaf, ref) ||
2502 	    btrfs_extent_data_ref_root(leaf, ref) !=
2503 	    root->root_key.objectid ||
2504 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2505 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2506 		goto out;
2507 
2508 	ret = 0;
2509 out:
2510 	return ret;
2511 }
2512 
2513 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2514 			  struct btrfs_root *root,
2515 			  u64 objectid, u64 offset, u64 bytenr)
2516 {
2517 	struct btrfs_path *path;
2518 	int ret;
2519 	int ret2;
2520 
2521 	path = btrfs_alloc_path();
2522 	if (!path)
2523 		return -ENOMEM;
2524 
2525 	do {
2526 		ret = check_committed_ref(trans, root, path, objectid,
2527 					  offset, bytenr);
2528 		if (ret && ret != -ENOENT)
2529 			goto out;
2530 
2531 		ret2 = check_delayed_ref(trans, root, path, objectid,
2532 					 offset, bytenr);
2533 	} while (ret2 == -EAGAIN);
2534 
2535 	if (ret2 && ret2 != -ENOENT) {
2536 		ret = ret2;
2537 		goto out;
2538 	}
2539 
2540 	if (ret != -ENOENT || ret2 != -ENOENT)
2541 		ret = 0;
2542 out:
2543 	btrfs_free_path(path);
2544 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2545 		WARN_ON(ret > 0);
2546 	return ret;
2547 }
2548 
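/*
 * Walk every pointer in @buf and add or drop one reference for each extent
 * it points to: file extent disk ranges for leaves, child tree blocks for
 * nodes.  @full_backref selects shared (parent based) backrefs.
 */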
2549 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2550 			   struct btrfs_root *root,
2551 			   struct extent_buffer *buf,
2552 			   int full_backref, int inc)
2553 {
2554 	u64 bytenr;
2555 	u64 num_bytes;
2556 	u64 parent;
2557 	u64 ref_root;
2558 	u32 nritems;
2559 	struct btrfs_key key;
2560 	struct btrfs_file_extent_item *fi;
2561 	int i;
2562 	int level;
2563 	int ret = 0;
2564 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2565 			    u64, u64, u64, u64, u64, u64);
2566 
2567 	ref_root = btrfs_header_owner(buf);
2568 	nritems = btrfs_header_nritems(buf);
2569 	level = btrfs_header_level(buf);
2570 
2571 	if (!root->ref_cows && level == 0)
2572 		return 0;
2573 
2574 	if (inc)
2575 		process_func = btrfs_inc_extent_ref;
2576 	else
2577 		process_func = btrfs_free_extent;
2578 
2579 	if (full_backref)
2580 		parent = buf->start;
2581 	else
2582 		parent = 0;
2583 
2584 	for (i = 0; i < nritems; i++) {
2585 		if (level == 0) {
2586 			btrfs_item_key_to_cpu(buf, &key, i);
2587 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2588 				continue;
2589 			fi = btrfs_item_ptr(buf, i,
2590 					    struct btrfs_file_extent_item);
2591 			if (btrfs_file_extent_type(buf, fi) ==
2592 			    BTRFS_FILE_EXTENT_INLINE)
2593 				continue;
2594 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2595 			if (bytenr == 0)
2596 				continue;
2597 
2598 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2599 			key.offset -= btrfs_file_extent_offset(buf, fi);
2600 			ret = process_func(trans, root, bytenr, num_bytes,
2601 					   parent, ref_root, key.objectid,
2602 					   key.offset);
2603 			if (ret)
2604 				goto fail;
2605 		} else {
2606 			bytenr = btrfs_node_blockptr(buf, i);
2607 			num_bytes = btrfs_level_size(root, level - 1);
2608 			ret = process_func(trans, root, bytenr, num_bytes,
2609 					   parent, ref_root, level - 1, 0);
2610 			if (ret)
2611 				goto fail;
2612 		}
2613 	}
2614 	return 0;
2615 fail:
2616 	BUG();
2617 	return ret;
2618 }
2619 
2620 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2621 		  struct extent_buffer *buf, int full_backref)
2622 {
2623 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2624 }
2625 
2626 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2627 		  struct extent_buffer *buf, int full_backref)
2628 {
2629 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2630 }
2631 
2632 static int write_one_cache_group(struct btrfs_trans_handle *trans,
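/* write the block group item for @cache back into the extent tree */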
2633 				 struct btrfs_root *root,
2634 				 struct btrfs_path *path,
2635 				 struct btrfs_block_group_cache *cache)
2636 {
2637 	int ret;
2638 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2639 	unsigned long bi;
2640 	struct extent_buffer *leaf;
2641 
2642 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2643 	if (ret < 0)
2644 		goto fail;
2645 	BUG_ON(ret);
2646 
2647 	leaf = path->nodes[0];
2648 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2649 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2650 	btrfs_mark_buffer_dirty(leaf);
2651 	btrfs_release_path(path);
2652 fail:
2653 	if (ret)
2654 		return ret;
2655 	return 0;
2657 }
2658 
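/*
 * Return the block group that follows @cache in bytenr order, dropping our
 * reference on @cache and taking one on the group that is returned.
 */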
2659 static struct btrfs_block_group_cache *
2660 next_block_group(struct btrfs_root *root,
2661 		 struct btrfs_block_group_cache *cache)
2662 {
2663 	struct rb_node *node;
2664 	spin_lock(&root->fs_info->block_group_cache_lock);
2665 	node = rb_next(&cache->cache_node);
2666 	btrfs_put_block_group(cache);
2667 	if (node) {
2668 		cache = rb_entry(node, struct btrfs_block_group_cache,
2669 				 cache_node);
2670 		btrfs_get_block_group(cache);
2671 	} else
2672 		cache = NULL;
2673 	spin_unlock(&root->fs_info->block_group_cache_lock);
2674 	return cache;
2675 }
2676 
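/*
 * Prepare the free space cache inode for @block_group so its free space
 * can be written out at commit time: create or truncate the inode and
 * preallocate room for the extents and bitmaps.
 */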
2677 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2678 			    struct btrfs_trans_handle *trans,
2679 			    struct btrfs_path *path)
2680 {
2681 	struct btrfs_root *root = block_group->fs_info->tree_root;
2682 	struct inode *inode = NULL;
2683 	u64 alloc_hint = 0;
2684 	int dcs = BTRFS_DC_ERROR;
2685 	int num_pages = 0;
2686 	int retries = 0;
2687 	int ret = 0;
2688 
2689 	/*
2690 	 * If this block group is smaller than 100 megs, don't bother
2691 	 * caching it.
2692 	 */
2693 	if (block_group->key.offset < (100 * 1024 * 1024)) {
2694 		spin_lock(&block_group->lock);
2695 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2696 		spin_unlock(&block_group->lock);
2697 		return 0;
2698 	}
2699 
2700 again:
2701 	inode = lookup_free_space_inode(root, block_group, path);
2702 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2703 		ret = PTR_ERR(inode);
2704 		btrfs_release_path(path);
2705 		goto out;
2706 	}
2707 
2708 	if (IS_ERR(inode)) {
2709 		BUG_ON(retries);
2710 		retries++;
2711 
2712 		if (block_group->ro)
2713 			goto out_free;
2714 
2715 		ret = create_free_space_inode(root, trans, block_group, path);
2716 		if (ret)
2717 			goto out_free;
2718 		goto again;
2719 	}
2720 
2721 	/* We've already set up this transaction, go ahead and exit */
2722 	if (block_group->cache_generation == trans->transid &&
2723 	    i_size_read(inode)) {
2724 		dcs = BTRFS_DC_SETUP;
2725 		goto out_put;
2726 	}
2727 
2728 	/*
2729 	 * We want to set the generation to 0, that way if anything goes wrong
2730 	 * from here on out we know not to trust this cache when we load up next
2731 	 * time.
2732 	 */
2733 	BTRFS_I(inode)->generation = 0;
2734 	ret = btrfs_update_inode(trans, root, inode);
2735 	WARN_ON(ret);
2736 
2737 	if (i_size_read(inode) > 0) {
2738 		ret = btrfs_truncate_free_space_cache(root, trans, path,
2739 						      inode);
2740 		if (ret)
2741 			goto out_put;
2742 	}
2743 
2744 	spin_lock(&block_group->lock);
2745 	if (block_group->cached != BTRFS_CACHE_FINISHED) {
2746 		/* We're not cached, don't bother trying to write stuff out */
2747 		dcs = BTRFS_DC_WRITTEN;
2748 		spin_unlock(&block_group->lock);
2749 		goto out_put;
2750 	}
2751 	spin_unlock(&block_group->lock);
2752 
2753 	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2754 	if (!num_pages)
2755 		num_pages = 1;
2756 
2757 	/*
2758 	 * Just to make absolutely sure we have enough space, we're going to
2759 	 * preallocate 16 pages worth of space for each block group.  In
2760 	 * practice we ought to use at most 8, but we need extra space so we can
2761 	 * add our header and have a terminator between the extents and the
2762 	 * bitmaps.
2763 	 */
2764 	num_pages *= 16;
2765 	num_pages *= PAGE_CACHE_SIZE;
2766 
2767 	ret = btrfs_check_data_free_space(inode, num_pages);
2768 	if (ret)
2769 		goto out_put;
2770 
2771 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2772 					      num_pages, num_pages,
2773 					      &alloc_hint);
2774 	if (!ret)
2775 		dcs = BTRFS_DC_SETUP;
2776 	btrfs_free_reserved_data_space(inode, num_pages);
2777 
2778 out_put:
2779 	iput(inode);
2780 out_free:
2781 	btrfs_release_path(path);
2782 out:
2783 	spin_lock(&block_group->lock);
2784 	if (!ret)
2785 		block_group->cache_generation = trans->transid;
2786 	block_group->disk_cache_state = dcs;
2787 	spin_unlock(&block_group->lock);
2788 
2789 	return ret;
2790 }
2791 
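/*
 * Write out every dirty block group item, setting up and then writing the
 * on-disk free space cache for each block group along the way.
 */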
2792 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2793 				   struct btrfs_root *root)
2794 {
2795 	struct btrfs_block_group_cache *cache;
2796 	int err = 0;
2797 	struct btrfs_path *path;
2798 	u64 last = 0;
2799 
2800 	path = btrfs_alloc_path();
2801 	if (!path)
2802 		return -ENOMEM;
2803 
2804 again:
2805 	while (1) {
2806 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2807 		while (cache) {
2808 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2809 				break;
2810 			cache = next_block_group(root, cache);
2811 		}
2812 		if (!cache) {
2813 			if (last == 0)
2814 				break;
2815 			last = 0;
2816 			continue;
2817 		}
2818 		err = cache_save_setup(cache, trans, path);
2819 		last = cache->key.objectid + cache->key.offset;
2820 		btrfs_put_block_group(cache);
2821 	}
2822 
2823 	while (1) {
2824 		if (last == 0) {
2825 			err = btrfs_run_delayed_refs(trans, root,
2826 						     (unsigned long)-1);
2827 			BUG_ON(err);
2828 		}
2829 
2830 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2831 		while (cache) {
2832 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2833 				btrfs_put_block_group(cache);
2834 				goto again;
2835 			}
2836 
2837 			if (cache->dirty)
2838 				break;
2839 			cache = next_block_group(root, cache);
2840 		}
2841 		if (!cache) {
2842 			if (last == 0)
2843 				break;
2844 			last = 0;
2845 			continue;
2846 		}
2847 
2848 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
2849 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2850 		cache->dirty = 0;
2851 		last = cache->key.objectid + cache->key.offset;
2852 
2853 		err = write_one_cache_group(trans, root, path, cache);
2854 		BUG_ON(err);
2855 		btrfs_put_block_group(cache);
2856 	}
2857 
2858 	while (1) {
2859 		/*
2860 		 * I don't think this is needed since we're just marking our
2861 		 * preallocated extent as written, but it can't hurt just in
2862 		 * case.
2863 		 */
2864 		if (last == 0) {
2865 			err = btrfs_run_delayed_refs(trans, root,
2866 						     (unsigned long)-1);
2867 			BUG_ON(err);
2868 		}
2869 
2870 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2871 		while (cache) {
2872 			/*
2873 			 * Really this shouldn't happen, but it could if we
2874 			 * couldn't write the entire preallocated extent and
2875 			 * splitting the extent resulted in a new block.
2876 			 */
2877 			if (cache->dirty) {
2878 				btrfs_put_block_group(cache);
2879 				goto again;
2880 			}
2881 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2882 				break;
2883 			cache = next_block_group(root, cache);
2884 		}
2885 		if (!cache) {
2886 			if (last == 0)
2887 				break;
2888 			last = 0;
2889 			continue;
2890 		}
2891 
2892 		btrfs_write_out_cache(root, trans, cache, path);
2893 
2894 		/*
2895 		 * If we didn't have an error then the cache state is still
2896 		 * NEED_WRITE, so we can set it to WRITTEN.
2897 		 */
2898 		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2899 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
2900 		last = cache->key.objectid + cache->key.offset;
2901 		btrfs_put_block_group(cache);
2902 	}
2903 
2904 	btrfs_free_path(path);
2905 	return 0;
2906 }
2907 
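/* return 1 if the block group containing @bytenr is read-only or missing */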
2908 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2909 {
2910 	struct btrfs_block_group_cache *block_group;
2911 	int readonly = 0;
2912 
2913 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2914 	if (!block_group || block_group->ro)
2915 		readonly = 1;
2916 	if (block_group)
2917 		btrfs_put_block_group(block_group);
2918 	return readonly;
2919 }
2920 
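/*
 * Find or create the space_info for @flags and account the new bytes,
 * doubling the on-disk totals for mirrored profiles (DUP/RAID1/RAID10).
 */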
2921 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2922 			     u64 total_bytes, u64 bytes_used,
2923 			     struct btrfs_space_info **space_info)
2924 {
2925 	struct btrfs_space_info *found;
2926 	int i;
2927 	int factor;
2928 
2929 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2930 		     BTRFS_BLOCK_GROUP_RAID10))
2931 		factor = 2;
2932 	else
2933 		factor = 1;
2934 
2935 	found = __find_space_info(info, flags);
2936 	if (found) {
2937 		spin_lock(&found->lock);
2938 		found->total_bytes += total_bytes;
2939 		found->disk_total += total_bytes * factor;
2940 		found->bytes_used += bytes_used;
2941 		found->disk_used += bytes_used * factor;
2942 		found->full = 0;
2943 		spin_unlock(&found->lock);
2944 		*space_info = found;
2945 		return 0;
2946 	}
2947 	found = kzalloc(sizeof(*found), GFP_NOFS);
2948 	if (!found)
2949 		return -ENOMEM;
2950 
2951 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2952 		INIT_LIST_HEAD(&found->block_groups[i]);
2953 	init_rwsem(&found->groups_sem);
2954 	spin_lock_init(&found->lock);
2955 	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2956 				BTRFS_BLOCK_GROUP_SYSTEM |
2957 				BTRFS_BLOCK_GROUP_METADATA);
2958 	found->total_bytes = total_bytes;
2959 	found->disk_total = total_bytes * factor;
2960 	found->bytes_used = bytes_used;
2961 	found->disk_used = bytes_used * factor;
2962 	found->bytes_pinned = 0;
2963 	found->bytes_reserved = 0;
2964 	found->bytes_readonly = 0;
2965 	found->bytes_may_use = 0;
2966 	found->full = 0;
2967 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2968 	found->chunk_alloc = 0;
2969 	found->flush = 0;
2970 	init_waitqueue_head(&found->wait);
2971 	*space_info = found;
2972 	list_add_rcu(&found->list, &info->space_info);
2973 	return 0;
2974 }
2975 
2976 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2977 {
2978 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2979 				   BTRFS_BLOCK_GROUP_RAID1 |
2980 				   BTRFS_BLOCK_GROUP_RAID10 |
2981 				   BTRFS_BLOCK_GROUP_DUP);
2982 	if (extra_flags) {
2983 		if (flags & BTRFS_BLOCK_GROUP_DATA)
2984 			fs_info->avail_data_alloc_bits |= extra_flags;
2985 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
2986 			fs_info->avail_metadata_alloc_bits |= extra_flags;
2987 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2988 			fs_info->avail_system_alloc_bits |= extra_flags;
2989 	}
2990 }
2991 
2992 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2993 {
2994 	/*
2995 	 * we add in the count of missing devices because we want
2996 	 * to make sure that any RAID levels on a degraded FS
2997 	 * continue to be honored.
2998 	 */
2999 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3000 		root->fs_info->fs_devices->missing_devices;
3001 
3002 	if (num_devices == 1)
3003 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3004 	if (num_devices < 4)
3005 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3006 
3007 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3008 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3009 		      BTRFS_BLOCK_GROUP_RAID10))) {
3010 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
3011 	}
3012 
3013 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3014 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3015 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3016 	}
3017 
3018 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3019 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3020 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
3021 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
3022 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3023 	return flags;
3024 }
3025 
3026 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3027 {
3028 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3029 		flags |= root->fs_info->avail_data_alloc_bits &
3030 			 root->fs_info->data_alloc_profile;
3031 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3032 		flags |= root->fs_info->avail_system_alloc_bits &
3033 			 root->fs_info->system_alloc_profile;
3034 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3035 		flags |= root->fs_info->avail_metadata_alloc_bits &
3036 			 root->fs_info->metadata_alloc_profile;
3037 	return btrfs_reduce_alloc_profile(root, flags);
3038 }
3039 
3040 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3041 {
3042 	u64 flags;
3043 
3044 	if (data)
3045 		flags = BTRFS_BLOCK_GROUP_DATA;
3046 	else if (root == root->fs_info->chunk_root)
3047 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3048 	else
3049 		flags = BTRFS_BLOCK_GROUP_METADATA;
3050 
3051 	return get_alloc_profile(root, flags);
3052 }
3053 
3054 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3055 {
3056 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3057 						       BTRFS_BLOCK_GROUP_DATA);
3058 }
3059 
3060 /*
3061  * This will check the space that the inode allocates from to make sure we have
3062  * enough space for bytes.
3063  * enough space for the requested bytes.
3064 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3065 {
3066 	struct btrfs_space_info *data_sinfo;
3067 	struct btrfs_root *root = BTRFS_I(inode)->root;
3068 	u64 used;
3069 	int ret = 0, committed = 0, alloc_chunk = 1;
3070 
3071 	/* make sure bytes are sectorsize aligned */
3072 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3073 
3074 	if (root == root->fs_info->tree_root ||
3075 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3076 		alloc_chunk = 0;
3077 		committed = 1;
3078 	}
3079 
3080 	data_sinfo = BTRFS_I(inode)->space_info;
3081 	if (!data_sinfo)
3082 		goto alloc;
3083 
3084 again:
3085 	/* make sure we have enough space to handle the data first */
3086 	spin_lock(&data_sinfo->lock);
3087 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3088 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3089 		data_sinfo->bytes_may_use;
3090 
3091 	if (used + bytes > data_sinfo->total_bytes) {
3092 		struct btrfs_trans_handle *trans;
3093 
3094 		/*
3095 		 * if we don't have enough free bytes in this space then we need
3096 		 * to alloc a new chunk.
3097 		 */
3098 		if (!data_sinfo->full && alloc_chunk) {
3099 			u64 alloc_target;
3100 
3101 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3102 			spin_unlock(&data_sinfo->lock);
3103 alloc:
3104 			alloc_target = btrfs_get_alloc_profile(root, 1);
3105 			trans = btrfs_join_transaction(root);
3106 			if (IS_ERR(trans))
3107 				return PTR_ERR(trans);
3108 
3109 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3110 					     bytes + 2 * 1024 * 1024,
3111 					     alloc_target,
3112 					     CHUNK_ALLOC_NO_FORCE);
3113 			btrfs_end_transaction(trans, root);
3114 			if (ret < 0) {
3115 				if (ret != -ENOSPC)
3116 					return ret;
3117 				else
3118 					goto commit_trans;
3119 			}
3120 
3121 			if (!data_sinfo) {
3122 				btrfs_set_inode_space_info(root, inode);
3123 				data_sinfo = BTRFS_I(inode)->space_info;
3124 			}
3125 			goto again;
3126 		}
3127 
3128 		/*
3129 		 * If we have less pinned bytes than we want to allocate then
3130 		 * don't bother committing the transaction, it won't help us.
3131 		 */
3132 		if (data_sinfo->bytes_pinned < bytes)
3133 			committed = 1;
3134 		spin_unlock(&data_sinfo->lock);
3135 
3136 		/* commit the current transaction and try again */
3137 commit_trans:
3138 		if (!committed &&
3139 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3140 			committed = 1;
3141 			trans = btrfs_join_transaction(root);
3142 			if (IS_ERR(trans))
3143 				return PTR_ERR(trans);
3144 			ret = btrfs_commit_transaction(trans, root);
3145 			if (ret)
3146 				return ret;
3147 			goto again;
3148 		}
3149 
3150 		return -ENOSPC;
3151 	}
3152 	data_sinfo->bytes_may_use += bytes;
3153 	spin_unlock(&data_sinfo->lock);
3154 
3155 	return 0;
3156 }
3157 
3158 /*
3159  * Called if we need to clear a data reservation for this inode.
3160  */
3161 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3162 {
3163 	struct btrfs_root *root = BTRFS_I(inode)->root;
3164 	struct btrfs_space_info *data_sinfo;
3165 
3166 	/* make sure bytes are sectorsize aligned */
3167 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3168 
3169 	data_sinfo = BTRFS_I(inode)->space_info;
3170 	spin_lock(&data_sinfo->lock);
3171 	data_sinfo->bytes_may_use -= bytes;
3172 	spin_unlock(&data_sinfo->lock);
3173 }
3174 
3175 static void force_metadata_allocation(struct btrfs_fs_info *info)
3176 {
3177 	struct list_head *head = &info->space_info;
3178 	struct btrfs_space_info *found;
3179 
3180 	rcu_read_lock();
3181 	list_for_each_entry_rcu(found, head, list) {
3182 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3183 			found->force_alloc = CHUNK_ALLOC_FORCE;
3184 	}
3185 	rcu_read_unlock();
3186 }
3187 
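/*
 * Heuristic used by do_chunk_alloc: decide whether a new chunk of this
 * type is worth allocating, based on how full the existing chunks are and
 * on the force level passed in.
 */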
3188 static int should_alloc_chunk(struct btrfs_root *root,
3189 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
3190 			      int force)
3191 {
3192 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3193 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3194 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3195 	u64 thresh;
3196 
3197 	if (force == CHUNK_ALLOC_FORCE)
3198 		return 1;
3199 
3200 	/*
3201 	 * We need to take into account the global rsv because for all intents
3202 	 * and purposes it's used space.  Don't worry about locking the
3203 	 * global_rsv, it doesn't change except when the transaction commits.
3204 	 */
3205 	num_allocated += global_rsv->size;
3206 
3207 	/*
3208 	 * in limited mode, we want to have some free space up to
3209 	 * about 1% of the FS size.
3210 	 */
3211 	if (force == CHUNK_ALLOC_LIMITED) {
3212 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3213 		thresh = max_t(u64, 64 * 1024 * 1024,
3214 			       div_factor_fine(thresh, 1));
3215 
3216 		if (num_bytes - num_allocated < thresh)
3217 			return 1;
3218 	}
3219 
3220 	/*
3221 	 * we have two similar checks here, one based on a percentage
3222 	 * and one based on a hard number of 256MB.  The idea
3223 	 * is that if we have a good amount of free
3224 	 * room, don't allocate a chunk.  A good amount means
3225 	 * less than 80% of the chunks we have allocated are utilized,
3226 	 * or more than 256MB is free
3227 	 */
3228 	if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3229 		return 0;
3230 
3231 	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3232 		return 0;
3233 
3234 	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3235 
3236 	/* 256MB or 5% of the FS */
3237 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3238 
3239 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3240 		return 0;
3241 	return 1;
3242 }
3243 
3244 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3245 			  struct btrfs_root *extent_root, u64 alloc_bytes,
3246 			  u64 flags, int force)
3247 {
3248 	struct btrfs_space_info *space_info;
3249 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3250 	int wait_for_alloc = 0;
3251 	int ret = 0;
3252 
3253 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
3254 
3255 	space_info = __find_space_info(extent_root->fs_info, flags);
3256 	if (!space_info) {
3257 		ret = update_space_info(extent_root->fs_info, flags,
3258 					0, 0, &space_info);
3259 		BUG_ON(ret);
3260 	}
3261 	BUG_ON(!space_info);
3262 
3263 again:
3264 	spin_lock(&space_info->lock);
3265 	if (space_info->force_alloc)
3266 		force = space_info->force_alloc;
3267 	if (space_info->full) {
3268 		spin_unlock(&space_info->lock);
3269 		return 0;
3270 	}
3271 
3272 	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3273 		spin_unlock(&space_info->lock);
3274 		return 0;
3275 	} else if (space_info->chunk_alloc) {
3276 		wait_for_alloc = 1;
3277 	} else {
3278 		space_info->chunk_alloc = 1;
3279 	}
3280 
3281 	spin_unlock(&space_info->lock);
3282 
3283 	mutex_lock(&fs_info->chunk_mutex);
3284 
3285 	/*
3286 	 * The chunk_mutex is held throughout the entirety of a chunk
3287 	 * allocation, so once we've acquired the chunk_mutex we know that the
3288 	 * other guy is done and we need to recheck and see if we should
3289 	 * allocate.
3290 	 */
3291 	if (wait_for_alloc) {
3292 		mutex_unlock(&fs_info->chunk_mutex);
3293 		wait_for_alloc = 0;
3294 		goto again;
3295 	}
3296 
3297 	/*
3298 	 * If we have mixed data/metadata chunks we want to make sure we keep
3299 	 * allocating mixed chunks instead of individual chunks.
3300 	 */
3301 	if (btrfs_mixed_space_info(space_info))
3302 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3303 
3304 	/*
3305 	 * if we're doing a data chunk, go ahead and make sure that
3306 	 * we keep a reasonable number of metadata chunks allocated in the
3307 	 * FS as well.
3308 	 */
3309 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3310 		fs_info->data_chunk_allocations++;
3311 		if (!(fs_info->data_chunk_allocations %
3312 		      fs_info->metadata_ratio))
3313 			force_metadata_allocation(fs_info);
3314 	}
3315 
3316 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3317 	if (ret < 0 && ret != -ENOSPC)
3318 		goto out;
3319 
3320 	spin_lock(&space_info->lock);
3321 	if (ret)
3322 		space_info->full = 1;
3323 	else
3324 		ret = 1;
3325 
3326 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3327 	space_info->chunk_alloc = 0;
3328 	spin_unlock(&space_info->lock);
3329 out:
3330 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
3331 	return ret;
3332 }
3333 
3334 /*
3335  * shrink metadata reservation for delalloc
3336  */
3337 static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3338 			   bool wait_ordered)
3339 {
3340 	struct btrfs_block_rsv *block_rsv;
3341 	struct btrfs_space_info *space_info;
3342 	struct btrfs_trans_handle *trans;
3343 	u64 reserved;
3344 	u64 max_reclaim;
3345 	u64 reclaimed = 0;
3346 	long time_left;
3347 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3348 	int loops = 0;
3349 	unsigned long progress;
3350 
3351 	trans = (struct btrfs_trans_handle *)current->journal_info;
3352 	block_rsv = &root->fs_info->delalloc_block_rsv;
3353 	space_info = block_rsv->space_info;
3354 
3355 	smp_mb();
3356 	reserved = space_info->bytes_may_use;
3357 	progress = space_info->reservation_progress;
3358 
3359 	if (reserved == 0)
3360 		return 0;
3361 
3362 	smp_mb();
3363 	if (root->fs_info->delalloc_bytes == 0) {
3364 		if (trans)
3365 			return 0;
3366 		btrfs_wait_ordered_extents(root, 0, 0);
3367 		return 0;
3368 	}
3369 
3370 	max_reclaim = min(reserved, to_reclaim);
3371 	nr_pages = max_t(unsigned long, nr_pages,
3372 			 max_reclaim >> PAGE_CACHE_SHIFT);
3373 	while (loops < 1024) {
3374 		/* have the flusher threads jump in and do some IO */
3375 		smp_mb();
3376 		nr_pages = min_t(unsigned long, nr_pages,
3377 		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3378 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3379 						WB_REASON_FS_FREE_SPACE);
3380 
3381 		spin_lock(&space_info->lock);
3382 		if (reserved > space_info->bytes_may_use)
3383 			reclaimed += reserved - space_info->bytes_may_use;
3384 		reserved = space_info->bytes_may_use;
3385 		spin_unlock(&space_info->lock);
3386 
3387 		loops++;
3388 
3389 		if (reserved == 0 || reclaimed >= max_reclaim)
3390 			break;
3391 
3392 		if (trans && trans->transaction->blocked)
3393 			return -EAGAIN;
3394 
3395 		if (wait_ordered && !trans) {
3396 			btrfs_wait_ordered_extents(root, 0, 0);
3397 		} else {
3398 			time_left = schedule_timeout_interruptible(1);
3399 
3400 			/* We were interrupted, exit */
3401 			if (time_left)
3402 				break;
3403 		}
3404 
3405 		/* we've kicked the IO a few times; if anything has been freed,
3406 		 * exit.  There is no sense in looping here for a long time
3407 		 * when we really need to commit the transaction, or there are
3408 		 * just too many writers without enough free space
3409 		 */
3410 
3411 		if (loops > 3) {
3412 			smp_mb();
3413 			if (progress != space_info->reservation_progress)
3414 				break;
3415 		}
3416 
3417 	}
3418 
3419 	return reclaimed >= to_reclaim;
3420 }
3421 
3422 /**
3423  * maybe_commit_transaction - possibly commit the transaction if its ok to
3424  * may_commit_transaction - possibly commit the transaction if it's ok to
3425  * @bytes - the number of bytes we want to reserve
3426  * @force - force the commit
3427  *
3428  * This will check to make sure that committing the transaction will actually
3429  * get us somewhere and then commit the transaction if it does.  Otherwise it
3430  * will return -ENOSPC.
3431  */
3432 static int may_commit_transaction(struct btrfs_root *root,
3433 				  struct btrfs_space_info *space_info,
3434 				  u64 bytes, int force)
3435 {
3436 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3437 	struct btrfs_trans_handle *trans;
3438 
3439 	trans = (struct btrfs_trans_handle *)current->journal_info;
3440 	if (trans)
3441 		return -EAGAIN;
3442 
3443 	if (force)
3444 		goto commit;
3445 
3446 	/* See if there is enough pinned space to make this reservation */
3447 	spin_lock(&space_info->lock);
3448 	if (space_info->bytes_pinned >= bytes) {
3449 		spin_unlock(&space_info->lock);
3450 		goto commit;
3451 	}
3452 	spin_unlock(&space_info->lock);
3453 
3454 	/*
3455 	 * See if there is some space in the delayed insertion reservation for
3456 	 * this reservation.
3457 	 */
3458 	if (space_info != delayed_rsv->space_info)
3459 		return -ENOSPC;
3460 
3461 	spin_lock(&delayed_rsv->lock);
3462 	if (delayed_rsv->size < bytes) {
3463 		spin_unlock(&delayed_rsv->lock);
3464 		return -ENOSPC;
3465 	}
3466 	spin_unlock(&delayed_rsv->lock);
3467 
3468 commit:
3469 	trans = btrfs_join_transaction(root);
3470 	if (IS_ERR(trans))
3471 		return -ENOSPC;
3472 
3473 	return btrfs_commit_transaction(trans, root);
3474 }
3475 
3476 /**
3477  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3478  * @root - the root we're allocating for
3479  * @block_rsv - the block_rsv we're allocating for
3480  * @orig_bytes - the number of bytes we want
3481  * @flush - wether or not we can flush to make our reservation
3482  * @flush - whether or not we can flush to make our reservation
3483  *
3484  * This will reserve orig_bytes number of bytes from the space info associated
3485  * flush out space to make room.  It will do this by flushing delalloc if
3486  * possible or committing the transaction.  If flush is 0 then no attempts to
3487  * regain reservations will be made and this will fail if there is not enough
3488  * space already.
3489  */
3490 static int reserve_metadata_bytes(struct btrfs_root *root,
3491 				  struct btrfs_block_rsv *block_rsv,
3492 				  u64 orig_bytes, int flush)
3493 {
3494 	struct btrfs_space_info *space_info = block_rsv->space_info;
3495 	u64 used;
3496 	u64 num_bytes = orig_bytes;
3497 	int retries = 0;
3498 	int ret = 0;
3499 	bool committed = false;
3500 	bool flushing = false;
3501 	bool wait_ordered = false;
3502 
3503 again:
3504 	ret = 0;
3505 	spin_lock(&space_info->lock);
3506 	/*
3507 	 * We only want to wait if somebody other than us is flushing and we are
3508 	 * actually allowed to flush.
3509 	 */
3510 	while (flush && !flushing && space_info->flush) {
3511 		spin_unlock(&space_info->lock);
3512 		/*
3513 		 * If we have a trans handle we can't wait because the flusher
3514 		 * may have to commit the transaction, which would mean we would
3515 		 * deadlock since we are waiting for the flusher to finish, but
3516 		 * hold the current transaction open.
3517 		 */
3518 		if (current->journal_info)
3519 			return -EAGAIN;
3520 		ret = wait_event_interruptible(space_info->wait,
3521 					       !space_info->flush);
3522 		/* Must have been interrupted, return */
3523 		if (ret)
3524 			return -EINTR;
3525 
3526 		spin_lock(&space_info->lock);
3527 	}
3528 
3529 	ret = -ENOSPC;
3530 	used = space_info->bytes_used + space_info->bytes_reserved +
3531 		space_info->bytes_pinned + space_info->bytes_readonly +
3532 		space_info->bytes_may_use;
3533 
3534 	/*
3535 	 * The idea here is that if we've not already over-reserved the block
3536 	 * group then we can go ahead and save our reservation first and then
3537 	 * start flushing if we need to.  Otherwise if we've already
3538 	 * overcommitted, let's start flushing stuff first and then come back
3539 	 * and try to make our reservation.
3540 	 */
3541 	if (used <= space_info->total_bytes) {
3542 		if (used + orig_bytes <= space_info->total_bytes) {
3543 			space_info->bytes_may_use += orig_bytes;
3544 			ret = 0;
3545 		} else {
3546 			/*
3547 			 * Ok, set num_bytes to orig_bytes since we aren't
3548 			 * overcommitted; this way we only try to reclaim what
3549 			 * we need.
3550 			 */
3551 			num_bytes = orig_bytes;
3552 		}
3553 	} else {
3554 		/*
3555 		 * Ok we're over committed, set num_bytes to the overcommitted
3556 		 * amount plus the amount of bytes that we need for this
3557 		 * reservation.
3558 		 */
3559 		wait_ordered = true;
3560 		num_bytes = used - space_info->total_bytes +
3561 			(orig_bytes * (retries + 1));
3562 	}
3563 
3564 	if (ret) {
3565 		u64 profile = btrfs_get_alloc_profile(root, 0);
3566 		u64 avail;
3567 
3568 		/*
3569 		 * If we have a lot of space that's pinned, don't bother doing
3570 		 * the overcommit dance yet and just commit the transaction.
3571 		 */
3572 		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3573 		do_div(avail, 10);
3574 		if (space_info->bytes_pinned >= avail && flush && !committed) {
3575 			space_info->flush = 1;
3576 			flushing = true;
3577 			spin_unlock(&space_info->lock);
3578 			ret = may_commit_transaction(root, space_info,
3579 						     orig_bytes, 1);
3580 			if (ret)
3581 				goto out;
3582 			committed = true;
3583 			goto again;
3584 		}
3585 
3586 		spin_lock(&root->fs_info->free_chunk_lock);
3587 		avail = root->fs_info->free_chunk_space;
3588 
3589 		/*
3590 		 * If we have dup, raid1 or raid10 then only half of the free
3591 		 * space is actually usable.
3592 		 */
3593 		if (profile & (BTRFS_BLOCK_GROUP_DUP |
3594 			       BTRFS_BLOCK_GROUP_RAID1 |
3595 			       BTRFS_BLOCK_GROUP_RAID10))
3596 			avail >>= 1;
3597 
3598 		/*
3599 		 * If we aren't flushing don't let us overcommit too much, say
3600 		 * 1/8th of the space.  If we can flush, let it overcommit up to
3601 		 * 1/2 of the space.
3602 		 */
3603 		if (flush)
3604 			avail >>= 3;
3605 		else
3606 			avail >>= 1;
3607 		spin_unlock(&root->fs_info->free_chunk_lock);
3608 
3609 		if (used + num_bytes < space_info->total_bytes + avail) {
3610 			space_info->bytes_may_use += orig_bytes;
3611 			ret = 0;
3612 		} else {
3613 			wait_ordered = true;
3614 		}
3615 	}
3616 
3617 	/*
3618 	 * Couldn't make our reservation, save our place so while we're trying
3619 	 * to reclaim space we can actually use it instead of somebody else
3620 	 * stealing it from us.
3621 	 */
3622 	if (ret && flush) {
3623 		flushing = true;
3624 		space_info->flush = 1;
3625 	}
3626 
3627 	spin_unlock(&space_info->lock);
3628 
3629 	if (!ret || !flush)
3630 		goto out;
3631 
3632 	/*
3633 	 * We do synchronous shrinking since we don't actually unreserve
3634 	 * metadata until after the IO is completed.
3635 	 */
3636 	ret = shrink_delalloc(root, num_bytes, wait_ordered);
3637 	if (ret < 0)
3638 		goto out;
3639 
3640 	ret = 0;
3641 
3642 	/*
3643 	 * So if we were overcommitted it's possible that somebody else flushed
3644 	 * out enough space and we simply didn't have enough space to reclaim,
3645 	 * so go back around and try again.
3646 	 */
3647 	if (retries < 2) {
3648 		wait_ordered = true;
3649 		retries++;
3650 		goto again;
3651 	}
3652 
3653 	ret = -ENOSPC;
3654 	if (committed)
3655 		goto out;
3656 
3657 	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3658 	if (!ret) {
3659 		committed = true;
3660 		goto again;
3661 	}
3662 
3663 out:
3664 	if (flushing) {
3665 		spin_lock(&space_info->lock);
3666 		space_info->flush = 0;
3667 		wake_up_all(&space_info->wait);
3668 		spin_unlock(&space_info->lock);
3669 	}
3670 	return ret;
3671 }
3672 
3673 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3674 					     struct btrfs_root *root)
3675 {
3676 	struct btrfs_block_rsv *block_rsv = NULL;
3677 
3678 	if (root->ref_cows || root == root->fs_info->csum_root)
3679 		block_rsv = trans->block_rsv;
3680 
3681 	if (!block_rsv)
3682 		block_rsv = root->block_rsv;
3683 
3684 	if (!block_rsv)
3685 		block_rsv = &root->fs_info->empty_block_rsv;
3686 
3687 	return block_rsv;
3688 }
3689 
3690 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3691 			       u64 num_bytes)
3692 {
3693 	int ret = -ENOSPC;
3694 	spin_lock(&block_rsv->lock);
3695 	if (block_rsv->reserved >= num_bytes) {
3696 		block_rsv->reserved -= num_bytes;
3697 		if (block_rsv->reserved < block_rsv->size)
3698 			block_rsv->full = 0;
3699 		ret = 0;
3700 	}
3701 	spin_unlock(&block_rsv->lock);
3702 	return ret;
3703 }
3704 
3705 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3706 				u64 num_bytes, int update_size)
3707 {
3708 	spin_lock(&block_rsv->lock);
3709 	block_rsv->reserved += num_bytes;
3710 	if (update_size)
3711 		block_rsv->size += num_bytes;
3712 	else if (block_rsv->reserved >= block_rsv->size)
3713 		block_rsv->full = 1;
3714 	spin_unlock(&block_rsv->lock);
3715 }
3716 
3717 static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3718 				    struct btrfs_block_rsv *dest, u64 num_bytes)
3719 {
3720 	struct btrfs_space_info *space_info = block_rsv->space_info;
3721 
3722 	spin_lock(&block_rsv->lock);
3723 	if (num_bytes == (u64)-1)
3724 		num_bytes = block_rsv->size;
3725 	block_rsv->size -= num_bytes;
3726 	if (block_rsv->reserved >= block_rsv->size) {
3727 		num_bytes = block_rsv->reserved - block_rsv->size;
3728 		block_rsv->reserved = block_rsv->size;
3729 		block_rsv->full = 1;
3730 	} else {
3731 		num_bytes = 0;
3732 	}
3733 	spin_unlock(&block_rsv->lock);
3734 
3735 	if (num_bytes > 0) {
3736 		if (dest) {
3737 			spin_lock(&dest->lock);
3738 			if (!dest->full) {
3739 				u64 bytes_to_add;
3740 
3741 				bytes_to_add = dest->size - dest->reserved;
3742 				bytes_to_add = min(num_bytes, bytes_to_add);
3743 				dest->reserved += bytes_to_add;
3744 				if (dest->reserved >= dest->size)
3745 					dest->full = 1;
3746 				num_bytes -= bytes_to_add;
3747 			}
3748 			spin_unlock(&dest->lock);
3749 		}
3750 		if (num_bytes) {
3751 			spin_lock(&space_info->lock);
3752 			space_info->bytes_may_use -= num_bytes;
3753 			space_info->reservation_progress++;
3754 			spin_unlock(&space_info->lock);
3755 		}
3756 	}
3757 }
3758 
3759 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3760 				   struct btrfs_block_rsv *dst, u64 num_bytes)
3761 {
3762 	int ret;
3763 
3764 	ret = block_rsv_use_bytes(src, num_bytes);
3765 	if (ret)
3766 		return ret;
3767 
3768 	block_rsv_add_bytes(dst, num_bytes, 1);
3769 	return 0;
3770 }
3771 
3772 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3773 {
3774 	memset(rsv, 0, sizeof(*rsv));
3775 	spin_lock_init(&rsv->lock);
3776 }
3777 
3778 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3779 {
3780 	struct btrfs_block_rsv *block_rsv;
3781 	struct btrfs_fs_info *fs_info = root->fs_info;
3782 
3783 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3784 	if (!block_rsv)
3785 		return NULL;
3786 
3787 	btrfs_init_block_rsv(block_rsv);
3788 	block_rsv->space_info = __find_space_info(fs_info,
3789 						  BTRFS_BLOCK_GROUP_METADATA);
3790 	return block_rsv;
3791 }
3792 
3793 void btrfs_free_block_rsv(struct btrfs_root *root,
3794 			  struct btrfs_block_rsv *rsv)
3795 {
3796 	btrfs_block_rsv_release(root, rsv, (u64)-1);
3797 	kfree(rsv);
3798 }
3799 
3800 static inline int __block_rsv_add(struct btrfs_root *root,
3801 				  struct btrfs_block_rsv *block_rsv,
3802 				  u64 num_bytes, int flush)
3803 {
3804 	int ret;
3805 
3806 	if (num_bytes == 0)
3807 		return 0;
3808 
3809 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3810 	if (!ret) {
3811 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
3812 		return 0;
3813 	}
3814 
3815 	return ret;
3816 }
3817 
3818 int btrfs_block_rsv_add(struct btrfs_root *root,
3819 			struct btrfs_block_rsv *block_rsv,
3820 			u64 num_bytes)
3821 {
3822 	return __block_rsv_add(root, block_rsv, num_bytes, 1);
3823 }
3824 
3825 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3826 				struct btrfs_block_rsv *block_rsv,
3827 				u64 num_bytes)
3828 {
3829 	return __block_rsv_add(root, block_rsv, num_bytes, 0);
3830 }
3831 
3832 int btrfs_block_rsv_check(struct btrfs_root *root,
3833 			  struct btrfs_block_rsv *block_rsv, int min_factor)
3834 {
3835 	u64 num_bytes = 0;
3836 	int ret = -ENOSPC;
3837 
3838 	if (!block_rsv)
3839 		return 0;
3840 
3841 	spin_lock(&block_rsv->lock);
3842 	num_bytes = div_factor(block_rsv->size, min_factor);
3843 	if (block_rsv->reserved >= num_bytes)
3844 		ret = 0;
3845 	spin_unlock(&block_rsv->lock);
3846 
3847 	return ret;
3848 }
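
/*
 * min_factor is in tenths: div_factor() scales ->size by min_factor/10, so a
 * (hypothetical) caller doing
 *
 *	if (!btrfs_block_rsv_check(root, block_rsv, 5))
 *		... still have at least half the reservation ...
 *
 * only takes the branch while at least half of block_rsv->size is still
 * reserved.
 */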
3849 
3850 int btrfs_block_rsv_refill(struct btrfs_root *root,
3851 			  struct btrfs_block_rsv *block_rsv,
3852 			  u64 min_reserved)
3853 {
3854 	u64 num_bytes = 0;
3855 	int ret = -ENOSPC;
3856 
3857 	if (!block_rsv)
3858 		return 0;
3859 
3860 	spin_lock(&block_rsv->lock);
3861 	num_bytes = min_reserved;
3862 	if (block_rsv->reserved >= num_bytes)
3863 		ret = 0;
3864 	else
3865 		num_bytes -= block_rsv->reserved;
3866 	spin_unlock(&block_rsv->lock);
3867 
3868 	if (!ret)
3869 		return 0;
3870 
3871 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3872 	if (!ret) {
3873 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
3874 		return 0;
3875 	}
3876 
3877 	return ret;
3878 }
3879 
3880 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3881 			    struct btrfs_block_rsv *dst_rsv,
3882 			    u64 num_bytes)
3883 {
3884 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3885 }
3886 
3887 void btrfs_block_rsv_release(struct btrfs_root *root,
3888 			     struct btrfs_block_rsv *block_rsv,
3889 			     u64 num_bytes)
3890 {
3891 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3892 	if (global_rsv->full || global_rsv == block_rsv ||
3893 	    block_rsv->space_info != global_rsv->space_info)
3894 		global_rsv = NULL;
3895 	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3896 }
3897 
3898 /*
3899  * Helper to calculate the size of the global block reservation.
3900  * The desired value is the sum of space used by the extent tree,
3901  * the checksum tree and the root tree.
3902  */
3903 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3904 {
3905 	struct btrfs_space_info *sinfo;
3906 	u64 num_bytes;
3907 	u64 meta_used;
3908 	u64 data_used;
3909 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3910 
3911 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3912 	spin_lock(&sinfo->lock);
3913 	data_used = sinfo->bytes_used;
3914 	spin_unlock(&sinfo->lock);
3915 
3916 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3917 	spin_lock(&sinfo->lock);
3918 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3919 		data_used = 0;
3920 	meta_used = sinfo->bytes_used;
3921 	spin_unlock(&sinfo->lock);
3922 
3923 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3924 		    csum_size * 2;
3925 	num_bytes += div64_u64(data_used + meta_used, 50);
3926 
3927 	if (num_bytes * 3 > meta_used)
3928 		num_bytes = div64_u64(meta_used, 3);
3929 
3930 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3931 }
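
/*
 * Worked example (illustrative, assuming 4K blocks, 4-byte crc32c checksums
 * and a 4K leafsize): with data_used = 1GiB and meta_used = 256MiB, the csum
 * term is (2^30 >> 12) * 4 * 2 = 2MiB and the 2% term is
 * (1GiB + 256MiB) / 50 ~= 25.6MiB, for roughly 27.6MiB total.  That is well
 * under meta_used / 3, so no clamping happens, and the result is rounded up
 * to a leafsize << 10 (here 4MiB) boundary, giving 28MiB.
 */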
3932 
3933 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3934 {
3935 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3936 	struct btrfs_space_info *sinfo = block_rsv->space_info;
3937 	u64 num_bytes;
3938 
3939 	num_bytes = calc_global_metadata_size(fs_info);
3940 
3941 	spin_lock(&block_rsv->lock);
3942 	spin_lock(&sinfo->lock);
3943 
3944 	block_rsv->size = num_bytes;
3945 
3946 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3947 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
3948 		    sinfo->bytes_may_use;
3949 
3950 	if (sinfo->total_bytes > num_bytes) {
3951 		num_bytes = sinfo->total_bytes - num_bytes;
3952 		block_rsv->reserved += num_bytes;
3953 		sinfo->bytes_may_use += num_bytes;
3954 	}
3955 
3956 	if (block_rsv->reserved >= block_rsv->size) {
3957 		num_bytes = block_rsv->reserved - block_rsv->size;
3958 		sinfo->bytes_may_use -= num_bytes;
3959 		sinfo->reservation_progress++;
3960 		block_rsv->reserved = block_rsv->size;
3961 		block_rsv->full = 1;
3962 	}
3963 
3964 	spin_unlock(&sinfo->lock);
3965 	spin_unlock(&block_rsv->lock);
3966 }
3967 
3968 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3969 {
3970 	struct btrfs_space_info *space_info;
3971 
3972 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3973 	fs_info->chunk_block_rsv.space_info = space_info;
3974 
3975 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3976 	fs_info->global_block_rsv.space_info = space_info;
3977 	fs_info->delalloc_block_rsv.space_info = space_info;
3978 	fs_info->trans_block_rsv.space_info = space_info;
3979 	fs_info->empty_block_rsv.space_info = space_info;
3980 	fs_info->delayed_block_rsv.space_info = space_info;
3981 
3982 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3983 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3984 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3985 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3986 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3987 
3988 	update_global_block_rsv(fs_info);
3989 }
3990 
3991 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3992 {
3993 	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3994 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3995 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3996 	WARN_ON(fs_info->trans_block_rsv.size > 0);
3997 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3998 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
3999 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4000 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4001 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4002 }
4003 
4004 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4005 				  struct btrfs_root *root)
4006 {
4007 	if (!trans->bytes_reserved)
4008 		return;
4009 
4010 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4011 	trans->bytes_reserved = 0;
4012 }
4013 
4014 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4015 				  struct inode *inode)
4016 {
4017 	struct btrfs_root *root = BTRFS_I(inode)->root;
4018 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4019 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4020 
4021 	/*
4022 	 * We need to hold space in order to delete our orphan item once we've
4023 	 * added it, so this takes the reservation now so that we can release it
4024 	 * later when we are truly done with the orphan item.
4025 	 */
4026 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4027 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4028 }
4029 
4030 void btrfs_orphan_release_metadata(struct inode *inode)
4031 {
4032 	struct btrfs_root *root = BTRFS_I(inode)->root;
4033 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4034 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4035 }
4036 
4037 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4038 				struct btrfs_pending_snapshot *pending)
4039 {
4040 	struct btrfs_root *root = pending->root;
4041 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4042 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4043 	/*
4044 	 * two for root back/forward refs, two for directory entries
4045 	 * and one for root of the snapshot.
4046 	 */
4047 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4048 	dst_rsv->space_info = src_rsv->space_info;
4049 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4050 }
4051 
4052 /**
4053  * drop_outstanding_extent - drop an outstanding extent
4054  * @inode: the inode we're dropping the extent for
4055  *
4056  * This is called when we are freeing up an outstanding extent, either
4057  * after an error or after an extent is written.  This will return the number of
4058  * reserved extents that need to be freed.  This must be called with
4059  * BTRFS_I(inode)->lock held.
4060  */
4061 static unsigned drop_outstanding_extent(struct inode *inode)
4062 {
4063 	unsigned drop_inode_space = 0;
4064 	unsigned dropped_extents = 0;
4065 
4066 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4067 	BTRFS_I(inode)->outstanding_extents--;
4068 
4069 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4070 	    BTRFS_I(inode)->delalloc_meta_reserved) {
4071 		drop_inode_space = 1;
4072 		BTRFS_I(inode)->delalloc_meta_reserved = 0;
4073 	}
4074 
4075 	/*
4076 	 * If we have at least as many outstanding extents as we have reserved
4077 	 * then we need to leave the reserved extents count alone.
4078 	 */
4079 	if (BTRFS_I(inode)->outstanding_extents >=
4080 	    BTRFS_I(inode)->reserved_extents)
4081 		return drop_inode_space;
4082 
4083 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4084 		BTRFS_I(inode)->outstanding_extents;
4085 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4086 	return dropped_extents + drop_inode_space;
4087 }
4088 
4089 /**
4090  * calc_csum_metadata_size - return the amount of metadata space that must be
4091  *	reserved/freed for the given bytes.
4092  * @inode: the inode we're manipulating
4093  * @num_bytes: the number of bytes in question
4094  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4095  *
4096  * This adjusts the number of csum_bytes in the inode and then returns the
4097  * correct amount of metadata that must either be reserved or freed.  We
4098  * calculate how many checksums we can fit into one leaf and then divide the
4099  * number of bytes that will need to be checksummed by this value to figure out
4100  * how many checksums will be required.  If we are adding bytes then the number
4101  * may go up and we will return the number of additional bytes that must be
4102  * reserved.  If it is going down we will return the number of bytes that must
4103  * be freed.
4104  *
4105  * This must be called with BTRFS_I(inode)->lock held.
4106  */
4107 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4108 				   int reserve)
4109 {
4110 	struct btrfs_root *root = BTRFS_I(inode)->root;
4111 	u64 csum_size;
4112 	int num_csums_per_leaf;
4113 	int num_csums;
4114 	int old_csums;
4115 
4116 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4117 	    BTRFS_I(inode)->csum_bytes == 0)
4118 		return 0;
4119 
4120 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4121 	if (reserve)
4122 		BTRFS_I(inode)->csum_bytes += num_bytes;
4123 	else
4124 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4125 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4126 	num_csums_per_leaf = (int)div64_u64(csum_size,
4127 					    sizeof(struct btrfs_csum_item) +
4128 					    sizeof(struct btrfs_disk_key));
4129 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4130 	num_csums = num_csums + num_csums_per_leaf - 1;
4131 	num_csums = num_csums / num_csums_per_leaf;
4132 
4133 	old_csums = old_csums + num_csums_per_leaf - 1;
4134 	old_csums = old_csums / num_csums_per_leaf;
4135 
4136 	/* No change, no need to reserve more */
4137 	if (old_csums == num_csums)
4138 		return 0;
4139 
4140 	if (reserve)
4141 		return btrfs_calc_trans_metadata_size(root,
4142 						      num_csums - old_csums);
4143 
4144 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4145 }
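
/*
 * For example, with 4K sectors a 1MiB reservation bumps csum_bytes by 1MiB,
 * i.e. by 256 checksums.  Both the old and the new checksum counts are
 * rounded up to whole leaves, and only the difference in leaves is charged
 * (when reserving) or refunded (when freeing) via
 * btrfs_calc_trans_metadata_size(), so reservations that don't cross a leaf
 * boundary cost nothing extra here.
 */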
4146 
4147 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4148 {
4149 	struct btrfs_root *root = BTRFS_I(inode)->root;
4150 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4151 	u64 to_reserve = 0;
4152 	unsigned nr_extents = 0;
4153 	int flush = 1;
4154 	int ret;
4155 
4156 	if (btrfs_is_free_space_inode(root, inode))
4157 		flush = 0;
4158 
4159 	if (flush && btrfs_transaction_in_commit(root->fs_info))
4160 		schedule_timeout(1);
4161 
4162 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4163 
4164 	spin_lock(&BTRFS_I(inode)->lock);
4165 	BTRFS_I(inode)->outstanding_extents++;
4166 
4167 	if (BTRFS_I(inode)->outstanding_extents >
4168 	    BTRFS_I(inode)->reserved_extents) {
4169 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4170 			BTRFS_I(inode)->reserved_extents;
4171 		BTRFS_I(inode)->reserved_extents += nr_extents;
4172 	}
4173 
4174 	/*
4175 	 * Add an item to reserve for updating the inode when we complete the
4176 	 * delalloc io.
4177 	 */
4178 	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4179 		nr_extents++;
4180 		BTRFS_I(inode)->delalloc_meta_reserved = 1;
4181 	}
4182 
4183 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4184 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4185 	spin_unlock(&BTRFS_I(inode)->lock);
4186 
4187 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4188 	if (ret) {
4189 		u64 to_free = 0;
4190 		unsigned dropped;
4191 
4192 		spin_lock(&BTRFS_I(inode)->lock);
4193 		dropped = drop_outstanding_extent(inode);
4194 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4195 		spin_unlock(&BTRFS_I(inode)->lock);
4196 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4197 
4198 		/*
4199 		 * Somebody could have come in and twiddled with the
4200 		 * reservation, so if we have to free more than we would have
4201 		 * reserved from this reservation go ahead and release those
4202 		 * bytes.
4203 		 */
4204 		to_free -= to_reserve;
4205 		if (to_free)
4206 			btrfs_block_rsv_release(root, block_rsv, to_free);
4207 		return ret;
4208 	}
4209 
4210 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4211 
4212 	return 0;
4213 }
4214 
4215 /**
4216  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4217  * @inode: the inode to release the reservation for
4218  * @num_bytes: the number of bytes we're releasing
4219  *
4220  * This will release the metadata reservation for an inode.  This can be called
4221  * once we complete IO for a given set of bytes to release their metadata
4222  * reservations.
4223  */
4224 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4225 {
4226 	struct btrfs_root *root = BTRFS_I(inode)->root;
4227 	u64 to_free = 0;
4228 	unsigned dropped;
4229 
4230 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4231 	spin_lock(&BTRFS_I(inode)->lock);
4232 	dropped = drop_outstanding_extent(inode);
4233 
4234 	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4235 	spin_unlock(&BTRFS_I(inode)->lock);
4236 	if (dropped > 0)
4237 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4238 
4239 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4240 				to_free);
4241 }
4242 
4243 /**
4244  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4245  * @inode: inode we're writing to
4246  * @num_bytes: the number of bytes we want to allocate
4247  *
4248  * This will do the following things
4249  *
4250  * o reserve space in the data space info for num_bytes
4251  * o reserve space in the metadata space info based on number of outstanding
4252  *   extents and how much csums will be needed
4253  * o add to the inode's ->delalloc_bytes
4254  * o add it to the fs_info's delalloc inodes list.
4255  *
4256  * This will return 0 for success and -ENOSPC if there is no space left.
4257  */
4258 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4259 {
4260 	int ret;
4261 
4262 	ret = btrfs_check_data_free_space(inode, num_bytes);
4263 	if (ret)
4264 		return ret;
4265 
4266 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4267 	if (ret) {
4268 		btrfs_free_reserved_data_space(inode, num_bytes);
4269 		return ret;
4270 	}
4271 
4272 	return 0;
4273 }
4274 
4275 /**
4276  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4277  * @inode: inode we're releasing space for
4278  * @num_bytes: the number of bytes we want to free up
4279  *
4280  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4281  * called in the case that we don't need the metadata AND data reservations
4282  * anymore, for example if there is an error or we insert an inline extent.
4283  *
4284  * This function will release the metadata space that was not used and will
4285  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4286  * list if there are no delalloc bytes left.
4287  */
4288 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4289 {
4290 	btrfs_delalloc_release_metadata(inode, num_bytes);
4291 	btrfs_free_reserved_data_space(inode, num_bytes);
4292 }
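
/*
 * Rough write-path pairing of the two helpers above (illustrative sketch;
 * the copy step and its name are hypothetical):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = copy_data_into_page_cache();
 *	if (ret) {
 *		btrfs_delalloc_release_space(inode, num_bytes);
 *		return ret;
 *	}
 *
 * On the success path the metadata half is eventually dropped via
 * btrfs_delalloc_release_metadata() once the delalloc IO completes, and the
 * data half is consumed by the extents that get allocated.
 */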
4293 
4294 static int update_block_group(struct btrfs_trans_handle *trans,
4295 			      struct btrfs_root *root,
4296 			      u64 bytenr, u64 num_bytes, int alloc)
4297 {
4298 	struct btrfs_block_group_cache *cache = NULL;
4299 	struct btrfs_fs_info *info = root->fs_info;
4300 	u64 total = num_bytes;
4301 	u64 old_val;
4302 	u64 byte_in_group;
4303 	int factor;
4304 
4305 	/* block accounting for super block */
4306 	spin_lock(&info->delalloc_lock);
4307 	old_val = btrfs_super_bytes_used(info->super_copy);
4308 	if (alloc)
4309 		old_val += num_bytes;
4310 	else
4311 		old_val -= num_bytes;
4312 	btrfs_set_super_bytes_used(info->super_copy, old_val);
4313 	spin_unlock(&info->delalloc_lock);
4314 
4315 	while (total) {
4316 		cache = btrfs_lookup_block_group(info, bytenr);
4317 		if (!cache)
4318 			return -1;
4319 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4320 				    BTRFS_BLOCK_GROUP_RAID1 |
4321 				    BTRFS_BLOCK_GROUP_RAID10))
4322 			factor = 2;
4323 		else
4324 			factor = 1;
4325 		/*
4326 		 * If this block group has free space cache written out, we
4327 		 * need to make sure to load it if we are removing space.  This
4328 		 * is because we need the unpinning stage to actually add the
4329 		 * space back to the block group, otherwise we will leak space.
4330 		 */
4331 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
4332 			cache_block_group(cache, trans, NULL, 1);
4333 
4334 		byte_in_group = bytenr - cache->key.objectid;
4335 		WARN_ON(byte_in_group > cache->key.offset);
4336 
4337 		spin_lock(&cache->space_info->lock);
4338 		spin_lock(&cache->lock);
4339 
4340 		if (btrfs_test_opt(root, SPACE_CACHE) &&
4341 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
4342 			cache->disk_cache_state = BTRFS_DC_CLEAR;
4343 
4344 		cache->dirty = 1;
4345 		old_val = btrfs_block_group_used(&cache->item);
4346 		num_bytes = min(total, cache->key.offset - byte_in_group);
4347 		if (alloc) {
4348 			old_val += num_bytes;
4349 			btrfs_set_block_group_used(&cache->item, old_val);
4350 			cache->reserved -= num_bytes;
4351 			cache->space_info->bytes_reserved -= num_bytes;
4352 			cache->space_info->bytes_used += num_bytes;
4353 			cache->space_info->disk_used += num_bytes * factor;
4354 			spin_unlock(&cache->lock);
4355 			spin_unlock(&cache->space_info->lock);
4356 		} else {
4357 			old_val -= num_bytes;
4358 			btrfs_set_block_group_used(&cache->item, old_val);
4359 			cache->pinned += num_bytes;
4360 			cache->space_info->bytes_pinned += num_bytes;
4361 			cache->space_info->bytes_used -= num_bytes;
4362 			cache->space_info->disk_used -= num_bytes * factor;
4363 			spin_unlock(&cache->lock);
4364 			spin_unlock(&cache->space_info->lock);
4365 
4366 			set_extent_dirty(info->pinned_extents,
4367 					 bytenr, bytenr + num_bytes - 1,
4368 					 GFP_NOFS | __GFP_NOFAIL);
4369 		}
4370 		btrfs_put_block_group(cache);
4371 		total -= num_bytes;
4372 		bytenr += num_bytes;
4373 	}
4374 	return 0;
4375 }
4376 
4377 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4378 {
4379 	struct btrfs_block_group_cache *cache;
4380 	u64 bytenr;
4381 
4382 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4383 	if (!cache)
4384 		return 0;
4385 
4386 	bytenr = cache->key.objectid;
4387 	btrfs_put_block_group(cache);
4388 
4389 	return bytenr;
4390 }
4391 
4392 static int pin_down_extent(struct btrfs_root *root,
4393 			   struct btrfs_block_group_cache *cache,
4394 			   u64 bytenr, u64 num_bytes, int reserved)
4395 {
4396 	spin_lock(&cache->space_info->lock);
4397 	spin_lock(&cache->lock);
4398 	cache->pinned += num_bytes;
4399 	cache->space_info->bytes_pinned += num_bytes;
4400 	if (reserved) {
4401 		cache->reserved -= num_bytes;
4402 		cache->space_info->bytes_reserved -= num_bytes;
4403 	}
4404 	spin_unlock(&cache->lock);
4405 	spin_unlock(&cache->space_info->lock);
4406 
4407 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4408 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4409 	return 0;
4410 }
4411 
4412 /*
4413  * this function must be called within a transaction
4414  */
4415 int btrfs_pin_extent(struct btrfs_root *root,
4416 		     u64 bytenr, u64 num_bytes, int reserved)
4417 {
4418 	struct btrfs_block_group_cache *cache;
4419 
4420 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4421 	BUG_ON(!cache);
4422 
4423 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4424 
4425 	btrfs_put_block_group(cache);
4426 	return 0;
4427 }
4428 
4429 /*
4430  * this function must be called within a transaction
4431  */
4432 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4433 				    struct btrfs_root *root,
4434 				    u64 bytenr, u64 num_bytes)
4435 {
4436 	struct btrfs_block_group_cache *cache;
4437 
4438 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4439 	BUG_ON(!cache);
4440 
4441 	/*
4442 	 * pull in the free space cache (if any) so that our pin
4443 	 * removes the free space from the cache.  We have load_only set
4444 	 * to one because the slow code to read in the free extents does check
4445 	 * the pinned extents.
4446 	 */
4447 	cache_block_group(cache, trans, root, 1);
4448 
4449 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
4450 
4451 	/* remove us from the free space cache (if we're there at all) */
4452 	btrfs_remove_free_space(cache, bytenr, num_bytes);
4453 	btrfs_put_block_group(cache);
4454 	return 0;
4455 }
4456 
4457 /**
4458  * btrfs_update_reserved_bytes - update the block_group and space info counters
4459  * @cache:	The cache we are manipulating
4460  * @num_bytes:	The number of bytes in question
4461  * @reserve:	One of the reservation enums
4462  *
4463  * This is called by the allocator when it reserves space, or by somebody who is
4464  * freeing space that was never actually used on disk.  For example if you
4465  * reserve some space for a new leaf in transaction A and before transaction A
4466  * commits you free that leaf, you call this with reserve set to 0 in order to
4467  * clear the reservation.
4468  *
4469  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4470  * ENOSPC accounting.  For data we handle the reservation through clearing the
4471  * delalloc bits in the io_tree.  We have to do this since we could end up
4472  * allocating less disk space for the amount of data we have reserved in the
4473  * case of compression.
4474  *
4475  * If this is a reservation and the block group has become read only we cannot
4476  * make the reservation and return -EAGAIN, otherwise this function always
4477  * succeeds.
4478  */
4479 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4480 				       u64 num_bytes, int reserve)
4481 {
4482 	struct btrfs_space_info *space_info = cache->space_info;
4483 	int ret = 0;
4484 	spin_lock(&space_info->lock);
4485 	spin_lock(&cache->lock);
4486 	if (reserve != RESERVE_FREE) {
4487 		if (cache->ro) {
4488 			ret = -EAGAIN;
4489 		} else {
4490 			cache->reserved += num_bytes;
4491 			space_info->bytes_reserved += num_bytes;
4492 			if (reserve == RESERVE_ALLOC) {
4493 				BUG_ON(space_info->bytes_may_use < num_bytes);
4494 				space_info->bytes_may_use -= num_bytes;
4495 			}
4496 		}
4497 	} else {
4498 		if (cache->ro)
4499 			space_info->bytes_readonly += num_bytes;
4500 		cache->reserved -= num_bytes;
4501 		space_info->bytes_reserved -= num_bytes;
4502 		space_info->reservation_progress++;
4503 	}
4504 	spin_unlock(&cache->lock);
4505 	spin_unlock(&space_info->lock);
4506 	return ret;
4507 }
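
/*
 * For example, find_free_extent() below calls this with RESERVE_ALLOC for
 * metadata (or RESERVE_ALLOC_NO_ACCOUNT for data) once it has settled on a
 * free range, while btrfs_free_tree_block() calls it with RESERVE_FREE when
 * a buffer allocated in the current transaction is thrown away without ever
 * having been written.
 */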
4508 
4509 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4510 				struct btrfs_root *root)
4511 {
4512 	struct btrfs_fs_info *fs_info = root->fs_info;
4513 	struct btrfs_caching_control *next;
4514 	struct btrfs_caching_control *caching_ctl;
4515 	struct btrfs_block_group_cache *cache;
4516 
4517 	down_write(&fs_info->extent_commit_sem);
4518 
4519 	list_for_each_entry_safe(caching_ctl, next,
4520 				 &fs_info->caching_block_groups, list) {
4521 		cache = caching_ctl->block_group;
4522 		if (block_group_cache_done(cache)) {
4523 			cache->last_byte_to_unpin = (u64)-1;
4524 			list_del_init(&caching_ctl->list);
4525 			put_caching_control(caching_ctl);
4526 		} else {
4527 			cache->last_byte_to_unpin = caching_ctl->progress;
4528 		}
4529 	}
4530 
4531 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4532 		fs_info->pinned_extents = &fs_info->freed_extents[1];
4533 	else
4534 		fs_info->pinned_extents = &fs_info->freed_extents[0];
4535 
4536 	up_write(&fs_info->extent_commit_sem);
4537 
4538 	update_global_block_rsv(fs_info);
4539 	return 0;
4540 }
4541 
4542 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4543 {
4544 	struct btrfs_fs_info *fs_info = root->fs_info;
4545 	struct btrfs_block_group_cache *cache = NULL;
4546 	u64 len;
4547 
4548 	while (start <= end) {
4549 		if (!cache ||
4550 		    start >= cache->key.objectid + cache->key.offset) {
4551 			if (cache)
4552 				btrfs_put_block_group(cache);
4553 			cache = btrfs_lookup_block_group(fs_info, start);
4554 			BUG_ON(!cache);
4555 		}
4556 
4557 		len = cache->key.objectid + cache->key.offset - start;
4558 		len = min(len, end + 1 - start);
4559 
4560 		if (start < cache->last_byte_to_unpin) {
4561 			len = min(len, cache->last_byte_to_unpin - start);
4562 			btrfs_add_free_space(cache, start, len);
4563 		}
4564 
4565 		start += len;
4566 
4567 		spin_lock(&cache->space_info->lock);
4568 		spin_lock(&cache->lock);
4569 		cache->pinned -= len;
4570 		cache->space_info->bytes_pinned -= len;
4571 		if (cache->ro)
4572 			cache->space_info->bytes_readonly += len;
4573 		spin_unlock(&cache->lock);
4574 		spin_unlock(&cache->space_info->lock);
4575 	}
4576 
4577 	if (cache)
4578 		btrfs_put_block_group(cache);
4579 	return 0;
4580 }
4581 
4582 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4583 			       struct btrfs_root *root)
4584 {
4585 	struct btrfs_fs_info *fs_info = root->fs_info;
4586 	struct extent_io_tree *unpin;
4587 	u64 start;
4588 	u64 end;
4589 	int ret;
4590 
4591 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4592 		unpin = &fs_info->freed_extents[1];
4593 	else
4594 		unpin = &fs_info->freed_extents[0];
4595 
4596 	while (1) {
4597 		ret = find_first_extent_bit(unpin, 0, &start, &end,
4598 					    EXTENT_DIRTY);
4599 		if (ret)
4600 			break;
4601 
4602 		if (btrfs_test_opt(root, DISCARD))
4603 			ret = btrfs_discard_extent(root, start,
4604 						   end + 1 - start, NULL);
4605 
4606 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
4607 		unpin_extent_range(root, start, end);
4608 		cond_resched();
4609 	}
4610 
4611 	return 0;
4612 }
4613 
4614 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4615 				struct btrfs_root *root,
4616 				u64 bytenr, u64 num_bytes, u64 parent,
4617 				u64 root_objectid, u64 owner_objectid,
4618 				u64 owner_offset, int refs_to_drop,
4619 				struct btrfs_delayed_extent_op *extent_op)
4620 {
4621 	struct btrfs_key key;
4622 	struct btrfs_path *path;
4623 	struct btrfs_fs_info *info = root->fs_info;
4624 	struct btrfs_root *extent_root = info->extent_root;
4625 	struct extent_buffer *leaf;
4626 	struct btrfs_extent_item *ei;
4627 	struct btrfs_extent_inline_ref *iref;
4628 	int ret;
4629 	int is_data;
4630 	int extent_slot = 0;
4631 	int found_extent = 0;
4632 	int num_to_del = 1;
4633 	u32 item_size;
4634 	u64 refs;
4635 
4636 	path = btrfs_alloc_path();
4637 	if (!path)
4638 		return -ENOMEM;
4639 
4640 	path->reada = 1;
4641 	path->leave_spinning = 1;
4642 
4643 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4644 	BUG_ON(!is_data && refs_to_drop != 1);
4645 
4646 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
4647 				    bytenr, num_bytes, parent,
4648 				    root_objectid, owner_objectid,
4649 				    owner_offset);
4650 	if (ret == 0) {
4651 		extent_slot = path->slots[0];
4652 		while (extent_slot >= 0) {
4653 			btrfs_item_key_to_cpu(path->nodes[0], &key,
4654 					      extent_slot);
4655 			if (key.objectid != bytenr)
4656 				break;
4657 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4658 			    key.offset == num_bytes) {
4659 				found_extent = 1;
4660 				break;
4661 			}
4662 			if (path->slots[0] - extent_slot > 5)
4663 				break;
4664 			extent_slot--;
4665 		}
4666 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4667 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4668 		if (found_extent && item_size < sizeof(*ei))
4669 			found_extent = 0;
4670 #endif
4671 		if (!found_extent) {
4672 			BUG_ON(iref);
4673 			ret = remove_extent_backref(trans, extent_root, path,
4674 						    NULL, refs_to_drop,
4675 						    is_data);
4676 			BUG_ON(ret);
4677 			btrfs_release_path(path);
4678 			path->leave_spinning = 1;
4679 
4680 			key.objectid = bytenr;
4681 			key.type = BTRFS_EXTENT_ITEM_KEY;
4682 			key.offset = num_bytes;
4683 
4684 			ret = btrfs_search_slot(trans, extent_root,
4685 						&key, path, -1, 1);
4686 			if (ret) {
4687 				printk(KERN_ERR "umm, got %d back from search"
4688 				       ", was looking for %llu\n", ret,
4689 				       (unsigned long long)bytenr);
4690 				if (ret > 0)
4691 					btrfs_print_leaf(extent_root,
4692 							 path->nodes[0]);
4693 			}
4694 			BUG_ON(ret);
4695 			extent_slot = path->slots[0];
4696 		}
4697 	} else {
4698 		btrfs_print_leaf(extent_root, path->nodes[0]);
4699 		WARN_ON(1);
4700 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4701 		       "parent %llu root %llu  owner %llu offset %llu\n",
4702 		       (unsigned long long)bytenr,
4703 		       (unsigned long long)parent,
4704 		       (unsigned long long)root_objectid,
4705 		       (unsigned long long)owner_objectid,
4706 		       (unsigned long long)owner_offset);
4707 	}
4708 
4709 	leaf = path->nodes[0];
4710 	item_size = btrfs_item_size_nr(leaf, extent_slot);
4711 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4712 	if (item_size < sizeof(*ei)) {
4713 		BUG_ON(found_extent || extent_slot != path->slots[0]);
4714 		ret = convert_extent_item_v0(trans, extent_root, path,
4715 					     owner_objectid, 0);
4716 		BUG_ON(ret < 0);
4717 
4718 		btrfs_release_path(path);
4719 		path->leave_spinning = 1;
4720 
4721 		key.objectid = bytenr;
4722 		key.type = BTRFS_EXTENT_ITEM_KEY;
4723 		key.offset = num_bytes;
4724 
4725 		ret = btrfs_search_slot(trans, extent_root, &key, path,
4726 					-1, 1);
4727 		if (ret) {
4728 			printk(KERN_ERR "umm, got %d back from search"
4729 			       ", was looking for %llu\n", ret,
4730 			       (unsigned long long)bytenr);
4731 			btrfs_print_leaf(extent_root, path->nodes[0]);
4732 		}
4733 		BUG_ON(ret);
4734 		extent_slot = path->slots[0];
4735 		leaf = path->nodes[0];
4736 		item_size = btrfs_item_size_nr(leaf, extent_slot);
4737 	}
4738 #endif
4739 	BUG_ON(item_size < sizeof(*ei));
4740 	ei = btrfs_item_ptr(leaf, extent_slot,
4741 			    struct btrfs_extent_item);
4742 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4743 		struct btrfs_tree_block_info *bi;
4744 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4745 		bi = (struct btrfs_tree_block_info *)(ei + 1);
4746 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4747 	}
4748 
4749 	refs = btrfs_extent_refs(leaf, ei);
4750 	BUG_ON(refs < refs_to_drop);
4751 	refs -= refs_to_drop;
4752 
4753 	if (refs > 0) {
4754 		if (extent_op)
4755 			__run_delayed_extent_op(extent_op, leaf, ei);
4756 		/*
4757 		 * In the case of inline back ref, reference count will
4758 		 * be updated by remove_extent_backref
4759 		 */
4760 		if (iref) {
4761 			BUG_ON(!found_extent);
4762 		} else {
4763 			btrfs_set_extent_refs(leaf, ei, refs);
4764 			btrfs_mark_buffer_dirty(leaf);
4765 		}
4766 		if (found_extent) {
4767 			ret = remove_extent_backref(trans, extent_root, path,
4768 						    iref, refs_to_drop,
4769 						    is_data);
4770 			BUG_ON(ret);
4771 		}
4772 	} else {
4773 		if (found_extent) {
4774 			BUG_ON(is_data && refs_to_drop !=
4775 			       extent_data_ref_count(root, path, iref));
4776 			if (iref) {
4777 				BUG_ON(path->slots[0] != extent_slot);
4778 			} else {
4779 				BUG_ON(path->slots[0] != extent_slot + 1);
4780 				path->slots[0] = extent_slot;
4781 				num_to_del = 2;
4782 			}
4783 		}
4784 
4785 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4786 				      num_to_del);
4787 		BUG_ON(ret);
4788 		btrfs_release_path(path);
4789 
4790 		if (is_data) {
4791 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4792 			BUG_ON(ret);
4793 		} else {
4794 			invalidate_mapping_pages(info->btree_inode->i_mapping,
4795 			     bytenr >> PAGE_CACHE_SHIFT,
4796 			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4797 		}
4798 
4799 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4800 		BUG_ON(ret);
4801 	}
4802 	btrfs_free_path(path);
4803 	return ret;
4804 }
4805 
4806 /*
4807  * when we free a block, it is possible (and likely) that we free the last
4808  * delayed ref for that extent as well.  This searches the delayed ref tree for
4809  * a given extent, and if there are no other delayed refs to be processed, it
4810  * removes it from the tree.
4811  */
4812 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4813 				      struct btrfs_root *root, u64 bytenr)
4814 {
4815 	struct btrfs_delayed_ref_head *head;
4816 	struct btrfs_delayed_ref_root *delayed_refs;
4817 	struct btrfs_delayed_ref_node *ref;
4818 	struct rb_node *node;
4819 	int ret = 0;
4820 
4821 	delayed_refs = &trans->transaction->delayed_refs;
4822 	spin_lock(&delayed_refs->lock);
4823 	head = btrfs_find_delayed_ref_head(trans, bytenr);
4824 	if (!head)
4825 		goto out;
4826 
4827 	node = rb_prev(&head->node.rb_node);
4828 	if (!node)
4829 		goto out;
4830 
4831 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4832 
4833 	/* there are still entries for this ref, we can't drop it */
4834 	if (ref->bytenr == bytenr)
4835 		goto out;
4836 
4837 	if (head->extent_op) {
4838 		if (!head->must_insert_reserved)
4839 			goto out;
4840 		kfree(head->extent_op);
4841 		head->extent_op = NULL;
4842 	}
4843 
4844 	/*
4845 	 * waiting for the lock here would deadlock.  If someone else has it
4846 	 * locked they are already in the process of dropping it anyway
4847 	 */
4848 	if (!mutex_trylock(&head->mutex))
4849 		goto out;
4850 
4851 	/*
4852 	 * at this point we have a head with no other entries.  Go
4853 	 * ahead and process it.
4854 	 */
4855 	head->node.in_tree = 0;
4856 	rb_erase(&head->node.rb_node, &delayed_refs->root);
4857 
4858 	delayed_refs->num_entries--;
4859 
4860 	/*
4861 	 * we don't take a ref on the node because we're removing it from the
4862 	 * tree, so we just steal the ref the tree was holding.
4863 	 */
4864 	delayed_refs->num_heads--;
4865 	if (list_empty(&head->cluster))
4866 		delayed_refs->num_heads_ready--;
4867 
4868 	list_del_init(&head->cluster);
4869 	spin_unlock(&delayed_refs->lock);
4870 
4871 	BUG_ON(head->extent_op);
4872 	if (head->must_insert_reserved)
4873 		ret = 1;
4874 
4875 	mutex_unlock(&head->mutex);
4876 	btrfs_put_delayed_ref(&head->node);
4877 	return ret;
4878 out:
4879 	spin_unlock(&delayed_refs->lock);
4880 	return 0;
4881 }
4882 
4883 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4884 			   struct btrfs_root *root,
4885 			   struct extent_buffer *buf,
4886 			   u64 parent, int last_ref)
4887 {
4888 	struct btrfs_block_group_cache *cache = NULL;
4889 	int ret;
4890 
4891 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4892 		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4893 						parent, root->root_key.objectid,
4894 						btrfs_header_level(buf),
4895 						BTRFS_DROP_DELAYED_REF, NULL);
4896 		BUG_ON(ret);
4897 	}
4898 
4899 	if (!last_ref)
4900 		return;
4901 
4902 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4903 
4904 	if (btrfs_header_generation(buf) == trans->transid) {
4905 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4906 			ret = check_ref_cleanup(trans, root, buf->start);
4907 			if (!ret)
4908 				goto out;
4909 		}
4910 
4911 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4912 			pin_down_extent(root, cache, buf->start, buf->len, 1);
4913 			goto out;
4914 		}
4915 
4916 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4917 
4918 		btrfs_add_free_space(cache, buf->start, buf->len);
4919 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4920 	}
4921 out:
4922 	/*
4923 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
4924 	 * anymore.
4925 	 */
4926 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4927 	btrfs_put_block_group(cache);
4928 }
4929 
4930 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4931 		      struct btrfs_root *root,
4932 		      u64 bytenr, u64 num_bytes, u64 parent,
4933 		      u64 root_objectid, u64 owner, u64 offset)
4934 {
4935 	int ret;
4936 
4937 	/*
4938 	 * tree log blocks never actually go into the extent allocation
4939 	 * tree, just update pinning info and exit early.
4940 	 */
4941 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4942 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4943 		/* unlocks the pinned mutex */
4944 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
4945 		ret = 0;
4946 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4947 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4948 					parent, root_objectid, (int)owner,
4949 					BTRFS_DROP_DELAYED_REF, NULL);
4950 		BUG_ON(ret);
4951 	} else {
4952 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4953 					parent, root_objectid, owner,
4954 					offset, BTRFS_DROP_DELAYED_REF, NULL);
4955 		BUG_ON(ret);
4956 	}
4957 	return ret;
4958 }
4959 
4960 static u64 stripe_align(struct btrfs_root *root, u64 val)
4961 {
4962 	u64 mask = ((u64)root->stripesize - 1);
4963 	u64 ret = (val + mask) & ~mask;
4964 	return ret;
4965 }
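
/*
 * For example, with a 4KiB stripesize, stripe_align(root, 100000) returns
 * 102400: mask is 0xfff, and (100000 + 4095) & ~0xfff == 102400.
 */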
4966 
4967 /*
4968  * when we wait for progress in the block group caching, it's because
4969  * our allocation attempt failed at least once.  So, we must sleep
4970  * and let some progress happen before we try again.
4971  *
4972  * This function will sleep at least once waiting for new free space to
4973  * show up, and then it will check the block group free space numbers
4974  * for our min num_bytes.  Another option is to have it go ahead
4975  * and look in the rbtree for a free extent of a given size, but this
4976  * is a good start.
4977  */
4978 static noinline int
4979 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4980 				u64 num_bytes)
4981 {
4982 	struct btrfs_caching_control *caching_ctl;
4983 	DEFINE_WAIT(wait);
4984 
4985 	caching_ctl = get_caching_control(cache);
4986 	if (!caching_ctl)
4987 		return 0;
4988 
4989 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4990 		   (cache->free_space_ctl->free_space >= num_bytes));
4991 
4992 	put_caching_control(caching_ctl);
4993 	return 0;
4994 }
4995 
4996 static noinline int
4997 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4998 {
4999 	struct btrfs_caching_control *caching_ctl;
5000 	DEFINE_WAIT(wait);
5001 
5002 	caching_ctl = get_caching_control(cache);
5003 	if (!caching_ctl)
5004 		return 0;
5005 
5006 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
5007 
5008 	put_caching_control(caching_ctl);
5009 	return 0;
5010 }
5011 
5012 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5013 {
5014 	int index;
5015 	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
5016 		index = 0;
5017 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
5018 		index = 1;
5019 	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
5020 		index = 2;
5021 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
5022 		index = 3;
5023 	else
5024 		index = 4;
5025 	return index;
5026 }
5027 
5028 enum btrfs_loop_type {
5029 	LOOP_FIND_IDEAL = 0,
5030 	LOOP_CACHING_NOWAIT = 1,
5031 	LOOP_CACHING_WAIT = 2,
5032 	LOOP_ALLOC_CHUNK = 3,
5033 	LOOP_NO_EMPTY_SIZE = 4,
5034 };
5035 
5036 /*
5037  * walks the btree of allocated extents and finds a hole of a given size.
5038  * The key ins is changed to record the hole:
5039  * ins->objectid == block start
5040  * ins->type == BTRFS_EXTENT_ITEM_KEY
5041  * ins->offset == number of bytes
5042  * Any available blocks before search_start are skipped.
5043  */
5044 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5045 				     struct btrfs_root *orig_root,
5046 				     u64 num_bytes, u64 empty_size,
5047 				     u64 search_start, u64 search_end,
5048 				     u64 hint_byte, struct btrfs_key *ins,
5049 				     u64 data)
5050 {
5051 	int ret = 0;
5052 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5053 	struct btrfs_free_cluster *last_ptr = NULL;
5054 	struct btrfs_block_group_cache *block_group = NULL;
5055 	int empty_cluster = 2 * 1024 * 1024;
5056 	int allowed_chunk_alloc = 0;
5057 	int done_chunk_alloc = 0;
5058 	struct btrfs_space_info *space_info;
5059 	int last_ptr_loop = 0;
5060 	int loop = 0;
5061 	int index = 0;
5062 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5063 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5064 	bool found_uncached_bg = false;
5065 	bool failed_cluster_refill = false;
5066 	bool failed_alloc = false;
5067 	bool use_cluster = true;
5068 	bool have_caching_bg = false;
5069 	u64 ideal_cache_percent = 0;
5070 	u64 ideal_cache_offset = 0;
5071 
5072 	WARN_ON(num_bytes < root->sectorsize);
5073 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5074 	ins->objectid = 0;
5075 	ins->offset = 0;
5076 
5077 	space_info = __find_space_info(root->fs_info, data);
5078 	if (!space_info) {
5079 		printk(KERN_ERR "No space info for %llu\n", data);
5080 		return -ENOSPC;
5081 	}
5082 
5083 	/*
5084 	 * If the space info is for both data and metadata it means we have a
5085 	 * small filesystem and we can't use the clustering stuff.
5086 	 */
5087 	if (btrfs_mixed_space_info(space_info))
5088 		use_cluster = false;
5089 
5090 	if (orig_root->ref_cows || empty_size)
5091 		allowed_chunk_alloc = 1;
5092 
5093 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5094 		last_ptr = &root->fs_info->meta_alloc_cluster;
5095 		if (!btrfs_test_opt(root, SSD))
5096 			empty_cluster = 64 * 1024;
5097 	}
5098 
5099 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5100 	    btrfs_test_opt(root, SSD)) {
5101 		last_ptr = &root->fs_info->data_alloc_cluster;
5102 	}
5103 
5104 	if (last_ptr) {
5105 		spin_lock(&last_ptr->lock);
5106 		if (last_ptr->block_group)
5107 			hint_byte = last_ptr->window_start;
5108 		spin_unlock(&last_ptr->lock);
5109 	}
5110 
5111 	search_start = max(search_start, first_logical_byte(root, 0));
5112 	search_start = max(search_start, hint_byte);
5113 
5114 	if (!last_ptr)
5115 		empty_cluster = 0;
5116 
5117 	if (search_start == hint_byte) {
5118 ideal_cache:
5119 		block_group = btrfs_lookup_block_group(root->fs_info,
5120 						       search_start);
5121 		/*
5122 		 * we don't want to use the block group if it doesn't match our
5123 		 * allocation bits, or if it's not cached.
5124 		 *
5125 		 * However if we are re-searching with an ideal block group
5126 		 * picked out then we don't care that the block group is cached.
5127 		 */
5128 		if (block_group && block_group_bits(block_group, data) &&
5129 		    (block_group->cached != BTRFS_CACHE_NO ||
5130 		     search_start == ideal_cache_offset)) {
5131 			down_read(&space_info->groups_sem);
5132 			if (list_empty(&block_group->list) ||
5133 			    block_group->ro) {
5134 				/*
5135 				 * someone is removing this block group,
5136 				 * we can't jump into the have_block_group
5137 				 * target because our list pointers are not
5138 				 * valid
5139 				 */
5140 				btrfs_put_block_group(block_group);
5141 				up_read(&space_info->groups_sem);
5142 			} else {
5143 				index = get_block_group_index(block_group);
5144 				goto have_block_group;
5145 			}
5146 		} else if (block_group) {
5147 			btrfs_put_block_group(block_group);
5148 		}
5149 	}
5150 search:
5151 	have_caching_bg = false;
5152 	down_read(&space_info->groups_sem);
5153 	list_for_each_entry(block_group, &space_info->block_groups[index],
5154 			    list) {
5155 		u64 offset;
5156 		int cached;
5157 
5158 		btrfs_get_block_group(block_group);
5159 		search_start = block_group->key.objectid;
5160 
5161 		/*
5162 		 * this can happen if we end up cycling through all the
5163 		 * raid types, but we want to make sure we only allocate
5164 		 * for the proper type.
5165 		 */
5166 		if (!block_group_bits(block_group, data)) {
5167 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
5168 				    BTRFS_BLOCK_GROUP_RAID1 |
5169 				    BTRFS_BLOCK_GROUP_RAID10;
5170 
5171 			/*
5172 			 * if they asked for extra copies and this block group
5173 			 * doesn't provide them, bail.  This does allow us to
5174 			 * fill raid0 from raid1.
5175 			 */
5176 			if ((data & extra) && !(block_group->flags & extra))
5177 				goto loop;
5178 		}
5179 
5180 have_block_group:
5181 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
5182 			u64 free_percent;
5183 
5184 			ret = cache_block_group(block_group, trans,
5185 						orig_root, 1);
5186 			if (block_group->cached == BTRFS_CACHE_FINISHED)
5187 				goto have_block_group;
5188 
5189 			free_percent = btrfs_block_group_used(&block_group->item);
5190 			free_percent *= 100;
5191 			free_percent = div64_u64(free_percent,
5192 						 block_group->key.offset);
5193 			free_percent = 100 - free_percent;
5194 			if (free_percent > ideal_cache_percent &&
5195 			    likely(!block_group->ro)) {
5196 				ideal_cache_offset = block_group->key.objectid;
5197 				ideal_cache_percent = free_percent;
5198 			}
5199 
5200 			/*
5201 			 * The caching workers are limited to 2 threads, so we
5202 			 * can queue as much work as we care to.
5203 			 */
5204 			if (loop > LOOP_FIND_IDEAL) {
5205 				ret = cache_block_group(block_group, trans,
5206 							orig_root, 0);
5207 				BUG_ON(ret);
5208 			}
5209 			found_uncached_bg = true;
5210 
5211 			/*
5212 			 * If loop is set for cached only, try the next block
5213 			 * group.
5214 			 */
5215 			if (loop == LOOP_FIND_IDEAL)
5216 				goto loop;
5217 		}
5218 
5219 		cached = block_group_cache_done(block_group);
5220 		if (unlikely(!cached))
5221 			found_uncached_bg = true;
5222 
5223 		if (unlikely(block_group->ro))
5224 			goto loop;
5225 
5226 		spin_lock(&block_group->free_space_ctl->tree_lock);
5227 		if (cached &&
5228 		    block_group->free_space_ctl->free_space <
5229 		    num_bytes + empty_size) {
5230 			spin_unlock(&block_group->free_space_ctl->tree_lock);
5231 			goto loop;
5232 		}
5233 		spin_unlock(&block_group->free_space_ctl->tree_lock);
5234 
5235 		/*
5236 		 * Ok, we want to try and use the cluster allocator, so let's
5237 		 * look there, unless we are on LOOP_NO_EMPTY_SIZE.  By that
5238 		 * point we will have tried the cluster allocator plenty of
5239 		 * times and not found anything, so we are likely way too
5240 		 * fragmented for the clustering stuff to find anything; just
5241 		 * skip it and let the allocator find whatever block it can
5242 		 * find.
5243 		 */
5244 		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
5245 			/*
5246 			 * the refill lock keeps out other
5247 			 * people trying to start a new cluster
5248 			 */
5249 			spin_lock(&last_ptr->refill_lock);
5250 			if (last_ptr->block_group &&
5251 			    (last_ptr->block_group->ro ||
5252 			    !block_group_bits(last_ptr->block_group, data))) {
5253 				offset = 0;
5254 				goto refill_cluster;
5255 			}
5256 
5257 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
5258 						 num_bytes, search_start);
5259 			if (offset) {
5260 				/* we have a block, we're done */
5261 				spin_unlock(&last_ptr->refill_lock);
5262 				goto checks;
5263 			}
5264 
5265 			spin_lock(&last_ptr->lock);
5266 			/*
5267 			 * whoops, this cluster doesn't actually point to
5268 			 * this block group.  Get a ref on the block
5269 			 * group it does point to and try again
5270 			 */
5271 			if (!last_ptr_loop && last_ptr->block_group &&
5272 			    last_ptr->block_group != block_group &&
5273 			    index <=
5274 				 get_block_group_index(last_ptr->block_group)) {
5275 
5276 				btrfs_put_block_group(block_group);
5277 				block_group = last_ptr->block_group;
5278 				btrfs_get_block_group(block_group);
5279 				spin_unlock(&last_ptr->lock);
5280 				spin_unlock(&last_ptr->refill_lock);
5281 
5282 				last_ptr_loop = 1;
5283 				search_start = block_group->key.objectid;
5284 				/*
5285 				 * we know this block group is properly
5286 				 * in the list because
5287 				 * btrfs_remove_block_group drops the
5288 				 * cluster before it removes the block
5289 				 * group from the list
5290 				 */
5291 				goto have_block_group;
5292 			}
5293 			spin_unlock(&last_ptr->lock);
5294 refill_cluster:
5295 			/*
5296 			 * this cluster didn't work out, free it and
5297 			 * start over
5298 			 */
5299 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5300 
5301 			last_ptr_loop = 0;
5302 
5303 			/* allocate a cluster in this block group */
5304 			ret = btrfs_find_space_cluster(trans, root,
5305 					       block_group, last_ptr,
5306 					       offset, num_bytes,
5307 					       empty_cluster + empty_size);
5308 			if (ret == 0) {
5309 				/*
5310 				 * now pull our allocation out of this
5311 				 * cluster
5312 				 */
5313 				offset = btrfs_alloc_from_cluster(block_group,
5314 						  last_ptr, num_bytes,
5315 						  search_start);
5316 				if (offset) {
5317 					/* we found one, proceed */
5318 					spin_unlock(&last_ptr->refill_lock);
5319 					goto checks;
5320 				}
5321 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
5322 				   && !failed_cluster_refill) {
5323 				spin_unlock(&last_ptr->refill_lock);
5324 
5325 				failed_cluster_refill = true;
5326 				wait_block_group_cache_progress(block_group,
5327 				       num_bytes + empty_cluster + empty_size);
5328 				goto have_block_group;
5329 			}
5330 
5331 			/*
5332 			 * at this point we either didn't find a cluster
5333 			 * or we weren't able to allocate a block from our
5334 			 * cluster.  Free the cluster we've been trying
5335 			 * to use, and go to the next block group
5336 			 */
5337 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5338 			spin_unlock(&last_ptr->refill_lock);
5339 			goto loop;
5340 		}
5341 
5342 		offset = btrfs_find_space_for_alloc(block_group, search_start,
5343 						    num_bytes, empty_size);
5344 		/*
5345 		 * If we didn't find a chunk, and we haven't failed on this
5346 		 * block group before, and this block group is in the middle of
5347 		 * caching and we are ok with waiting, then go ahead and wait
5348 		 * for progress to be made, and set failed_alloc to true.
5349 		 *
5350 		 * If failed_alloc is true then we've already waited on this
5351 		 * block group once and should move on to the next block group.
5352 		 */
5353 		if (!offset && !failed_alloc && !cached &&
5354 		    loop > LOOP_CACHING_NOWAIT) {
5355 			wait_block_group_cache_progress(block_group,
5356 						num_bytes + empty_size);
5357 			failed_alloc = true;
5358 			goto have_block_group;
5359 		} else if (!offset) {
5360 			if (!cached)
5361 				have_caching_bg = true;
5362 			goto loop;
5363 		}
5364 checks:
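		/*
		 * we found free space; align it to the stripe, make sure the
		 * aligned range still fits below search_end and inside this
		 * block group, then reserve it, giving any alignment slack
		 * back to the free space cache
		 */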
5365 		search_start = stripe_align(root, offset);
5366 		/* move on to the next group */
5367 		if (search_start + num_bytes >= search_end) {
5368 			btrfs_add_free_space(block_group, offset, num_bytes);
5369 			goto loop;
5370 		}
5371 
5372 		/* move on to the next group */
5373 		if (search_start + num_bytes >
5374 		    block_group->key.objectid + block_group->key.offset) {
5375 			btrfs_add_free_space(block_group, offset, num_bytes);
5376 			goto loop;
5377 		}
5378 
5379 		ins->objectid = search_start;
5380 		ins->offset = num_bytes;
5381 
5382 		if (offset < search_start)
5383 			btrfs_add_free_space(block_group, offset,
5384 					     search_start - offset);
5385 		BUG_ON(offset > search_start);
5386 
5387 		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5388 						  alloc_type);
5389 		if (ret == -EAGAIN) {
5390 			btrfs_add_free_space(block_group, offset, num_bytes);
5391 			goto loop;
5392 		}
5393 
5394 		/* we are all good, let's return */
5395 		ins->objectid = search_start;
5396 		ins->offset = num_bytes;
5397 
5398 		if (offset < search_start)
5399 			btrfs_add_free_space(block_group, offset,
5400 					     search_start - offset);
5401 		BUG_ON(offset > search_start);
5402 		btrfs_put_block_group(block_group);
5403 		break;
5404 loop:
5405 		failed_cluster_refill = false;
5406 		failed_alloc = false;
5407 		BUG_ON(index != get_block_group_index(block_group));
5408 		btrfs_put_block_group(block_group);
5409 	}
5410 	up_read(&space_info->groups_sem);
5411 
5412 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5413 		goto search;
5414 
5415 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5416 		goto search;
5417 
5418 	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
5419 	 *			for them to make caching progress.  Also
5420 	 *			determine the best possible bg to cache
5421 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5422 	 *			caching kthreads as we move along
5423 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5424 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5425 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5426 	 *			again
5427 	 */
5428 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5429 		index = 0;
5430 		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5431 			found_uncached_bg = false;
5432 			loop++;
5433 			if (!ideal_cache_percent)
5434 				goto search;
5435 
5436 			/*
5437 			 * 1 of the following 2 things has happened so far
5438 			 *
5439 			 * 1) We found an ideal block group for caching that
5440 			 * is mostly full and will cache quickly, so we might
5441 			 * as well wait for it.
5442 			 *
5443 			 * 2) We searched for cached only and we didn't find
5444 			 * anything, and we didn't start any caching kthreads
5445 			 * either, so chances are we will loop through and
5446 			 * start a couple caching kthreads, and then come back
5447 			 * around and just wait for them.  This will be slower
5448 			 * because we will have 2 caching kthreads reading at
5449 			 * the same time when we could have just started one
5450 			 * and waited for it to get far enough to give us an
5451 			 * allocation, so go ahead and go to the wait caching
5452 			 * loop.
5453 			 */
5454 			loop = LOOP_CACHING_WAIT;
5455 			search_start = ideal_cache_offset;
5456 			ideal_cache_percent = 0;
5457 			goto ideal_cache;
5458 		} else if (loop == LOOP_FIND_IDEAL) {
5459 			/*
5460 			 * Didn't find an uncached bg, wait on anything we find
5461 			 * next.
5462 			 */
5463 			loop = LOOP_CACHING_WAIT;
5464 			goto search;
5465 		}
5466 
5467 		loop++;
5468 
5469 		if (loop == LOOP_ALLOC_CHUNK) {
5470 			if (allowed_chunk_alloc) {
5471 				ret = do_chunk_alloc(trans, root, num_bytes +
5472 						     2 * 1024 * 1024, data,
5473 						     CHUNK_ALLOC_LIMITED);
5474 				allowed_chunk_alloc = 0;
5475 				if (ret == 1)
5476 					done_chunk_alloc = 1;
5477 			} else if (!done_chunk_alloc &&
5478 				   space_info->force_alloc ==
5479 				   CHUNK_ALLOC_NO_FORCE) {
5480 				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5481 			}
5482 
5483 			/*
5484 			 * We didn't allocate a chunk, go ahead and drop the
5485 			 * empty size and loop again.
5486 			 */
5487 			if (!done_chunk_alloc)
5488 				loop = LOOP_NO_EMPTY_SIZE;
5489 		}
5490 
5491 		if (loop == LOOP_NO_EMPTY_SIZE) {
5492 			empty_size = 0;
5493 			empty_cluster = 0;
5494 		}
5495 
5496 		goto search;
5497 	} else if (!ins->objectid) {
5498 		ret = -ENOSPC;
5499 	} else if (ins->objectid) {
5500 		ret = 0;
5501 	}
5502 
5503 	return ret;
5504 }
5505 
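/*
 * print the current state of a space_info (and, optionally, of each of its
 * block groups plus their free space) to the kernel log; used for ENOSPC
 * debugging
 */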
5506 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5507 			    int dump_block_groups)
5508 {
5509 	struct btrfs_block_group_cache *cache;
5510 	int index = 0;
5511 
5512 	spin_lock(&info->lock);
5513 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5514 	       (unsigned long long)info->flags,
5515 	       (unsigned long long)(info->total_bytes - info->bytes_used -
5516 				    info->bytes_pinned - info->bytes_reserved -
5517 				    info->bytes_readonly),
5518 	       (info->full) ? "" : "not ");
5519 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5520 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
5521 	       (unsigned long long)info->total_bytes,
5522 	       (unsigned long long)info->bytes_used,
5523 	       (unsigned long long)info->bytes_pinned,
5524 	       (unsigned long long)info->bytes_reserved,
5525 	       (unsigned long long)info->bytes_may_use,
5526 	       (unsigned long long)info->bytes_readonly);
5527 	spin_unlock(&info->lock);
5528 
5529 	if (!dump_block_groups)
5530 		return;
5531 
5532 	down_read(&info->groups_sem);
5533 again:
5534 	list_for_each_entry(cache, &info->block_groups[index], list) {
5535 		spin_lock(&cache->lock);
5536 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5537 		       "%llu pinned %llu reserved\n",
5538 		       (unsigned long long)cache->key.objectid,
5539 		       (unsigned long long)cache->key.offset,
5540 		       (unsigned long long)btrfs_block_group_used(&cache->item),
5541 		       (unsigned long long)cache->pinned,
5542 		       (unsigned long long)cache->reserved);
5543 		btrfs_dump_free_space(cache, bytes);
5544 		spin_unlock(&cache->lock);
5545 	}
5546 	if (++index < BTRFS_NR_RAID_TYPES)
5547 		goto again;
5548 	up_read(&info->groups_sem);
5549 }
5550 
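/*
 * reserve a contiguous extent of at least min_alloc_size bytes.  on -ENOSPC
 * the requested size is halved (kept sector aligned and no smaller than
 * min_alloc_size), a chunk allocation is forced and the search is retried.
 * the start and length of the reserved extent are returned through ins.
 */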
5551 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5552 			 struct btrfs_root *root,
5553 			 u64 num_bytes, u64 min_alloc_size,
5554 			 u64 empty_size, u64 hint_byte,
5555 			 u64 search_end, struct btrfs_key *ins,
5556 			 u64 data)
5557 {
5558 	int ret;
5559 	u64 search_start = 0;
5560 
5561 	data = btrfs_get_alloc_profile(root, data);
5562 again:
5563 	/*
5564 	 * the only place that sets empty_size is btrfs_realloc_node, which
5565 	 * is not called recursively on allocations
5566 	 */
5567 	if (empty_size || root->ref_cows)
5568 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5569 				     num_bytes + 2 * 1024 * 1024, data,
5570 				     CHUNK_ALLOC_NO_FORCE);
5571 
5572 	WARN_ON(num_bytes < root->sectorsize);
5573 	ret = find_free_extent(trans, root, num_bytes, empty_size,
5574 			       search_start, search_end, hint_byte,
5575 			       ins, data);
5576 
5577 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5578 		num_bytes = num_bytes >> 1;
5579 		num_bytes = num_bytes & ~(root->sectorsize - 1);
5580 		num_bytes = max(num_bytes, min_alloc_size);
5581 		do_chunk_alloc(trans, root->fs_info->extent_root,
5582 			       num_bytes, data, CHUNK_ALLOC_FORCE);
5583 		goto again;
5584 	}
5585 	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5586 		struct btrfs_space_info *sinfo;
5587 
5588 		sinfo = __find_space_info(root->fs_info, data);
5589 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
5590 		       "wanted %llu\n", (unsigned long long)data,
5591 		       (unsigned long long)num_bytes);
5592 		dump_space_info(sinfo, num_bytes, 1);
5593 	}
5594 
5595 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5596 
5597 	return ret;
5598 }
5599 
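/*
 * return a reserved but unused extent to the free space cache.  if @pin is
 * set the range is pinned instead, so it can't be reused before the running
 * transaction commits; with the DISCARD mount option the range is also
 * discarded.
 */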
5600 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5601 					u64 start, u64 len, int pin)
5602 {
5603 	struct btrfs_block_group_cache *cache;
5604 	int ret = 0;
5605 
5606 	cache = btrfs_lookup_block_group(root->fs_info, start);
5607 	if (!cache) {
5608 		printk(KERN_ERR "Unable to find block group for %llu\n",
5609 		       (unsigned long long)start);
5610 		return -ENOSPC;
5611 	}
5612 
5613 	if (btrfs_test_opt(root, DISCARD))
5614 		ret = btrfs_discard_extent(root, start, len, NULL);
5615 
5616 	if (pin)
5617 		pin_down_extent(root, cache, start, len, 1);
5618 	else {
5619 		btrfs_add_free_space(cache, start, len);
5620 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5621 	}
5622 	btrfs_put_block_group(cache);
5623 
5624 	trace_btrfs_reserved_extent_free(root, start, len);
5625 
5626 	return ret;
5627 }
5628 
5629 int btrfs_free_reserved_extent(struct btrfs_root *root,
5630 					u64 start, u64 len)
5631 {
5632 	return __btrfs_free_reserved_extent(root, start, len, 0);
5633 }
5634 
5635 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5636 				       u64 start, u64 len)
5637 {
5638 	return __btrfs_free_reserved_extent(root, start, len, 1);
5639 }
5640 
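/*
 * insert the extent item plus an inline data backref (a shared data ref if
 * @parent is set, a regular extent data ref otherwise) for a newly allocated
 * data extent, and update the block group accounting
 */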
5641 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5642 				      struct btrfs_root *root,
5643 				      u64 parent, u64 root_objectid,
5644 				      u64 flags, u64 owner, u64 offset,
5645 				      struct btrfs_key *ins, int ref_mod)
5646 {
5647 	int ret;
5648 	struct btrfs_fs_info *fs_info = root->fs_info;
5649 	struct btrfs_extent_item *extent_item;
5650 	struct btrfs_extent_inline_ref *iref;
5651 	struct btrfs_path *path;
5652 	struct extent_buffer *leaf;
5653 	int type;
5654 	u32 size;
5655 
5656 	if (parent > 0)
5657 		type = BTRFS_SHARED_DATA_REF_KEY;
5658 	else
5659 		type = BTRFS_EXTENT_DATA_REF_KEY;
5660 
5661 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5662 
5663 	path = btrfs_alloc_path();
5664 	if (!path)
5665 		return -ENOMEM;
5666 
5667 	path->leave_spinning = 1;
5668 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5669 				      ins, size);
5670 	BUG_ON(ret);
5671 
5672 	leaf = path->nodes[0];
5673 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
5674 				     struct btrfs_extent_item);
5675 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5676 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5677 	btrfs_set_extent_flags(leaf, extent_item,
5678 			       flags | BTRFS_EXTENT_FLAG_DATA);
5679 
5680 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5681 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
5682 	if (parent > 0) {
5683 		struct btrfs_shared_data_ref *ref;
5684 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
5685 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5686 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5687 	} else {
5688 		struct btrfs_extent_data_ref *ref;
5689 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5690 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5691 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5692 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5693 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5694 	}
5695 
5696 	btrfs_mark_buffer_dirty(path->nodes[0]);
5697 	btrfs_free_path(path);
5698 
5699 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5700 	if (ret) {
5701 		printk(KERN_ERR "btrfs update block group failed for %llu "
5702 		       "%llu\n", (unsigned long long)ins->objectid,
5703 		       (unsigned long long)ins->offset);
5704 		BUG();
5705 	}
5706 	return ret;
5707 }
5708 
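/*
 * same as alloc_reserved_file_extent, but for a tree block: the extent item
 * carries a btrfs_tree_block_info (key and level) followed by a shared or
 * non-shared tree block backref
 */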
5709 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5710 				     struct btrfs_root *root,
5711 				     u64 parent, u64 root_objectid,
5712 				     u64 flags, struct btrfs_disk_key *key,
5713 				     int level, struct btrfs_key *ins)
5714 {
5715 	int ret;
5716 	struct btrfs_fs_info *fs_info = root->fs_info;
5717 	struct btrfs_extent_item *extent_item;
5718 	struct btrfs_tree_block_info *block_info;
5719 	struct btrfs_extent_inline_ref *iref;
5720 	struct btrfs_path *path;
5721 	struct extent_buffer *leaf;
5722 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5723 
5724 	path = btrfs_alloc_path();
5725 	if (!path)
5726 		return -ENOMEM;
5727 
5728 	path->leave_spinning = 1;
5729 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5730 				      ins, size);
5731 	BUG_ON(ret);
5732 
5733 	leaf = path->nodes[0];
5734 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
5735 				     struct btrfs_extent_item);
5736 	btrfs_set_extent_refs(leaf, extent_item, 1);
5737 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5738 	btrfs_set_extent_flags(leaf, extent_item,
5739 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5740 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5741 
5742 	btrfs_set_tree_block_key(leaf, block_info, key);
5743 	btrfs_set_tree_block_level(leaf, block_info, level);
5744 
5745 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5746 	if (parent > 0) {
5747 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5748 		btrfs_set_extent_inline_ref_type(leaf, iref,
5749 						 BTRFS_SHARED_BLOCK_REF_KEY);
5750 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5751 	} else {
5752 		btrfs_set_extent_inline_ref_type(leaf, iref,
5753 						 BTRFS_TREE_BLOCK_REF_KEY);
5754 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5755 	}
5756 
5757 	btrfs_mark_buffer_dirty(leaf);
5758 	btrfs_free_path(path);
5759 
5760 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5761 	if (ret) {
5762 		printk(KERN_ERR "btrfs update block group failed for %llu "
5763 		       "%llu\n", (unsigned long long)ins->objectid,
5764 		       (unsigned long long)ins->offset);
5765 		BUG();
5766 	}
5767 	return ret;
5768 }
5769 
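/*
 * record a newly reserved data extent by queueing a delayed ref; the actual
 * extent item is inserted when the delayed refs are run
 */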
5770 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5771 				     struct btrfs_root *root,
5772 				     u64 root_objectid, u64 owner,
5773 				     u64 offset, struct btrfs_key *ins)
5774 {
5775 	int ret;
5776 
5777 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5778 
5779 	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5780 					 0, root_objectid, owner, offset,
5781 					 BTRFS_ADD_DELAYED_EXTENT, NULL);
5782 	return ret;
5783 }
5784 
5785 /*
5786  * this is used by the tree logging recovery code.  It records that
5787  * an extent has been allocated and makes sure to clear the free
5788  * space cache bits as well
5789  */
5790 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5791 				   struct btrfs_root *root,
5792 				   u64 root_objectid, u64 owner, u64 offset,
5793 				   struct btrfs_key *ins)
5794 {
5795 	int ret;
5796 	struct btrfs_block_group_cache *block_group;
5797 	struct btrfs_caching_control *caching_ctl;
5798 	u64 start = ins->objectid;
5799 	u64 num_bytes = ins->offset;
5800 
5801 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5802 	cache_block_group(block_group, trans, NULL, 0);
5803 	caching_ctl = get_caching_control(block_group);
5804 
5805 	if (!caching_ctl) {
5806 		BUG_ON(!block_group_cache_done(block_group));
5807 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
5808 		BUG_ON(ret);
5809 	} else {
5810 		mutex_lock(&caching_ctl->mutex);
5811 
5812 		if (start >= caching_ctl->progress) {
5813 			ret = add_excluded_extent(root, start, num_bytes);
5814 			BUG_ON(ret);
5815 		} else if (start + num_bytes <= caching_ctl->progress) {
5816 			ret = btrfs_remove_free_space(block_group,
5817 						      start, num_bytes);
5818 			BUG_ON(ret);
5819 		} else {
5820 			num_bytes = caching_ctl->progress - start;
5821 			ret = btrfs_remove_free_space(block_group,
5822 						      start, num_bytes);
5823 			BUG_ON(ret);
5824 
5825 			start = caching_ctl->progress;
5826 			num_bytes = ins->objectid + ins->offset -
5827 				    caching_ctl->progress;
5828 			ret = add_excluded_extent(root, start, num_bytes);
5829 			BUG_ON(ret);
5830 		}
5831 
5832 		mutex_unlock(&caching_ctl->mutex);
5833 		put_caching_control(caching_ctl);
5834 	}
5835 
5836 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5837 					  RESERVE_ALLOC_NO_ACCOUNT);
5838 	BUG_ON(ret);
5839 	btrfs_put_block_group(block_group);
5840 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5841 					 0, owner, offset, ins, 1);
5842 	return ret;
5843 }
5844 
5845 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5846 					    struct btrfs_root *root,
5847 					    u64 bytenr, u32 blocksize,
5848 					    int level)
5849 {
5850 	struct extent_buffer *buf;
5851 
5852 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5853 	if (!buf)
5854 		return ERR_PTR(-ENOMEM);
5855 	btrfs_set_header_generation(buf, trans->transid);
5856 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5857 	btrfs_tree_lock(buf);
5858 	clean_tree_block(trans, root, buf);
5859 
5860 	btrfs_set_lock_blocking(buf);
5861 	btrfs_set_buffer_uptodate(buf);
5862 
5863 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5864 		/*
5865 		 * we allow two log transactions at a time, use different
5866 		 * EXENT bit to differentiate dirty pages.
5867 		 * EXTENT bits to differentiate dirty pages.
5868 		if (root->log_transid % 2 == 0)
5869 			set_extent_dirty(&root->dirty_log_pages, buf->start,
5870 					buf->start + buf->len - 1, GFP_NOFS);
5871 		else
5872 			set_extent_new(&root->dirty_log_pages, buf->start,
5873 					buf->start + buf->len - 1, GFP_NOFS);
5874 	} else {
5875 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5876 			 buf->start + buf->len - 1, GFP_NOFS);
5877 	}
5878 	trans->blocks_used++;
5879 	/* this returns a buffer locked for blocking */
5880 	return buf;
5881 }
5882 
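/*
 * pick the block reserve backing this transaction and take @blocksize bytes
 * out of it for a new tree block.  if the reserve is empty or short, more
 * metadata space is reserved, falling back to the global reserve, before
 * giving up with -ENOSPC.
 */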
5883 static struct btrfs_block_rsv *
5884 use_block_rsv(struct btrfs_trans_handle *trans,
5885 	      struct btrfs_root *root, u32 blocksize)
5886 {
5887 	struct btrfs_block_rsv *block_rsv;
5888 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5889 	int ret;
5890 
5891 	block_rsv = get_block_rsv(trans, root);
5892 
5893 	if (block_rsv->size == 0) {
5894 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5895 		/*
5896 		 * If we couldn't reserve metadata bytes try and use some from
5897 		 * the global reserve.
5898 		 */
5899 		if (ret && block_rsv != global_rsv) {
5900 			ret = block_rsv_use_bytes(global_rsv, blocksize);
5901 			if (!ret)
5902 				return global_rsv;
5903 			return ERR_PTR(ret);
5904 		} else if (ret) {
5905 			return ERR_PTR(ret);
5906 		}
5907 		return block_rsv;
5908 	}
5909 
5910 	ret = block_rsv_use_bytes(block_rsv, blocksize);
5911 	if (!ret)
5912 		return block_rsv;
5913 	if (ret) {
5914 		static DEFINE_RATELIMIT_STATE(_rs,
5915 				DEFAULT_RATELIMIT_INTERVAL,
5916 				/*DEFAULT_RATELIMIT_BURST*/ 2);
5917 		if (__ratelimit(&_rs)) {
5918 			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5919 			WARN_ON(1);
5920 		}
5921 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5922 		if (!ret) {
5923 			return block_rsv;
5924 		} else if (ret && block_rsv != global_rsv) {
5925 			ret = block_rsv_use_bytes(global_rsv, blocksize);
5926 			if (!ret)
5927 				return global_rsv;
5928 		}
5929 	}
5930 
5931 	return ERR_PTR(-ENOSPC);
5932 }
5933 
5934 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5935 {
5936 	block_rsv_add_bytes(block_rsv, blocksize, 0);
5937 	block_rsv_release_bytes(block_rsv, NULL, 0);
5938 }
5939 
5940 /*
5941  * finds a free extent and does all the dirty work required for allocating
5942  * a new tree block: the extent is reserved, an extent buffer is set up for
5943  * it, and (for non-log trees) a delayed ref is queued to insert the extent
5944  * item.
5945  *
5946  * returns the locked tree buffer or an ERR_PTR on failure.
5946  */
5947 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5948 					struct btrfs_root *root, u32 blocksize,
5949 					u64 parent, u64 root_objectid,
5950 					struct btrfs_disk_key *key, int level,
5951 					u64 hint, u64 empty_size)
5952 {
5953 	struct btrfs_key ins;
5954 	struct btrfs_block_rsv *block_rsv;
5955 	struct extent_buffer *buf;
5956 	u64 flags = 0;
5957 	int ret;
5958 
5959 
5960 	block_rsv = use_block_rsv(trans, root, blocksize);
5961 	if (IS_ERR(block_rsv))
5962 		return ERR_CAST(block_rsv);
5963 
5964 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5965 				   empty_size, hint, (u64)-1, &ins, 0);
5966 	if (ret) {
5967 		unuse_block_rsv(block_rsv, blocksize);
5968 		return ERR_PTR(ret);
5969 	}
5970 
5971 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5972 				    blocksize, level);
5973 	BUG_ON(IS_ERR(buf));
5974 
5975 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5976 		if (parent == 0)
5977 			parent = ins.objectid;
5978 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5979 	} else
5980 		BUG_ON(parent > 0);
5981 
5982 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5983 		struct btrfs_delayed_extent_op *extent_op;
5984 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5985 		BUG_ON(!extent_op);
5986 		if (key)
5987 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
5988 		else
5989 			memset(&extent_op->key, 0, sizeof(extent_op->key));
5990 		extent_op->flags_to_set = flags;
5991 		extent_op->update_key = 1;
5992 		extent_op->update_flags = 1;
5993 		extent_op->is_data = 0;
5994 
5995 		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5996 					ins.offset, parent, root_objectid,
5997 					level, BTRFS_ADD_DELAYED_EXTENT,
5998 					extent_op);
5999 		BUG_ON(ret);
6000 	}
6001 	return buf;
6002 }
6003 
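/*
 * state carried while walking a tree for snapshot/subtree removal: cached
 * reference counts and flags per level, the current stage (DROP_REFERENCE
 * or UPDATE_BACKREF) and readahead bookkeeping
 */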
6004 struct walk_control {
6005 	u64 refs[BTRFS_MAX_LEVEL];
6006 	u64 flags[BTRFS_MAX_LEVEL];
6007 	struct btrfs_key update_progress;
6008 	int stage;
6009 	int level;
6010 	int shared_level;
6011 	int update_ref;
6012 	int keep_locks;
6013 	int reada_slot;
6014 	int reada_count;
6015 };
6016 
6017 #define DROP_REFERENCE	1
6018 #define UPDATE_BACKREF	2
6019 
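/*
 * issue readahead against the children of the node at wc->level that the
 * walk is about to visit, skipping blocks that won't be descended into.
 * reada_count grows or shrinks depending on how far the previous readahead
 * pass got before we caught up with it.
 */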
6020 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6021 				     struct btrfs_root *root,
6022 				     struct walk_control *wc,
6023 				     struct btrfs_path *path)
6024 {
6025 	u64 bytenr;
6026 	u64 generation;
6027 	u64 refs;
6028 	u64 flags;
6029 	u32 nritems;
6030 	u32 blocksize;
6031 	struct btrfs_key key;
6032 	struct extent_buffer *eb;
6033 	int ret;
6034 	int slot;
6035 	int nread = 0;
6036 
6037 	if (path->slots[wc->level] < wc->reada_slot) {
6038 		wc->reada_count = wc->reada_count * 2 / 3;
6039 		wc->reada_count = max(wc->reada_count, 2);
6040 	} else {
6041 		wc->reada_count = wc->reada_count * 3 / 2;
6042 		wc->reada_count = min_t(int, wc->reada_count,
6043 					BTRFS_NODEPTRS_PER_BLOCK(root));
6044 	}
6045 
6046 	eb = path->nodes[wc->level];
6047 	nritems = btrfs_header_nritems(eb);
6048 	blocksize = btrfs_level_size(root, wc->level - 1);
6049 
6050 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6051 		if (nread >= wc->reada_count)
6052 			break;
6053 
6054 		cond_resched();
6055 		bytenr = btrfs_node_blockptr(eb, slot);
6056 		generation = btrfs_node_ptr_generation(eb, slot);
6057 
6058 		if (slot == path->slots[wc->level])
6059 			goto reada;
6060 
6061 		if (wc->stage == UPDATE_BACKREF &&
6062 		    generation <= root->root_key.offset)
6063 			continue;
6064 
6065 		/* We don't lock the tree block, it's OK to be racy here */
6066 		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6067 					       &refs, &flags);
6068 		BUG_ON(ret);
6069 		BUG_ON(refs == 0);
6070 
6071 		if (wc->stage == DROP_REFERENCE) {
6072 			if (refs == 1)
6073 				goto reada;
6074 
6075 			if (wc->level == 1 &&
6076 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6077 				continue;
6078 			if (!wc->update_ref ||
6079 			    generation <= root->root_key.offset)
6080 				continue;
6081 			btrfs_node_key_to_cpu(eb, &key, slot);
6082 			ret = btrfs_comp_cpu_keys(&key,
6083 						  &wc->update_progress);
6084 			if (ret < 0)
6085 				continue;
6086 		} else {
6087 			if (wc->level == 1 &&
6088 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6089 				continue;
6090 		}
6091 reada:
6092 		ret = readahead_tree_block(root, bytenr, blocksize,
6093 					   generation);
6094 		if (ret)
6095 			break;
6096 		nread++;
6097 	}
6098 	wc->reada_slot = slot;
6099 }
6100 
6101 /*
6102  * helper to process a tree block while walking down the tree.
6103  *
6104  * when wc->stage == UPDATE_BACKREF, this function updates
6105  * back refs for pointers in the block.
6106  *
6107  * NOTE: return value 1 means we should stop walking down.
6108  */
6109 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6110 				   struct btrfs_root *root,
6111 				   struct btrfs_path *path,
6112 				   struct walk_control *wc, int lookup_info)
6113 {
6114 	int level = wc->level;
6115 	struct extent_buffer *eb = path->nodes[level];
6116 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6117 	int ret;
6118 
6119 	if (wc->stage == UPDATE_BACKREF &&
6120 	    btrfs_header_owner(eb) != root->root_key.objectid)
6121 		return 1;
6122 
6123 	/*
6124 	 * when the reference count of a tree block is 1, it won't increase
6125 	 * again. once the full backref flag is set, we never clear it.
6126 	 */
6127 	if (lookup_info &&
6128 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6129 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6130 		BUG_ON(!path->locks[level]);
6131 		ret = btrfs_lookup_extent_info(trans, root,
6132 					       eb->start, eb->len,
6133 					       &wc->refs[level],
6134 					       &wc->flags[level]);
6135 		BUG_ON(ret);
6136 		BUG_ON(wc->refs[level] == 0);
6137 	}
6138 
6139 	if (wc->stage == DROP_REFERENCE) {
6140 		if (wc->refs[level] > 1)
6141 			return 1;
6142 
6143 		if (path->locks[level] && !wc->keep_locks) {
6144 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6145 			path->locks[level] = 0;
6146 		}
6147 		return 0;
6148 	}
6149 
6150 	/* wc->stage == UPDATE_BACKREF */
6151 	if (!(wc->flags[level] & flag)) {
6152 		BUG_ON(!path->locks[level]);
6153 		ret = btrfs_inc_ref(trans, root, eb, 1);
6154 		BUG_ON(ret);
6155 		ret = btrfs_dec_ref(trans, root, eb, 0);
6156 		BUG_ON(ret);
6157 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6158 						  eb->len, flag, 0);
6159 		BUG_ON(ret);
6160 		wc->flags[level] |= flag;
6161 	}
6162 
6163 	/*
6164 	 * the block is shared by multiple trees, so it's not good to
6165 	 * keep the tree lock
6166 	 */
6167 	if (path->locks[level] && level > 0) {
6168 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6169 		path->locks[level] = 0;
6170 	}
6171 	return 0;
6172 }
6173 
6174 /*
6175  * hepler to process tree block pointer.
6176  * helper to process a tree block pointer.
6177  *
6178  * when wc->stage == DROP_REFERENCE, this function checks the
6179  * reference count of the block pointed to. if the block
6180  * is shared and we need to update back refs for the subtree
6181  * rooted at the block, this function changes wc->stage to
6182  * UPDATE_BACKREF. if the block is shared and there is no
6183  * need to update back refs, this function drops the reference
6184  * to the block.
6185  * NOTE: return value 1 means we should stop walking down.
6186  */
6187 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6188 				 struct btrfs_root *root,
6189 				 struct btrfs_path *path,
6190 				 struct walk_control *wc, int *lookup_info)
6191 {
6192 	u64 bytenr;
6193 	u64 generation;
6194 	u64 parent;
6195 	u32 blocksize;
6196 	struct btrfs_key key;
6197 	struct extent_buffer *next;
6198 	int level = wc->level;
6199 	int reada = 0;
6200 	int ret = 0;
6201 
6202 	generation = btrfs_node_ptr_generation(path->nodes[level],
6203 					       path->slots[level]);
6204 	/*
6205 	 * if the lower level block was created before the snapshot
6206 	 * was created, we know there is no need to update back refs
6207 	 * for the subtree
6208 	 */
6209 	if (wc->stage == UPDATE_BACKREF &&
6210 	    generation <= root->root_key.offset) {
6211 		*lookup_info = 1;
6212 		return 1;
6213 	}
6214 
6215 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6216 	blocksize = btrfs_level_size(root, level - 1);
6217 
6218 	next = btrfs_find_tree_block(root, bytenr, blocksize);
6219 	if (!next) {
6220 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6221 		if (!next)
6222 			return -ENOMEM;
6223 		reada = 1;
6224 	}
6225 	btrfs_tree_lock(next);
6226 	btrfs_set_lock_blocking(next);
6227 
6228 	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6229 				       &wc->refs[level - 1],
6230 				       &wc->flags[level - 1]);
6231 	BUG_ON(ret);
6232 	BUG_ON(wc->refs[level - 1] == 0);
6233 	*lookup_info = 0;
6234 
6235 	if (wc->stage == DROP_REFERENCE) {
6236 		if (wc->refs[level - 1] > 1) {
6237 			if (level == 1 &&
6238 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6239 				goto skip;
6240 
6241 			if (!wc->update_ref ||
6242 			    generation <= root->root_key.offset)
6243 				goto skip;
6244 
6245 			btrfs_node_key_to_cpu(path->nodes[level], &key,
6246 					      path->slots[level]);
6247 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6248 			if (ret < 0)
6249 				goto skip;
6250 
6251 			wc->stage = UPDATE_BACKREF;
6252 			wc->shared_level = level - 1;
6253 		}
6254 	} else {
6255 		if (level == 1 &&
6256 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6257 			goto skip;
6258 	}
6259 
6260 	if (!btrfs_buffer_uptodate(next, generation)) {
6261 		btrfs_tree_unlock(next);
6262 		free_extent_buffer(next);
6263 		next = NULL;
6264 		*lookup_info = 1;
6265 	}
6266 
6267 	if (!next) {
6268 		if (reada && level == 1)
6269 			reada_walk_down(trans, root, wc, path);
6270 		next = read_tree_block(root, bytenr, blocksize, generation);
6271 		if (!next)
6272 			return -EIO;
6273 		btrfs_tree_lock(next);
6274 		btrfs_set_lock_blocking(next);
6275 	}
6276 
6277 	level--;
6278 	BUG_ON(level != btrfs_header_level(next));
6279 	path->nodes[level] = next;
6280 	path->slots[level] = 0;
6281 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6282 	wc->level = level;
6283 	if (wc->level == 1)
6284 		wc->reada_slot = 0;
6285 	return 0;
6286 skip:
6287 	wc->refs[level - 1] = 0;
6288 	wc->flags[level - 1] = 0;
6289 	if (wc->stage == DROP_REFERENCE) {
6290 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6291 			parent = path->nodes[level]->start;
6292 		} else {
6293 			BUG_ON(root->root_key.objectid !=
6294 			       btrfs_header_owner(path->nodes[level]));
6295 			parent = 0;
6296 		}
6297 
6298 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6299 					root->root_key.objectid, level - 1, 0);
6300 		BUG_ON(ret);
6301 	}
6302 	btrfs_tree_unlock(next);
6303 	free_extent_buffer(next);
6304 	*lookup_info = 1;
6305 	return 1;
6306 }
6307 
6308 /*
6309  * helper to process a tree block while walking up the tree.
6310  *
6311  * when wc->stage == DROP_REFERENCE, this function drops
6312  * reference count on the block.
6313  *
6314  * when wc->stage == UPDATE_BACKREF, this function changes
6315  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6316  * to UPDATE_BACKREF previously while processing the block.
6317  *
6318  * NOTE: return value 1 means we should stop walking up.
6319  */
6320 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6321 				 struct btrfs_root *root,
6322 				 struct btrfs_path *path,
6323 				 struct walk_control *wc)
6324 {
6325 	int ret;
6326 	int level = wc->level;
6327 	struct extent_buffer *eb = path->nodes[level];
6328 	u64 parent = 0;
6329 
6330 	if (wc->stage == UPDATE_BACKREF) {
6331 		BUG_ON(wc->shared_level < level);
6332 		if (level < wc->shared_level)
6333 			goto out;
6334 
6335 		ret = find_next_key(path, level + 1, &wc->update_progress);
6336 		if (ret > 0)
6337 			wc->update_ref = 0;
6338 
6339 		wc->stage = DROP_REFERENCE;
6340 		wc->shared_level = -1;
6341 		path->slots[level] = 0;
6342 
6343 		/*
6344 		 * check reference count again if the block isn't locked.
6345 		 * we should start walking down the tree again if reference
6346 		 * count is one.
6347 		 */
6348 		if (!path->locks[level]) {
6349 			BUG_ON(level == 0);
6350 			btrfs_tree_lock(eb);
6351 			btrfs_set_lock_blocking(eb);
6352 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6353 
6354 			ret = btrfs_lookup_extent_info(trans, root,
6355 						       eb->start, eb->len,
6356 						       &wc->refs[level],
6357 						       &wc->flags[level]);
6358 			BUG_ON(ret);
6359 			BUG_ON(wc->refs[level] == 0);
6360 			if (wc->refs[level] == 1) {
6361 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6362 				return 1;
6363 			}
6364 		}
6365 	}
6366 
6367 	/* wc->stage == DROP_REFERENCE */
6368 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6369 
6370 	if (wc->refs[level] == 1) {
6371 		if (level == 0) {
6372 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6373 				ret = btrfs_dec_ref(trans, root, eb, 1);
6374 			else
6375 				ret = btrfs_dec_ref(trans, root, eb, 0);
6376 			BUG_ON(ret);
6377 		}
6378 		/* make block locked assertion in clean_tree_block happy */
6379 		if (!path->locks[level] &&
6380 		    btrfs_header_generation(eb) == trans->transid) {
6381 			btrfs_tree_lock(eb);
6382 			btrfs_set_lock_blocking(eb);
6383 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6384 		}
6385 		clean_tree_block(trans, root, eb);
6386 	}
6387 
6388 	if (eb == root->node) {
6389 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6390 			parent = eb->start;
6391 		else
6392 			BUG_ON(root->root_key.objectid !=
6393 			       btrfs_header_owner(eb));
6394 	} else {
6395 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6396 			parent = path->nodes[level + 1]->start;
6397 		else
6398 			BUG_ON(root->root_key.objectid !=
6399 			       btrfs_header_owner(path->nodes[level + 1]));
6400 	}
6401 
6402 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6403 out:
6404 	wc->refs[level] = 0;
6405 	wc->flags[level] = 0;
6406 	return 0;
6407 }
6408 
6409 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6410 				   struct btrfs_root *root,
6411 				   struct btrfs_path *path,
6412 				   struct walk_control *wc)
6413 {
6414 	int level = wc->level;
6415 	int lookup_info = 1;
6416 	int ret;
6417 
6418 	while (level >= 0) {
6419 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
6420 		if (ret > 0)
6421 			break;
6422 
6423 		if (level == 0)
6424 			break;
6425 
6426 		if (path->slots[level] >=
6427 		    btrfs_header_nritems(path->nodes[level]))
6428 			break;
6429 
6430 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
6431 		if (ret > 0) {
6432 			path->slots[level]++;
6433 			continue;
6434 		} else if (ret < 0)
6435 			return ret;
6436 		level = wc->level;
6437 	}
6438 	return 0;
6439 }
6440 
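/*
 * walk back up the tree, freeing fully processed blocks via walk_up_proc,
 * until either a sibling slot to descend into is found or max_level is
 * reached
 */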
6441 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6442 				 struct btrfs_root *root,
6443 				 struct btrfs_path *path,
6444 				 struct walk_control *wc, int max_level)
6445 {
6446 	int level = wc->level;
6447 	int ret;
6448 
6449 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6450 	while (level < max_level && path->nodes[level]) {
6451 		wc->level = level;
6452 		if (path->slots[level] + 1 <
6453 		    btrfs_header_nritems(path->nodes[level])) {
6454 			path->slots[level]++;
6455 			return 0;
6456 		} else {
6457 			ret = walk_up_proc(trans, root, path, wc);
6458 			if (ret > 0)
6459 				return 0;
6460 
6461 			if (path->locks[level]) {
6462 				btrfs_tree_unlock_rw(path->nodes[level],
6463 						     path->locks[level]);
6464 				path->locks[level] = 0;
6465 			}
6466 			free_extent_buffer(path->nodes[level]);
6467 			path->nodes[level] = NULL;
6468 			level++;
6469 		}
6470 	}
6471 	return 1;
6472 }
6473 
6474 /*
6475  * drop a subvolume tree.
6476  *
6477  * this function traverses the tree, freeing any blocks that are only
6478  * referenced by the tree.
6479  *
6480  * when a shared tree block is found, this function decreases its
6481  * reference count by one. if update_ref is true, this function
6482  * also makes sure backrefs for the shared block and all lower level
6483  * blocks are properly updated.
6484  */
6485 void btrfs_drop_snapshot(struct btrfs_root *root,
6486 			 struct btrfs_block_rsv *block_rsv, int update_ref)
6487 {
6488 	struct btrfs_path *path;
6489 	struct btrfs_trans_handle *trans;
6490 	struct btrfs_root *tree_root = root->fs_info->tree_root;
6491 	struct btrfs_root_item *root_item = &root->root_item;
6492 	struct walk_control *wc;
6493 	struct btrfs_key key;
6494 	int err = 0;
6495 	int ret;
6496 	int level;
6497 
6498 	path = btrfs_alloc_path();
6499 	if (!path) {
6500 		err = -ENOMEM;
6501 		goto out;
6502 	}
6503 
6504 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6505 	if (!wc) {
6506 		btrfs_free_path(path);
6507 		err = -ENOMEM;
6508 		goto out;
6509 	}
6510 
6511 	trans = btrfs_start_transaction(tree_root, 0);
6512 	BUG_ON(IS_ERR(trans));
6513 
6514 	if (block_rsv)
6515 		trans->block_rsv = block_rsv;
6516 
6517 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6518 		level = btrfs_header_level(root->node);
6519 		path->nodes[level] = btrfs_lock_root_node(root);
6520 		btrfs_set_lock_blocking(path->nodes[level]);
6521 		path->slots[level] = 0;
6522 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6523 		memset(&wc->update_progress, 0,
6524 		       sizeof(wc->update_progress));
6525 	} else {
6526 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6527 		memcpy(&wc->update_progress, &key,
6528 		       sizeof(wc->update_progress));
6529 
6530 		level = root_item->drop_level;
6531 		BUG_ON(level == 0);
6532 		path->lowest_level = level;
6533 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6534 		path->lowest_level = 0;
6535 		if (ret < 0) {
6536 			err = ret;
6537 			goto out_free;
6538 		}
6539 		WARN_ON(ret > 0);
6540 
6541 		/*
6542 		 * unlock our path, this is safe because only this
6543 		 * function is allowed to delete this snapshot
6544 		 */
6545 		btrfs_unlock_up_safe(path, 0);
6546 
6547 		level = btrfs_header_level(root->node);
6548 		while (1) {
6549 			btrfs_tree_lock(path->nodes[level]);
6550 			btrfs_set_lock_blocking(path->nodes[level]);
6551 
6552 			ret = btrfs_lookup_extent_info(trans, root,
6553 						path->nodes[level]->start,
6554 						path->nodes[level]->len,
6555 						&wc->refs[level],
6556 						&wc->flags[level]);
6557 			BUG_ON(ret);
6558 			BUG_ON(wc->refs[level] == 0);
6559 
6560 			if (level == root_item->drop_level)
6561 				break;
6562 
6563 			btrfs_tree_unlock(path->nodes[level]);
6564 			WARN_ON(wc->refs[level] != 1);
6565 			level--;
6566 		}
6567 	}
6568 
6569 	wc->level = level;
6570 	wc->shared_level = -1;
6571 	wc->stage = DROP_REFERENCE;
6572 	wc->update_ref = update_ref;
6573 	wc->keep_locks = 0;
6574 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6575 
6576 	while (1) {
6577 		ret = walk_down_tree(trans, root, path, wc);
6578 		if (ret < 0) {
6579 			err = ret;
6580 			break;
6581 		}
6582 
6583 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6584 		if (ret < 0) {
6585 			err = ret;
6586 			break;
6587 		}
6588 
6589 		if (ret > 0) {
6590 			BUG_ON(wc->stage != DROP_REFERENCE);
6591 			break;
6592 		}
6593 
6594 		if (wc->stage == DROP_REFERENCE) {
6595 			level = wc->level;
6596 			btrfs_node_key(path->nodes[level],
6597 				       &root_item->drop_progress,
6598 				       path->slots[level]);
6599 			root_item->drop_level = level;
6600 		}
6601 
6602 		BUG_ON(wc->level == 0);
6603 		if (btrfs_should_end_transaction(trans, tree_root)) {
6604 			ret = btrfs_update_root(trans, tree_root,
6605 						&root->root_key,
6606 						root_item);
6607 			BUG_ON(ret);
6608 
6609 			btrfs_end_transaction_throttle(trans, tree_root);
6610 			trans = btrfs_start_transaction(tree_root, 0);
6611 			BUG_ON(IS_ERR(trans));
6612 			if (block_rsv)
6613 				trans->block_rsv = block_rsv;
6614 		}
6615 	}
6616 	btrfs_release_path(path);
6617 	BUG_ON(err);
6618 
6619 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
6620 	BUG_ON(ret);
6621 
6622 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6623 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6624 					   NULL, NULL);
6625 		BUG_ON(ret < 0);
6626 		if (ret > 0) {
6627 			/* if we fail to delete the orphan item this time
6628 			 * around, it'll get picked up the next time.
6629 			 *
6630 			 * The most common failure here is just -ENOENT.
6631 			 */
6632 			btrfs_del_orphan_item(trans, tree_root,
6633 					      root->root_key.objectid);
6634 		}
6635 	}
6636 
6637 	if (root->in_radix) {
6638 		btrfs_free_fs_root(tree_root->fs_info, root);
6639 	} else {
6640 		free_extent_buffer(root->node);
6641 		free_extent_buffer(root->commit_root);
6642 		kfree(root);
6643 	}
6644 out_free:
6645 	btrfs_end_transaction_throttle(trans, tree_root);
6646 	kfree(wc);
6647 	btrfs_free_path(path);
6648 out:
6649 	if (err)
6650 		btrfs_std_error(root->fs_info, err);
6651 	return;
6652 }
6653 
6654 /*
6655  * drop subtree rooted at tree block 'node'.
6656  *
6657  * NOTE: this function will unlock and release tree block 'node'
6658  */
6659 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6660 			struct btrfs_root *root,
6661 			struct extent_buffer *node,
6662 			struct extent_buffer *parent)
6663 {
6664 	struct btrfs_path *path;
6665 	struct walk_control *wc;
6666 	int level;
6667 	int parent_level;
6668 	int ret = 0;
6669 	int wret;
6670 
6671 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6672 
6673 	path = btrfs_alloc_path();
6674 	if (!path)
6675 		return -ENOMEM;
6676 
6677 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6678 	if (!wc) {
6679 		btrfs_free_path(path);
6680 		return -ENOMEM;
6681 	}
6682 
6683 	btrfs_assert_tree_locked(parent);
6684 	parent_level = btrfs_header_level(parent);
6685 	extent_buffer_get(parent);
6686 	path->nodes[parent_level] = parent;
6687 	path->slots[parent_level] = btrfs_header_nritems(parent);
6688 
6689 	btrfs_assert_tree_locked(node);
6690 	level = btrfs_header_level(node);
6691 	path->nodes[level] = node;
6692 	path->slots[level] = 0;
6693 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6694 
6695 	wc->refs[parent_level] = 1;
6696 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6697 	wc->level = level;
6698 	wc->shared_level = -1;
6699 	wc->stage = DROP_REFERENCE;
6700 	wc->update_ref = 0;
6701 	wc->keep_locks = 1;
6702 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6703 
6704 	while (1) {
6705 		wret = walk_down_tree(trans, root, path, wc);
6706 		if (wret < 0) {
6707 			ret = wret;
6708 			break;
6709 		}
6710 
6711 		wret = walk_up_tree(trans, root, path, wc, parent_level);
6712 		if (wret < 0)
6713 			ret = wret;
6714 		if (wret != 0)
6715 			break;
6716 	}
6717 
6718 	kfree(wc);
6719 	btrfs_free_path(path);
6720 	return ret;
6721 }
6722 
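/*
 * adjust the raid profile in @flags to match the current number of
 * writeable (plus missing) devices: mirroring becomes dup and raid0 becomes
 * single on a one-device fs, while dup and single become raid1 and raid0
 * again once more devices are available
 */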
6723 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6724 {
6725 	u64 num_devices;
6726 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6727 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6728 
6729 	/*
6730 	 * we add in the count of missing devices because we want
6731 	 * to make sure that any RAID levels on a degraded FS
6732 	 * continue to be honored.
6733 	 */
6734 	num_devices = root->fs_info->fs_devices->rw_devices +
6735 		root->fs_info->fs_devices->missing_devices;
6736 
6737 	if (num_devices == 1) {
6738 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6739 		stripped = flags & ~stripped;
6740 
6741 		/* turn raid0 into single device chunks */
6742 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
6743 			return stripped;
6744 
6745 		/* turn mirroring into duplication */
6746 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6747 			     BTRFS_BLOCK_GROUP_RAID10))
6748 			return stripped | BTRFS_BLOCK_GROUP_DUP;
6749 		return flags;
6750 	} else {
6751 		/* they already had raid on here, just return */
6752 		if (flags & stripped)
6753 			return flags;
6754 
6755 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6756 		stripped = flags & ~stripped;
6757 
6758 		/* switch duplicated blocks with raid1 */
6759 		if (flags & BTRFS_BLOCK_GROUP_DUP)
6760 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
6761 
6762 		/* turn single device chunks into raid0 */
6763 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
6764 	}
6765 	return flags;
6766 }
6767 
6768 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6769 {
6770 	struct btrfs_space_info *sinfo = cache->space_info;
6771 	u64 num_bytes;
6772 	u64 min_allocable_bytes;
6773 	int ret = -ENOSPC;
6774 
6775 
6776 	/*
6777 	 * We need to keep some metadata and system metadata space free
6778 	 * for allocating chunks in some corner cases, unless the caller
6779 	 * forces the group to be set read-only.
6780 	 */
6781 	if ((sinfo->flags &
6782 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6783 	    !force)
6784 		min_allocable_bytes = 1 * 1024 * 1024;
6785 	else
6786 		min_allocable_bytes = 0;
6787 
6788 	spin_lock(&sinfo->lock);
6789 	spin_lock(&cache->lock);
6790 
6791 	if (cache->ro) {
6792 		ret = 0;
6793 		goto out;
6794 	}
6795 
6796 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6797 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
6798 
6799 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6800 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6801 	    min_allocable_bytes <= sinfo->total_bytes) {
6802 		sinfo->bytes_readonly += num_bytes;
6803 		cache->ro = 1;
6804 		ret = 0;
6805 	}
6806 out:
6807 	spin_unlock(&cache->lock);
6808 	spin_unlock(&sinfo->lock);
6809 	return ret;
6810 }
6811 
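/*
 * try to mark a block group read-only.  a chunk with the updated raid
 * profile is force-allocated first so new allocations have somewhere to go;
 * if the group still can't be made read-only, one more forced chunk
 * allocation is attempted before giving up.
 */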
6812 int btrfs_set_block_group_ro(struct btrfs_root *root,
6813 			     struct btrfs_block_group_cache *cache)
6814 
6815 {
6816 	struct btrfs_trans_handle *trans;
6817 	u64 alloc_flags;
6818 	int ret;
6819 
6820 	BUG_ON(cache->ro);
6821 
6822 	trans = btrfs_join_transaction(root);
6823 	BUG_ON(IS_ERR(trans));
6824 
6825 	alloc_flags = update_block_group_flags(root, cache->flags);
6826 	if (alloc_flags != cache->flags)
6827 		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6828 			       CHUNK_ALLOC_FORCE);
6829 
6830 	ret = set_block_group_ro(cache, 0);
6831 	if (!ret)
6832 		goto out;
6833 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
6834 	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6835 			     CHUNK_ALLOC_FORCE);
6836 	if (ret < 0)
6837 		goto out;
6838 	ret = set_block_group_ro(cache, 0);
6839 out:
6840 	btrfs_end_transaction(trans, root);
6841 	return ret;
6842 }
6843 
6844 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6845 			    struct btrfs_root *root, u64 type)
6846 {
6847 	u64 alloc_flags = get_alloc_profile(root, type);
6848 	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6849 			      CHUNK_ALLOC_FORCE);
6850 }
6851 
6852 /*
6853  * helper to account the unused space of all the readonly block groups in the
6854  * list. takes mirrors into account.
6855  */
6856 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6857 {
6858 	struct btrfs_block_group_cache *block_group;
6859 	u64 free_bytes = 0;
6860 	int factor;
6861 
6862 	list_for_each_entry(block_group, groups_list, list) {
6863 		spin_lock(&block_group->lock);
6864 
6865 		if (!block_group->ro) {
6866 			spin_unlock(&block_group->lock);
6867 			continue;
6868 		}
6869 
6870 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6871 					  BTRFS_BLOCK_GROUP_RAID10 |
6872 					  BTRFS_BLOCK_GROUP_DUP))
6873 			factor = 2;
6874 		else
6875 			factor = 1;
6876 
6877 		free_bytes += (block_group->key.offset -
6878 			       btrfs_block_group_used(&block_group->item)) *
6879 			       factor;
6880 
6881 		spin_unlock(&block_group->lock);
6882 	}
6883 
6884 	return free_bytes;
6885 }
6886 
6887 /*
6888  * helper to account the unused space of all the readonly block groups in the
6889  * space_info. takes mirrors into account.
6890  */
6891 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6892 {
6893 	int i;
6894 	u64 free_bytes = 0;
6895 
6896 	spin_lock(&sinfo->lock);
6897 
6898 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6899 		if (!list_empty(&sinfo->block_groups[i]))
6900 			free_bytes += __btrfs_get_ro_block_group_free_space(
6901 						&sinfo->block_groups[i]);
6902 
6903 	spin_unlock(&sinfo->lock);
6904 
6905 	return free_bytes;
6906 }
6907 
6908 int btrfs_set_block_group_rw(struct btrfs_root *root,
6909 			      struct btrfs_block_group_cache *cache)
6910 {
6911 	struct btrfs_space_info *sinfo = cache->space_info;
6912 	u64 num_bytes;
6913 
6914 	BUG_ON(!cache->ro);
6915 
6916 	spin_lock(&sinfo->lock);
6917 	spin_lock(&cache->lock);
6918 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6919 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
6920 	sinfo->bytes_readonly -= num_bytes;
6921 	cache->ro = 0;
6922 	spin_unlock(&cache->lock);
6923 	spin_unlock(&sinfo->lock);
6924 	return 0;
6925 }
6926 
6927 /*
6928  * checks to see if it's even possible to relocate this block group.
6929  *
6930  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
6931  * ok to go ahead and try.
6932  */
6933 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6934 {
6935 	struct btrfs_block_group_cache *block_group;
6936 	struct btrfs_space_info *space_info;
6937 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6938 	struct btrfs_device *device;
6939 	u64 min_free;
6940 	u64 dev_min = 1;
6941 	u64 dev_nr = 0;
6942 	int index;
6943 	int full = 0;
6944 	int ret = 0;
6945 
6946 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6947 
6948 	/* odd, couldn't find the block group, leave it alone */
6949 	if (!block_group)
6950 		return -1;
6951 
6952 	min_free = btrfs_block_group_used(&block_group->item);
6953 
6954 	/* no bytes used, we're good */
6955 	if (!min_free)
6956 		goto out;
6957 
6958 	space_info = block_group->space_info;
6959 	spin_lock(&space_info->lock);
6960 
6961 	full = space_info->full;
6962 
6963 	/*
6964 	 * if this is the last block group we have in this space, we can't
6965 	 * relocate it unless we're able to allocate a new chunk below.
6966 	 *
6967 	 * Otherwise, we need to make sure we have room in the space to handle
6968 	 * all of the extents from this block group.  If we can, we're good
6969 	 */
6970 	if ((space_info->total_bytes != block_group->key.offset) &&
6971 	    (space_info->bytes_used + space_info->bytes_reserved +
6972 	     space_info->bytes_pinned + space_info->bytes_readonly +
6973 	     min_free < space_info->total_bytes)) {
6974 		spin_unlock(&space_info->lock);
6975 		goto out;
6976 	}
6977 	spin_unlock(&space_info->lock);
6978 
6979 	/*
6980 	 * ok we don't have enough space, but maybe we have free space on our
6981 	 * devices to allocate new chunks for relocation, so loop through our
6982 	 * alloc devices and guess if we have enough space.  However, if we
6983 	 * were marked as full, then we know there aren't enough chunks, and we
6984 	 * can just return.
6985 	 */
6986 	ret = -1;
6987 	if (full)
6988 		goto out;
6989 
6990 	/*
6991 	 * index:
6992 	 *      0: raid10
6993 	 *      1: raid1
6994 	 *      2: dup
6995 	 *      3: raid0
6996 	 *      4: single
6997 	 */
6998 	index = get_block_group_index(block_group);
6999 	if (index == 0) {
7000 		dev_min = 4;
7001 		/* Divide by 2 */
7002 		min_free >>= 1;
7003 	} else if (index == 1) {
7004 		dev_min = 2;
7005 	} else if (index == 2) {
7006 		/* Multiply by 2 */
7007 		min_free <<= 1;
7008 	} else if (index == 3) {
7009 		dev_min = fs_devices->rw_devices;
7010 		do_div(min_free, dev_min);
7011 	}
7012 
7013 	mutex_lock(&root->fs_info->chunk_mutex);
7014 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7015 		u64 dev_offset;
7016 
7017 		/*
7018 		 * check to make sure we can actually find a chunk with enough
7019 		 * space to fit our block group in.
7020 		 */
7021 		if (device->total_bytes > device->bytes_used + min_free) {
7022 			ret = find_free_dev_extent(NULL, device, min_free,
7023 						   &dev_offset, NULL);
7024 			if (!ret)
7025 				dev_nr++;
7026 
7027 			if (dev_nr >= dev_min)
7028 				break;
7029 
7030 			ret = -1;
7031 		}
7032 	}
7033 	mutex_unlock(&root->fs_info->chunk_mutex);
7034 out:
7035 	btrfs_put_block_group(block_group);
7036 	return ret;
7037 }
7038 
7039 static int find_first_block_group(struct btrfs_root *root,
7040 		struct btrfs_path *path, struct btrfs_key *key)
7041 {
7042 	int ret = 0;
7043 	struct btrfs_key found_key;
7044 	struct extent_buffer *leaf;
7045 	int slot;
7046 
7047 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7048 	if (ret < 0)
7049 		goto out;
7050 
7051 	while (1) {
7052 		slot = path->slots[0];
7053 		leaf = path->nodes[0];
7054 		if (slot >= btrfs_header_nritems(leaf)) {
7055 			ret = btrfs_next_leaf(root, path);
7056 			if (ret == 0)
7057 				continue;
7058 			if (ret < 0)
7059 				goto out;
7060 			break;
7061 		}
7062 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7063 
7064 		if (found_key.objectid >= key->objectid &&
7065 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7066 			ret = 0;
7067 			goto out;
7068 		}
7069 		path->slots[0]++;
7070 	}
7071 out:
7072 	return ret;
7073 }
7074 
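/*
 * drop the inode references that the free space cache code holds on each
 * block group so the inodes can be released as the filesystem is torn down
 */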
7075 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7076 {
7077 	struct btrfs_block_group_cache *block_group;
7078 	u64 last = 0;
7079 
7080 	while (1) {
7081 		struct inode *inode;
7082 
7083 		block_group = btrfs_lookup_first_block_group(info, last);
7084 		while (block_group) {
7085 			spin_lock(&block_group->lock);
7086 			if (block_group->iref)
7087 				break;
7088 			spin_unlock(&block_group->lock);
7089 			block_group = next_block_group(info->tree_root,
7090 						       block_group);
7091 		}
7092 		if (!block_group) {
7093 			if (last == 0)
7094 				break;
7095 			last = 0;
7096 			continue;
7097 		}
7098 
7099 		inode = block_group->inode;
7100 		block_group->iref = 0;
7101 		block_group->inode = NULL;
7102 		spin_unlock(&block_group->lock);
7103 		iput(inode);
7104 		last = block_group->key.objectid + block_group->key.offset;
7105 		btrfs_put_block_group(block_group);
7106 	}
7107 }
7108 
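/*
 * free every cached block group and space_info structure; only called
 * during the final stages of unmount when nothing else can be using them
 */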
7109 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7110 {
7111 	struct btrfs_block_group_cache *block_group;
7112 	struct btrfs_space_info *space_info;
7113 	struct btrfs_caching_control *caching_ctl;
7114 	struct rb_node *n;
7115 
7116 	down_write(&info->extent_commit_sem);
7117 	while (!list_empty(&info->caching_block_groups)) {
7118 		caching_ctl = list_entry(info->caching_block_groups.next,
7119 					 struct btrfs_caching_control, list);
7120 		list_del(&caching_ctl->list);
7121 		put_caching_control(caching_ctl);
7122 	}
7123 	up_write(&info->extent_commit_sem);
7124 
7125 	spin_lock(&info->block_group_cache_lock);
7126 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7127 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7128 				       cache_node);
7129 		rb_erase(&block_group->cache_node,
7130 			 &info->block_group_cache_tree);
7131 		spin_unlock(&info->block_group_cache_lock);
7132 
7133 		down_write(&block_group->space_info->groups_sem);
7134 		list_del(&block_group->list);
7135 		up_write(&block_group->space_info->groups_sem);
7136 
7137 		if (block_group->cached == BTRFS_CACHE_STARTED)
7138 			wait_block_group_cache_done(block_group);
7139 
7140 		/*
7141 		 * We haven't cached this block group, which means we could
7142 		 * possibly have excluded extents on this block group.
7143 		 */
7144 		if (block_group->cached == BTRFS_CACHE_NO)
7145 			free_excluded_extents(info->extent_root, block_group);
7146 
7147 		btrfs_remove_free_space_cache(block_group);
7148 		btrfs_put_block_group(block_group);
7149 
7150 		spin_lock(&info->block_group_cache_lock);
7151 	}
7152 	spin_unlock(&info->block_group_cache_lock);
7153 
7154 	/* now that all the block groups are freed, go through and
7155 	 * free all the space_info structs.  This is only called during
7156 	 * the final stages of unmount, and so we know nobody is
7157 	 * using them.  We call synchronize_rcu() once before we start,
7158 	 * just to be on the safe side.
7159 	 */
7160 	synchronize_rcu();
7161 
7162 	release_global_block_rsv(info);
7163 
7164 	while (!list_empty(&info->space_info)) {
7165 		space_info = list_entry(info->space_info.next,
7166 					struct btrfs_space_info,
7167 					list);
7168 		if (space_info->bytes_pinned > 0 ||
7169 		    space_info->bytes_reserved > 0 ||
7170 		    space_info->bytes_may_use > 0) {
7171 			WARN_ON(1);
7172 			dump_space_info(space_info, 0, 0);
7173 		}
7174 		list_del(&space_info->list);
7175 		kfree(space_info);
7176 	}
7177 	return 0;
7178 }
7179 
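/*
 * Add a block group to its space_info's per-profile list, indexed by the
 * value returned from get_block_group_index().
 */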
7180 static void __link_block_group(struct btrfs_space_info *space_info,
7181 			       struct btrfs_block_group_cache *cache)
7182 {
7183 	int index = get_block_group_index(cache);
7184 
7185 	down_write(&space_info->groups_sem);
7186 	list_add_tail(&cache->list, &space_info->block_groups[index]);
7187 	up_write(&space_info->groups_sem);
7188 }
7189 
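/*
 * Build the in-memory block group cache from the BLOCK_GROUP_ITEMs in the
 * extent tree: allocate a cache struct for every item, exclude the super
 * block stripes, hook each group up to its space_info, and mark
 * un-mirrored groups read-only when mirrored profiles are present.
 */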
7190 int btrfs_read_block_groups(struct btrfs_root *root)
7191 {
7192 	struct btrfs_path *path;
7193 	int ret;
7194 	struct btrfs_block_group_cache *cache;
7195 	struct btrfs_fs_info *info = root->fs_info;
7196 	struct btrfs_space_info *space_info;
7197 	struct btrfs_key key;
7198 	struct btrfs_key found_key;
7199 	struct extent_buffer *leaf;
7200 	int need_clear = 0;
7201 	u64 cache_gen;
7202 
7203 	root = info->extent_root;
7204 	key.objectid = 0;
7205 	key.offset = 0;
7206 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7207 	path = btrfs_alloc_path();
7208 	if (!path)
7209 		return -ENOMEM;
7210 	path->reada = 1;
7211 
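	/*
	 * With space_cache enabled, the on-disk free space cache is only
	 * trusted when the super block's cache_generation matches its
	 * generation; if it doesn't, or if the user mounted with
	 * clear_cache, mark every block group so its cache is written out
	 * fresh.
	 */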
7212 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7213 	if (btrfs_test_opt(root, SPACE_CACHE) &&
7214 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7215 		need_clear = 1;
7216 	if (btrfs_test_opt(root, CLEAR_CACHE))
7217 		need_clear = 1;
7218 
7219 	while (1) {
7220 		ret = find_first_block_group(root, path, &key);
7221 		if (ret > 0)
7222 			break;
7223 		if (ret != 0)
7224 			goto error;
7225 		leaf = path->nodes[0];
7226 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7227 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7228 		if (!cache) {
7229 			ret = -ENOMEM;
7230 			goto error;
7231 		}
7232 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7233 						GFP_NOFS);
7234 		if (!cache->free_space_ctl) {
7235 			kfree(cache);
7236 			ret = -ENOMEM;
7237 			goto error;
7238 		}
7239 
7240 		atomic_set(&cache->count, 1);
7241 		spin_lock_init(&cache->lock);
7242 		cache->fs_info = info;
7243 		INIT_LIST_HEAD(&cache->list);
7244 		INIT_LIST_HEAD(&cache->cluster_list);
7245 
7246 		if (need_clear)
7247 			cache->disk_cache_state = BTRFS_DC_CLEAR;
7248 
7249 		read_extent_buffer(leaf, &cache->item,
7250 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7251 				   sizeof(cache->item));
7252 		memcpy(&cache->key, &found_key, sizeof(found_key));
7253 
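		/* Advance the search key past the block group we just read. */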
7254 		key.objectid = found_key.objectid + found_key.offset;
7255 		btrfs_release_path(path);
7256 		cache->flags = btrfs_block_group_flags(&cache->item);
7257 		cache->sectorsize = root->sectorsize;
7258 
7259 		btrfs_init_free_space_ctl(cache);
7260 
7261 		/*
7262 		 * We need to exclude the super stripes now so that the space
7263 		 * info has super bytes accounted for; otherwise we'll think
7264 		 * we have more space than we actually do.
7265 		 */
7266 		exclude_super_stripes(root, cache);
7267 
7268 		/*
7269 		 * Check for two cases: either we are full, and therefore
7270 		 * don't need to bother with the caching work since we won't
7271 		 * find any space, or we are empty, and we can just add all
7272 		 * the space in and be done with it.  This saves us a lot of
7273 		 * time, particularly in the full case.
7274 		 */
7275 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7276 			cache->last_byte_to_unpin = (u64)-1;
7277 			cache->cached = BTRFS_CACHE_FINISHED;
7278 			free_excluded_extents(root, cache);
7279 		} else if (btrfs_block_group_used(&cache->item) == 0) {
7280 			cache->last_byte_to_unpin = (u64)-1;
7281 			cache->cached = BTRFS_CACHE_FINISHED;
7282 			add_new_free_space(cache, root->fs_info,
7283 					   found_key.objectid,
7284 					   found_key.objectid +
7285 					   found_key.offset);
7286 			free_excluded_extents(root, cache);
7287 		}
7288 
7289 		ret = update_space_info(info, cache->flags, found_key.offset,
7290 					btrfs_block_group_used(&cache->item),
7291 					&space_info);
7292 		BUG_ON(ret);
7293 		cache->space_info = space_info;
7294 		spin_lock(&cache->space_info->lock);
7295 		cache->space_info->bytes_readonly += cache->bytes_super;
7296 		spin_unlock(&cache->space_info->lock);
7297 
7298 		__link_block_group(space_info, cache);
7299 
7300 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7301 		BUG_ON(ret);
7302 
7303 		set_avail_alloc_bits(root->fs_info, cache->flags);
7304 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7305 			set_block_group_ro(cache, 1);
7306 	}
7307 
7308 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7309 		if (!(get_alloc_profile(root, space_info->flags) &
7310 		      (BTRFS_BLOCK_GROUP_RAID10 |
7311 		       BTRFS_BLOCK_GROUP_RAID1 |
7312 		       BTRFS_BLOCK_GROUP_DUP)))
7313 			continue;
7314 		/*
7315 		 * Avoid allocating from un-mirrored block groups (list indexes
7316 		 * 3 and 4: RAID0 and SINGLE) if there are mirrored block groups.
7317 		 */
7318 		list_for_each_entry(cache, &space_info->block_groups[3], list)
7319 			set_block_group_ro(cache, 1);
7320 		list_for_each_entry(cache, &space_info->block_groups[4], list)
7321 			set_block_group_ro(cache, 1);
7322 	}
7323 
7324 	init_global_block_rsv(info);
7325 	ret = 0;
7326 error:
7327 	btrfs_free_path(path);
7328 	return ret;
7329 }
7330 
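/*
 * Create the block group for a freshly allocated chunk: set up the
 * in-memory cache structure, account the new space via update_space_info(),
 * link it into the space_info and block group cache, and insert the
 * BLOCK_GROUP_ITEM into the extent tree.
 */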
7331 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7332 			   struct btrfs_root *root, u64 bytes_used,
7333 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7334 			   u64 size)
7335 {
7336 	int ret;
7337 	struct btrfs_root *extent_root;
7338 	struct btrfs_block_group_cache *cache;
7339 
7340 	extent_root = root->fs_info->extent_root;
7341 
7342 	root->fs_info->last_trans_log_full_commit = trans->transid;
7343 
7344 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7345 	if (!cache)
7346 		return -ENOMEM;
7347 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7348 					GFP_NOFS);
7349 	if (!cache->free_space_ctl) {
7350 		kfree(cache);
7351 		return -ENOMEM;
7352 	}
7353 
7354 	cache->key.objectid = chunk_offset;
7355 	cache->key.offset = size;
7356 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7357 	cache->sectorsize = root->sectorsize;
7358 	cache->fs_info = root->fs_info;
7359 
7360 	atomic_set(&cache->count, 1);
7361 	spin_lock_init(&cache->lock);
7362 	INIT_LIST_HEAD(&cache->list);
7363 	INIT_LIST_HEAD(&cache->cluster_list);
7364 
7365 	btrfs_init_free_space_ctl(cache);
7366 
7367 	btrfs_set_block_group_used(&cache->item, bytes_used);
7368 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7369 	cache->flags = type;
7370 	btrfs_set_block_group_flags(&cache->item, type);
7371 
7372 	cache->last_byte_to_unpin = (u64)-1;
7373 	cache->cached = BTRFS_CACHE_FINISHED;
7374 	exclude_super_stripes(root, cache);
7375 
7376 	add_new_free_space(cache, root->fs_info, chunk_offset,
7377 			   chunk_offset + size);
7378 
7379 	free_excluded_extents(root, cache);
7380 
7381 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7382 				&cache->space_info);
7383 	BUG_ON(ret);
7384 
7385 	spin_lock(&cache->space_info->lock);
7386 	cache->space_info->bytes_readonly += cache->bytes_super;
7387 	spin_unlock(&cache->space_info->lock);
7388 
7389 	__link_block_group(cache->space_info, cache);
7390 
7391 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7392 	BUG_ON(ret);
7393 
7394 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7395 				sizeof(cache->item));
7396 	BUG_ON(ret);
7397 
7398 	set_avail_alloc_bits(extent_root->fs_info, type);
7399 
7400 	return 0;
7401 }
7402 
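/*
 * Delete the (read-only) block group starting at group_start: get rid of
 * its free space cache inode and item, remove it from the block group
 * cache tree and its space_info lists, adjust the space_info counters and
 * delete its BLOCK_GROUP_ITEM from the extent tree.
 */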
7403 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7404 			     struct btrfs_root *root, u64 group_start)
7405 {
7406 	struct btrfs_path *path;
7407 	struct btrfs_block_group_cache *block_group;
7408 	struct btrfs_free_cluster *cluster;
7409 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7410 	struct btrfs_key key;
7411 	struct inode *inode;
7412 	int ret;
7413 	int factor;
7414 
7415 	root = root->fs_info->extent_root;
7416 
7417 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7418 	BUG_ON(!block_group);
7419 	BUG_ON(!block_group->ro);
7420 
7421 	/*
7422 	 * Free the reserved super bytes from this block group before
7423 	 * removing it.
7424 	 */
7425 	free_excluded_extents(root, block_group);
7426 
7427 	memcpy(&key, &block_group->key, sizeof(key));
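	/*
	 * DUP, RAID1 and RAID10 keep two copies of every byte, so the
	 * disk_total adjustment below has to be scaled by a factor of two.
	 */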
7428 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7429 				  BTRFS_BLOCK_GROUP_RAID1 |
7430 				  BTRFS_BLOCK_GROUP_RAID10))
7431 		factor = 2;
7432 	else
7433 		factor = 1;
7434 
7435 	/* make sure this block group isn't part of an allocation cluster */
7436 	cluster = &root->fs_info->data_alloc_cluster;
7437 	spin_lock(&cluster->refill_lock);
7438 	btrfs_return_cluster_to_free_space(block_group, cluster);
7439 	spin_unlock(&cluster->refill_lock);
7440 
7441 	/*
7442 	 * make sure this block group isn't part of a metadata
7443 	 * allocation cluster
7444 	 */
7445 	cluster = &root->fs_info->meta_alloc_cluster;
7446 	spin_lock(&cluster->refill_lock);
7447 	btrfs_return_cluster_to_free_space(block_group, cluster);
7448 	spin_unlock(&cluster->refill_lock);
7449 
7450 	path = btrfs_alloc_path();
7451 	if (!path) {
7452 		ret = -ENOMEM;
7453 		goto out;
7454 	}
7455 
7456 	inode = lookup_free_space_inode(tree_root, block_group, path);
7457 	if (!IS_ERR(inode)) {
7458 		ret = btrfs_orphan_add(trans, inode);
7459 		BUG_ON(ret);
7460 		clear_nlink(inode);
7461 		/* One for the block group's ref */
7462 		spin_lock(&block_group->lock);
7463 		if (block_group->iref) {
7464 			block_group->iref = 0;
7465 			block_group->inode = NULL;
7466 			spin_unlock(&block_group->lock);
7467 			iput(inode);
7468 		} else {
7469 			spin_unlock(&block_group->lock);
7470 		}
7471 		/* One for our lookup ref */
7472 		btrfs_add_delayed_iput(inode);
7473 	}
7474 
7475 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7476 	key.offset = block_group->key.objectid;
7477 	key.type = 0;
7478 
7479 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7480 	if (ret < 0)
7481 		goto out;
7482 	if (ret > 0)
7483 		btrfs_release_path(path);
7484 	if (ret == 0) {
7485 		ret = btrfs_del_item(trans, tree_root, path);
7486 		if (ret)
7487 			goto out;
7488 		btrfs_release_path(path);
7489 	}
7490 
7491 	spin_lock(&root->fs_info->block_group_cache_lock);
7492 	rb_erase(&block_group->cache_node,
7493 		 &root->fs_info->block_group_cache_tree);
7494 	spin_unlock(&root->fs_info->block_group_cache_lock);
7495 
7496 	down_write(&block_group->space_info->groups_sem);
7497 	/*
7498 	 * We must use list_del_init so other code can check whether the
7499 	 * block group is still on the list after taking the semaphore.
7500 	 */
7501 	list_del_init(&block_group->list);
7502 	up_write(&block_group->space_info->groups_sem);
7503 
7504 	if (block_group->cached == BTRFS_CACHE_STARTED)
7505 		wait_block_group_cache_done(block_group);
7506 
7507 	btrfs_remove_free_space_cache(block_group);
7508 
7509 	spin_lock(&block_group->space_info->lock);
7510 	block_group->space_info->total_bytes -= block_group->key.offset;
7511 	block_group->space_info->bytes_readonly -= block_group->key.offset;
7512 	block_group->space_info->disk_total -= block_group->key.offset * factor;
7513 	spin_unlock(&block_group->space_info->lock);
7514 
7515 	memcpy(&key, &block_group->key, sizeof(key));
7516 
7517 	btrfs_clear_space_info_full(root->fs_info);
7518 
7519 	btrfs_put_block_group(block_group);
7520 	btrfs_put_block_group(block_group);
7521 
7522 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7523 	if (ret > 0)
7524 		ret = -EIO;
7525 	if (ret < 0)
7526 		goto out;
7527 
7528 	ret = btrfs_del_item(trans, root, path);
7529 out:
7530 	btrfs_free_path(path);
7531 	return ret;
7532 }
7533 
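/*
 * Create the initial (empty) space_info structures: one for the SYSTEM
 * profile and either a single mixed DATA|METADATA space_info or separate
 * DATA and METADATA ones, depending on the MIXED_GROUPS incompat flag.
 */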
7534 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7535 {
7536 	struct btrfs_space_info *space_info;
7537 	struct btrfs_super_block *disk_super;
7538 	u64 features;
7539 	u64 flags;
7540 	int mixed = 0;
7541 	int ret;
7542 
7543 	disk_super = fs_info->super_copy;
7544 	if (!btrfs_super_root(disk_super))
7545 		return 1;
7546 
7547 	features = btrfs_super_incompat_flags(disk_super);
7548 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7549 		mixed = 1;
7550 
7551 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
7552 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7553 	if (ret)
7554 		goto out;
7555 
7556 	if (mixed) {
7557 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7558 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7559 	} else {
7560 		flags = BTRFS_BLOCK_GROUP_METADATA;
7561 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7562 		if (ret)
7563 			goto out;
7564 
7565 		flags = BTRFS_BLOCK_GROUP_DATA;
7566 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7567 	}
7568 out:
7569 	return ret;
7570 }
7571 
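/*
 * Thin wrappers around unpin_extent_range() and btrfs_discard_extent()
 * for use by the error handling code.
 */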
7572 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7573 {
7574 	return unpin_extent_range(root, start, end);
7575 }
7576 
7577 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7578 			       u64 num_bytes, u64 *actual_bytes)
7579 {
7580 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7581 }
7582 
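/*
 * Back end of the FITRIM ioctl: walk every block group that overlaps
 * [range->start, range->start + range->len), make sure its free space is
 * cached, and discard every free extent of at least range->minlen bytes.
 * The total number of bytes trimmed is returned in range->len.
 */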
7583 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7584 {
7585 	struct btrfs_fs_info *fs_info = root->fs_info;
7586 	struct btrfs_block_group_cache *cache = NULL;
7587 	u64 group_trimmed;
7588 	u64 start;
7589 	u64 end;
7590 	u64 trimmed = 0;
7591 	int ret = 0;
7592 
7593 	cache = btrfs_lookup_block_group(fs_info, range->start);
7594 
7595 	while (cache) {
7596 		if (cache->key.objectid >= (range->start + range->len)) {
7597 			btrfs_put_block_group(cache);
7598 			break;
7599 		}
7600 
7601 		start = max(range->start, cache->key.objectid);
7602 		end = min(range->start + range->len,
7603 				cache->key.objectid + cache->key.offset);
7604 
7605 		if (end - start >= range->minlen) {
7606 			if (!block_group_cache_done(cache)) {
7607 				ret = cache_block_group(cache, NULL, root, 0);
7608 				if (!ret)
7609 					wait_block_group_cache_done(cache);
7610 			}
7611 			ret = btrfs_trim_block_group(cache,
7612 						     &group_trimmed,
7613 						     start,
7614 						     end,
7615 						     range->minlen);
7616 
7617 			trimmed += group_trimmed;
7618 			if (ret) {
7619 				btrfs_put_block_group(cache);
7620 				break;
7621 			}
7622 		}
7623 
7624 		cache = next_block_group(fs_info->tree_root, cache);
7625 	}
7626 
7627 	range->len = trimmed;
7628 	return ret;
7629 }
7630