xref: /linux/fs/btrfs/extent-tree.c (revision a67ff6a54095e27093ea501fb143fefe51a536c2)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 
37 /* control flags for do_chunk_alloc's force field
38  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
39  * if we really need one.
40  *
41  * CHUNK_ALLOC_FORCE means it must try to allocate one
42  *
43  * CHUNK_ALLOC_LIMITED means to only try and allocate one
44  * if we have very few chunks already allocated.  This is
45  * used as part of the clustering code to help make sure
46  * we have a good pool of storage to cluster in, without
47  * filling the FS with empty chunks.
48  *
49  */
50 enum {
51 	CHUNK_ALLOC_NO_FORCE = 0,
52 	CHUNK_ALLOC_FORCE = 1,
53 	CHUNK_ALLOC_LIMITED = 2,
54 };
55 
56 /*
57  * Control how reservations are dealt with.
58  *
59  * RESERVE_FREE - freeing a reservation.
60  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61  *   ENOSPC accounting
62  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63  *   bytes_may_use as the ENOSPC accounting is done elsewhere
64  */
65 enum {
66 	RESERVE_FREE = 0,
67 	RESERVE_ALLOC = 1,
68 	RESERVE_ALLOC_NO_ACCOUNT = 2,
69 };
70 
71 static int update_block_group(struct btrfs_trans_handle *trans,
72 			      struct btrfs_root *root,
73 			      u64 bytenr, u64 num_bytes, int alloc);
74 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
75 				struct btrfs_root *root,
76 				u64 bytenr, u64 num_bytes, u64 parent,
77 				u64 root_objectid, u64 owner_objectid,
78 				u64 owner_offset, int refs_to_drop,
79 				struct btrfs_delayed_extent_op *extra_op);
80 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
81 				    struct extent_buffer *leaf,
82 				    struct btrfs_extent_item *ei);
83 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84 				      struct btrfs_root *root,
85 				      u64 parent, u64 root_objectid,
86 				      u64 flags, u64 owner, u64 offset,
87 				      struct btrfs_key *ins, int ref_mod);
88 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
89 				     struct btrfs_root *root,
90 				     u64 parent, u64 root_objectid,
91 				     u64 flags, struct btrfs_disk_key *key,
92 				     int level, struct btrfs_key *ins);
93 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
94 			  struct btrfs_root *extent_root, u64 alloc_bytes,
95 			  u64 flags, int force);
96 static int find_next_key(struct btrfs_path *path, int level,
97 			 struct btrfs_key *key);
98 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
99 			    int dump_block_groups);
100 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 				       u64 num_bytes, int reserve);
102 
103 static noinline int
104 block_group_cache_done(struct btrfs_block_group_cache *cache)
105 {
106 	smp_mb();
107 	return cache->cached == BTRFS_CACHE_FINISHED;
108 }
109 
110 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
111 {
112 	return (cache->flags & bits) == bits;
113 }
114 
115 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
116 {
117 	atomic_inc(&cache->count);
118 }
119 
120 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
121 {
122 	if (atomic_dec_and_test(&cache->count)) {
123 		WARN_ON(cache->pinned > 0);
124 		WARN_ON(cache->reserved > 0);
125 		kfree(cache->free_space_ctl);
126 		kfree(cache);
127 	}
128 }
129 
130 /*
131  * this adds the block group to the fs_info rb tree for the block group
132  * cache
133  */
134 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
135 				struct btrfs_block_group_cache *block_group)
136 {
137 	struct rb_node **p;
138 	struct rb_node *parent = NULL;
139 	struct btrfs_block_group_cache *cache;
140 
141 	spin_lock(&info->block_group_cache_lock);
142 	p = &info->block_group_cache_tree.rb_node;
143 
144 	while (*p) {
145 		parent = *p;
146 		cache = rb_entry(parent, struct btrfs_block_group_cache,
147 				 cache_node);
148 		if (block_group->key.objectid < cache->key.objectid) {
149 			p = &(*p)->rb_left;
150 		} else if (block_group->key.objectid > cache->key.objectid) {
151 			p = &(*p)->rb_right;
152 		} else {
153 			spin_unlock(&info->block_group_cache_lock);
154 			return -EEXIST;
155 		}
156 	}
157 
158 	rb_link_node(&block_group->cache_node, parent, p);
159 	rb_insert_color(&block_group->cache_node,
160 			&info->block_group_cache_tree);
161 	spin_unlock(&info->block_group_cache_lock);
162 
163 	return 0;
164 }
165 
166 /*
167  * This will return the block group at or after bytenr if contains is 0, else
168  * it will return the block group that contains the bytenr
169  */
170 static struct btrfs_block_group_cache *
171 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
172 			      int contains)
173 {
174 	struct btrfs_block_group_cache *cache, *ret = NULL;
175 	struct rb_node *n;
176 	u64 end, start;
177 
178 	spin_lock(&info->block_group_cache_lock);
179 	n = info->block_group_cache_tree.rb_node;
180 
181 	while (n) {
182 		cache = rb_entry(n, struct btrfs_block_group_cache,
183 				 cache_node);
184 		end = cache->key.objectid + cache->key.offset - 1;
185 		start = cache->key.objectid;
186 
187 		if (bytenr < start) {
188 			if (!contains && (!ret || start < ret->key.objectid))
189 				ret = cache;
190 			n = n->rb_left;
191 		} else if (bytenr > start) {
192 			if (contains && bytenr <= end) {
193 				ret = cache;
194 				break;
195 			}
196 			n = n->rb_right;
197 		} else {
198 			ret = cache;
199 			break;
200 		}
201 	}
202 	if (ret)
203 		btrfs_get_block_group(ret);
204 	spin_unlock(&info->block_group_cache_lock);
205 
206 	return ret;
207 }
208 
209 static int add_excluded_extent(struct btrfs_root *root,
210 			       u64 start, u64 num_bytes)
211 {
212 	u64 end = start + num_bytes - 1;
213 	set_extent_bits(&root->fs_info->freed_extents[0],
214 			start, end, EXTENT_UPTODATE, GFP_NOFS);
215 	set_extent_bits(&root->fs_info->freed_extents[1],
216 			start, end, EXTENT_UPTODATE, GFP_NOFS);
217 	return 0;
218 }
219 
220 static void free_excluded_extents(struct btrfs_root *root,
221 				  struct btrfs_block_group_cache *cache)
222 {
223 	u64 start, end;
224 
225 	start = cache->key.objectid;
226 	end = start + cache->key.offset - 1;
227 
228 	clear_extent_bits(&root->fs_info->freed_extents[0],
229 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
230 	clear_extent_bits(&root->fs_info->freed_extents[1],
231 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
232 }
233 
234 static int exclude_super_stripes(struct btrfs_root *root,
235 				 struct btrfs_block_group_cache *cache)
236 {
237 	u64 bytenr;
238 	u64 *logical;
239 	int stripe_len;
240 	int i, nr, ret;
241 
242 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
243 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
244 		cache->bytes_super += stripe_len;
245 		ret = add_excluded_extent(root, cache->key.objectid,
246 					  stripe_len);
247 		BUG_ON(ret);
248 	}
249 
250 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
251 		bytenr = btrfs_sb_offset(i);
252 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
253 				       cache->key.objectid, bytenr,
254 				       0, &logical, &nr, &stripe_len);
255 		BUG_ON(ret);
256 
257 		while (nr--) {
258 			cache->bytes_super += stripe_len;
259 			ret = add_excluded_extent(root, logical[nr],
260 						  stripe_len);
261 			BUG_ON(ret);
262 		}
263 
264 		kfree(logical);
265 	}
266 	return 0;
267 }
268 
269 static struct btrfs_caching_control *
270 get_caching_control(struct btrfs_block_group_cache *cache)
271 {
272 	struct btrfs_caching_control *ctl;
273 
274 	spin_lock(&cache->lock);
275 	if (cache->cached != BTRFS_CACHE_STARTED) {
276 		spin_unlock(&cache->lock);
277 		return NULL;
278 	}
279 
280 	/* We're loading it the fast way, so we don't have a caching_ctl. */
281 	if (!cache->caching_ctl) {
282 		spin_unlock(&cache->lock);
283 		return NULL;
284 	}
285 
286 	ctl = cache->caching_ctl;
287 	atomic_inc(&ctl->count);
288 	spin_unlock(&cache->lock);
289 	return ctl;
290 }
291 
292 static void put_caching_control(struct btrfs_caching_control *ctl)
293 {
294 	if (atomic_dec_and_test(&ctl->count))
295 		kfree(ctl);
296 }
297 
298 /*
299  * this is only called by cache_block_group.  Since we could have freed extents,
300  * we need to check the pinned_extents for any extents that can't be used yet,
301  * because their free space will not be released until the transaction commits.
302  */
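/*
 * Added editorial note (illustrative, not part of the original source): if
 * the range [start, end) overlaps a pinned extent [p_start, p_end], only
 * [start, p_start) is handed to btrfs_add_free_space() here.  The overlapping
 * bytes become usable later, once the transaction commits and the pinned
 * range is unpinned.
 */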
303 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
304 			      struct btrfs_fs_info *info, u64 start, u64 end)
305 {
306 	u64 extent_start, extent_end, size, total_added = 0;
307 	int ret;
308 
309 	while (start < end) {
310 		ret = find_first_extent_bit(info->pinned_extents, start,
311 					    &extent_start, &extent_end,
312 					    EXTENT_DIRTY | EXTENT_UPTODATE);
313 		if (ret)
314 			break;
315 
316 		if (extent_start <= start) {
317 			start = extent_end + 1;
318 		} else if (extent_start > start && extent_start < end) {
319 			size = extent_start - start;
320 			total_added += size;
321 			ret = btrfs_add_free_space(block_group, start,
322 						   size);
323 			BUG_ON(ret);
324 			start = extent_end + 1;
325 		} else {
326 			break;
327 		}
328 	}
329 
330 	if (start < end) {
331 		size = end - start;
332 		total_added += size;
333 		ret = btrfs_add_free_space(block_group, start, size);
334 		BUG_ON(ret);
335 	}
336 
337 	return total_added;
338 }
339 
340 static noinline void caching_thread(struct btrfs_work *work)
341 {
342 	struct btrfs_block_group_cache *block_group;
343 	struct btrfs_fs_info *fs_info;
344 	struct btrfs_caching_control *caching_ctl;
345 	struct btrfs_root *extent_root;
346 	struct btrfs_path *path;
347 	struct extent_buffer *leaf;
348 	struct btrfs_key key;
349 	u64 total_found = 0;
350 	u64 last = 0;
351 	u32 nritems;
352 	int ret = 0;
353 
354 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
355 	block_group = caching_ctl->block_group;
356 	fs_info = block_group->fs_info;
357 	extent_root = fs_info->extent_root;
358 
359 	path = btrfs_alloc_path();
360 	if (!path)
361 		goto out;
362 
363 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
364 
365 	/*
366 	 * We don't want to deadlock with somebody trying to allocate a new
367 	 * extent for the extent root while also trying to search the extent
368 	 * root to add free space.  So we skip locking and search the commit
369 	 * root, since it's read-only.
370 	 */
371 	path->skip_locking = 1;
372 	path->search_commit_root = 1;
373 	path->reada = 1;
374 
375 	key.objectid = last;
376 	key.offset = 0;
377 	key.type = BTRFS_EXTENT_ITEM_KEY;
378 again:
379 	mutex_lock(&caching_ctl->mutex);
380 	/* need to make sure the commit_root doesn't disappear */
381 	down_read(&fs_info->extent_commit_sem);
382 
383 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
384 	if (ret < 0)
385 		goto err;
386 
387 	leaf = path->nodes[0];
388 	nritems = btrfs_header_nritems(leaf);
389 
390 	while (1) {
391 		if (btrfs_fs_closing(fs_info) > 1) {
392 			last = (u64)-1;
393 			break;
394 		}
395 
396 		if (path->slots[0] < nritems) {
397 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
398 		} else {
399 			ret = find_next_key(path, 0, &key);
400 			if (ret)
401 				break;
402 
403 			if (need_resched() ||
404 			    btrfs_next_leaf(extent_root, path)) {
405 				caching_ctl->progress = last;
406 				btrfs_release_path(path);
407 				up_read(&fs_info->extent_commit_sem);
408 				mutex_unlock(&caching_ctl->mutex);
409 				cond_resched();
410 				goto again;
411 			}
412 			leaf = path->nodes[0];
413 			nritems = btrfs_header_nritems(leaf);
414 			continue;
415 		}
416 
417 		if (key.objectid < block_group->key.objectid) {
418 			path->slots[0]++;
419 			continue;
420 		}
421 
422 		if (key.objectid >= block_group->key.objectid +
423 		    block_group->key.offset)
424 			break;
425 
426 		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
427 			total_found += add_new_free_space(block_group,
428 							  fs_info, last,
429 							  key.objectid);
430 			last = key.objectid + key.offset;
431 
432 			if (total_found > (1024 * 1024 * 2)) {
433 				total_found = 0;
434 				wake_up(&caching_ctl->wait);
435 			}
436 		}
437 		path->slots[0]++;
438 	}
439 	ret = 0;
440 
441 	total_found += add_new_free_space(block_group, fs_info, last,
442 					  block_group->key.objectid +
443 					  block_group->key.offset);
444 	caching_ctl->progress = (u64)-1;
445 
446 	spin_lock(&block_group->lock);
447 	block_group->caching_ctl = NULL;
448 	block_group->cached = BTRFS_CACHE_FINISHED;
449 	spin_unlock(&block_group->lock);
450 
451 err:
452 	btrfs_free_path(path);
453 	up_read(&fs_info->extent_commit_sem);
454 
455 	free_excluded_extents(extent_root, block_group);
456 
457 	mutex_unlock(&caching_ctl->mutex);
458 out:
459 	wake_up(&caching_ctl->wait);
460 
461 	put_caching_control(caching_ctl);
462 	btrfs_put_block_group(block_group);
463 }
464 
465 static int cache_block_group(struct btrfs_block_group_cache *cache,
466 			     struct btrfs_trans_handle *trans,
467 			     struct btrfs_root *root,
468 			     int load_cache_only)
469 {
470 	struct btrfs_fs_info *fs_info = cache->fs_info;
471 	struct btrfs_caching_control *caching_ctl;
472 	int ret = 0;
473 
474 	smp_mb();
475 	if (cache->cached != BTRFS_CACHE_NO)
476 		return 0;
477 
478 	/*
479 	 * We can't do the read from on-disk cache during a commit since we need
480 	 * to have the normal tree locking.  Also if we are currently trying to
481 	 * allocate blocks for the tree root we can't do the fast caching since
482 	 * we likely hold important locks.
483 	 */
484 	if (trans && (!trans->transaction->in_commit) &&
485 	    (root && root != root->fs_info->tree_root) &&
486 	    btrfs_test_opt(root, SPACE_CACHE)) {
487 		spin_lock(&cache->lock);
488 		if (cache->cached != BTRFS_CACHE_NO) {
489 			spin_unlock(&cache->lock);
490 			return 0;
491 		}
492 		cache->cached = BTRFS_CACHE_STARTED;
493 		spin_unlock(&cache->lock);
494 
495 		ret = load_free_space_cache(fs_info, cache);
496 
497 		spin_lock(&cache->lock);
498 		if (ret == 1) {
499 			cache->cached = BTRFS_CACHE_FINISHED;
500 			cache->last_byte_to_unpin = (u64)-1;
501 		} else {
502 			cache->cached = BTRFS_CACHE_NO;
503 		}
504 		spin_unlock(&cache->lock);
505 		if (ret == 1) {
506 			free_excluded_extents(fs_info->extent_root, cache);
507 			return 0;
508 		}
509 	}
510 
511 	if (load_cache_only)
512 		return 0;
513 
514 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
515 	BUG_ON(!caching_ctl);
516 
517 	INIT_LIST_HEAD(&caching_ctl->list);
518 	mutex_init(&caching_ctl->mutex);
519 	init_waitqueue_head(&caching_ctl->wait);
520 	caching_ctl->block_group = cache;
521 	caching_ctl->progress = cache->key.objectid;
522 	/* one for caching kthread, one for caching block group list */
523 	atomic_set(&caching_ctl->count, 2);
524 	caching_ctl->work.func = caching_thread;
525 
526 	spin_lock(&cache->lock);
527 	if (cache->cached != BTRFS_CACHE_NO) {
528 		spin_unlock(&cache->lock);
529 		kfree(caching_ctl);
530 		return 0;
531 	}
532 	cache->caching_ctl = caching_ctl;
533 	cache->cached = BTRFS_CACHE_STARTED;
534 	spin_unlock(&cache->lock);
535 
536 	down_write(&fs_info->extent_commit_sem);
537 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
538 	up_write(&fs_info->extent_commit_sem);
539 
540 	btrfs_get_block_group(cache);
541 
542 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
543 
544 	return ret;
545 }
546 
547 /*
548  * return the block group that starts at or after bytenr
549  */
550 static struct btrfs_block_group_cache *
551 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
552 {
553 	struct btrfs_block_group_cache *cache;
554 
555 	cache = block_group_cache_tree_search(info, bytenr, 0);
556 
557 	return cache;
558 }
559 
560 /*
561  * return the block group that contains the given bytenr
562  */
563 struct btrfs_block_group_cache *btrfs_lookup_block_group(
564 						 struct btrfs_fs_info *info,
565 						 u64 bytenr)
566 {
567 	struct btrfs_block_group_cache *cache;
568 
569 	cache = block_group_cache_tree_search(info, bytenr, 1);
570 
571 	return cache;
572 }
573 
574 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
575 						  u64 flags)
576 {
577 	struct list_head *head = &info->space_info;
578 	struct btrfs_space_info *found;
579 
580 	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
581 		 BTRFS_BLOCK_GROUP_METADATA;
582 
583 	rcu_read_lock();
584 	list_for_each_entry_rcu(found, head, list) {
585 		if (found->flags & flags) {
586 			rcu_read_unlock();
587 			return found;
588 		}
589 	}
590 	rcu_read_unlock();
591 	return NULL;
592 }
593 
594 /*
595  * after adding space to the filesystem, we need to clear the full flags
596  * on all the space infos.
597  */
598 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
599 {
600 	struct list_head *head = &info->space_info;
601 	struct btrfs_space_info *found;
602 
603 	rcu_read_lock();
604 	list_for_each_entry_rcu(found, head, list)
605 		found->full = 0;
606 	rcu_read_unlock();
607 }
608 
609 static u64 div_factor(u64 num, int factor)
610 {
611 	if (factor == 10)
612 		return num;
613 	num *= factor;
614 	do_div(num, 10);
615 	return num;
616 }
617 
618 static u64 div_factor_fine(u64 num, int factor)
619 {
620 	if (factor == 100)
621 		return num;
622 	num *= factor;
623 	do_div(num, 100);
624 	return num;
625 }
626 
627 u64 btrfs_find_block_group(struct btrfs_root *root,
628 			   u64 search_start, u64 search_hint, int owner)
629 {
630 	struct btrfs_block_group_cache *cache;
631 	u64 used;
632 	u64 last = max(search_hint, search_start);
633 	u64 group_start = 0;
634 	int full_search = 0;
635 	int factor = 9;
636 	int wrapped = 0;
637 again:
638 	while (1) {
639 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
640 		if (!cache)
641 			break;
642 
643 		spin_lock(&cache->lock);
644 		last = cache->key.objectid + cache->key.offset;
645 		used = btrfs_block_group_used(&cache->item);
646 
647 		if ((full_search || !cache->ro) &&
648 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
649 			if (used + cache->pinned + cache->reserved <
650 			    div_factor(cache->key.offset, factor)) {
651 				group_start = cache->key.objectid;
652 				spin_unlock(&cache->lock);
653 				btrfs_put_block_group(cache);
654 				goto found;
655 			}
656 		}
657 		spin_unlock(&cache->lock);
658 		btrfs_put_block_group(cache);
659 		cond_resched();
660 	}
661 	if (!wrapped) {
662 		last = search_start;
663 		wrapped = 1;
664 		goto again;
665 	}
666 	if (!full_search && factor < 10) {
667 		last = search_start;
668 		full_search = 1;
669 		factor = 10;
670 		goto again;
671 	}
672 found:
673 	return group_start;
674 }
675 
676 /* simple helper to search for an existing extent at a given offset */
677 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
678 {
679 	int ret;
680 	struct btrfs_key key;
681 	struct btrfs_path *path;
682 
683 	path = btrfs_alloc_path();
684 	if (!path)
685 		return -ENOMEM;
686 
687 	key.objectid = start;
688 	key.offset = len;
689 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
690 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
691 				0, 0);
692 	btrfs_free_path(path);
693 	return ret;
694 }
695 
696 /*
697  * helper function to look up the reference count and flags of an extent.
698  *
699  * the head node for a delayed ref is used to store the sum of all the
700  * reference count modifications queued up in the rbtree. the head
701  * node may also store the extent flags to set. This way you can check
702  * to see what the reference count and extent flags would be once all of
703  * the delayed refs have been processed.
704  */
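/*
 * Added editorial note: callers that only need the values as seen by the
 * committed tree can pass trans == NULL; the search below then uses the
 * commit root with locking skipped, and the delayed ref head is not
 * consulted at all.
 */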
705 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
706 			     struct btrfs_root *root, u64 bytenr,
707 			     u64 num_bytes, u64 *refs, u64 *flags)
708 {
709 	struct btrfs_delayed_ref_head *head;
710 	struct btrfs_delayed_ref_root *delayed_refs;
711 	struct btrfs_path *path;
712 	struct btrfs_extent_item *ei;
713 	struct extent_buffer *leaf;
714 	struct btrfs_key key;
715 	u32 item_size;
716 	u64 num_refs;
717 	u64 extent_flags;
718 	int ret;
719 
720 	path = btrfs_alloc_path();
721 	if (!path)
722 		return -ENOMEM;
723 
724 	key.objectid = bytenr;
725 	key.type = BTRFS_EXTENT_ITEM_KEY;
726 	key.offset = num_bytes;
727 	if (!trans) {
728 		path->skip_locking = 1;
729 		path->search_commit_root = 1;
730 	}
731 again:
732 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
733 				&key, path, 0, 0);
734 	if (ret < 0)
735 		goto out_free;
736 
737 	if (ret == 0) {
738 		leaf = path->nodes[0];
739 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
740 		if (item_size >= sizeof(*ei)) {
741 			ei = btrfs_item_ptr(leaf, path->slots[0],
742 					    struct btrfs_extent_item);
743 			num_refs = btrfs_extent_refs(leaf, ei);
744 			extent_flags = btrfs_extent_flags(leaf, ei);
745 		} else {
746 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
747 			struct btrfs_extent_item_v0 *ei0;
748 			BUG_ON(item_size != sizeof(*ei0));
749 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
750 					     struct btrfs_extent_item_v0);
751 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
752 			/* FIXME: this isn't correct for data */
753 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
754 #else
755 			BUG();
756 #endif
757 		}
758 		BUG_ON(num_refs == 0);
759 	} else {
760 		num_refs = 0;
761 		extent_flags = 0;
762 		ret = 0;
763 	}
764 
765 	if (!trans)
766 		goto out;
767 
768 	delayed_refs = &trans->transaction->delayed_refs;
769 	spin_lock(&delayed_refs->lock);
770 	head = btrfs_find_delayed_ref_head(trans, bytenr);
771 	if (head) {
772 		if (!mutex_trylock(&head->mutex)) {
773 			atomic_inc(&head->node.refs);
774 			spin_unlock(&delayed_refs->lock);
775 
776 			btrfs_release_path(path);
777 
778 			/*
779 			 * Mutex was contended, block until it's released and try
780 			 * again
781 			 */
782 			mutex_lock(&head->mutex);
783 			mutex_unlock(&head->mutex);
784 			btrfs_put_delayed_ref(&head->node);
785 			goto again;
786 		}
787 		if (head->extent_op && head->extent_op->update_flags)
788 			extent_flags |= head->extent_op->flags_to_set;
789 		else
790 			BUG_ON(num_refs == 0);
791 
792 		num_refs += head->node.ref_mod;
793 		mutex_unlock(&head->mutex);
794 	}
795 	spin_unlock(&delayed_refs->lock);
796 out:
797 	WARN_ON(num_refs == 0);
798 	if (refs)
799 		*refs = num_refs;
800 	if (flags)
801 		*flags = extent_flags;
802 out_free:
803 	btrfs_free_path(path);
804 	return ret;
805 }
806 
807 /*
808  * Back reference rules.  Back refs have three main goals:
809  *
810  * 1) differentiate between all holders of references to an extent so that
811  *    when a reference is dropped we can make sure it was a valid reference
812  *    before freeing the extent.
813  *
814  * 2) Provide enough information to quickly find the holders of an extent
815  *    if we notice a given block is corrupted or bad.
816  *
817  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
818  *    maintenance.  This is actually the same as #2, but with a slightly
819  *    different use case.
820  *
821  * There are two kinds of back refs. The implicit back refs are optimized
822  * for pointers in non-shared tree blocks. For a given pointer in a block,
823  * back refs of this kind provide information about the block's owner tree
824  * and the pointer's key. This information allows us to find the block by
825  * b-tree searching. The full back refs are for pointers in tree blocks not
826  * referenced by their owner trees. The location of the tree block is recorded
827  * in the back refs. Actually the full back refs are generic and can be
828  * used in all cases where the implicit back refs are used. The major
829  * shortcoming of the full back refs is their overhead. Every time a tree
830  * block gets COWed, we have to update the back ref entries for all pointers in it.
831  *
832  * For a newly allocated tree block, we use implicit back refs for
833  * pointers in it. This means most tree-related operations only involve
834  * implicit back refs. For a tree block created in an old transaction, the
835  * only way to drop a reference to it is to COW it. So we can detect the
836  * event that a tree block loses its owner tree's reference and do the
837  * back refs conversion.
838  *
839  * When a tree block is COW'd through a tree, there are four cases:
840  *
841  * The reference count of the block is one and the tree is the block's
842  * owner tree. Nothing to do in this case.
843  *
844  * The reference count of the block is one and the tree is not the
845  * block's owner tree. In this case, full back refs are used for pointers
846  * in the block. Remove these full back refs and add implicit back refs for
847  * every pointer in the new block.
848  *
849  * The reference count of the block is greater than one and the tree is
850  * the block's owner tree. In this case, implicit back refs are used for
851  * pointers in the block. Add full back refs for every pointer in the
852  * block and increase the lower level extents' reference counts. The original
853  * implicit back refs are carried over to the new block.
854  *
855  * The reference count of the block is greater than one and the tree is
856  * not the block's owner tree. Add implicit back refs for every pointer in
857  * the new block, increase lower level extents' reference count.
858  *
859  * Back Reference Key composing:
860  *
861  * The key objectid corresponds to the first byte in the extent.
862  * The key type is used to differentiate between types of back refs.
863  * There are different meanings of the key offset for different types
864  * of back refs.
865  *
866  * File extents can be referenced by:
867  *
868  * - multiple snapshots, subvolumes, or different generations in one subvol
869  * - different files inside a single subvolume
870  * - different offsets inside a file (bookend extents in file.c)
871  *
872  * The extent ref structure for the implicit back refs has fields for:
873  *
874  * - Objectid of the subvolume root
875  * - objectid of the file holding the reference
876  * - original offset in the file
877  * - how many bookend extents
878  *
879  * The key offset for the implicit back refs is a hash of the first
880  * three fields.
881  *
882  * The extent ref structure for the full back refs has a field for:
883  *
884  * - number of pointers in the tree leaf
885  *
886  * The key offset for the full back refs is the first byte of
887  * the tree leaf.
888  *
889  * When a file extent is allocated, the implicit back refs are used
890  * and the fields are filled in:
891  *
892  *     (root_key.objectid, inode objectid, offset in file, 1)
893  *
894  * When a file extent is removed during file truncation, we find the
895  * corresponding implicit back refs and check the following fields:
896  *
897  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
898  *
899  * Btree extents can be referenced by:
900  *
901  * - Different subvolumes
902  *
903  * Both the implicit back refs and the full back refs for tree blocks
904  * only consist of a key. The key offset for the implicit back refs is the
905  * objectid of the block's owner tree. The key offset for the full back refs
906  * is the first byte of the parent block.
907  *
908  * When implicit back refs are used, information about the lowest key and
909  * level of the tree block is required. This information is stored in the
910  * tree block info structure.
911  */
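/*
 * Illustrative example (editorial addition, not from the original source):
 * a data extent written by tree root 5 into inode 257 at file offset 0 gets
 * an implicit back ref keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash_extent_data_ref(5, 257, 0))
 *
 * while a reference held through a shared tree block (a non-zero parent)
 * uses a full back ref keyed as
 *
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent block bytenr)
 */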
912 
913 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
914 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
915 				  struct btrfs_root *root,
916 				  struct btrfs_path *path,
917 				  u64 owner, u32 extra_size)
918 {
919 	struct btrfs_extent_item *item;
920 	struct btrfs_extent_item_v0 *ei0;
921 	struct btrfs_extent_ref_v0 *ref0;
922 	struct btrfs_tree_block_info *bi;
923 	struct extent_buffer *leaf;
924 	struct btrfs_key key;
925 	struct btrfs_key found_key;
926 	u32 new_size = sizeof(*item);
927 	u64 refs;
928 	int ret;
929 
930 	leaf = path->nodes[0];
931 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
932 
933 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
934 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
935 			     struct btrfs_extent_item_v0);
936 	refs = btrfs_extent_refs_v0(leaf, ei0);
937 
938 	if (owner == (u64)-1) {
939 		while (1) {
940 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
941 				ret = btrfs_next_leaf(root, path);
942 				if (ret < 0)
943 					return ret;
944 				BUG_ON(ret > 0);
945 				leaf = path->nodes[0];
946 			}
947 			btrfs_item_key_to_cpu(leaf, &found_key,
948 					      path->slots[0]);
949 			BUG_ON(key.objectid != found_key.objectid);
950 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
951 				path->slots[0]++;
952 				continue;
953 			}
954 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
955 					      struct btrfs_extent_ref_v0);
956 			owner = btrfs_ref_objectid_v0(leaf, ref0);
957 			break;
958 		}
959 	}
960 	btrfs_release_path(path);
961 
962 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
963 		new_size += sizeof(*bi);
964 
965 	new_size -= sizeof(*ei0);
966 	ret = btrfs_search_slot(trans, root, &key, path,
967 				new_size + extra_size, 1);
968 	if (ret < 0)
969 		return ret;
970 	BUG_ON(ret);
971 
972 	ret = btrfs_extend_item(trans, root, path, new_size);
973 
974 	leaf = path->nodes[0];
975 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
976 	btrfs_set_extent_refs(leaf, item, refs);
977 	/* FIXME: get real generation */
978 	btrfs_set_extent_generation(leaf, item, 0);
979 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
980 		btrfs_set_extent_flags(leaf, item,
981 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
982 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
983 		bi = (struct btrfs_tree_block_info *)(item + 1);
984 		/* FIXME: get first key of the block */
985 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
986 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
987 	} else {
988 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
989 	}
990 	btrfs_mark_buffer_dirty(leaf);
991 	return 0;
992 }
993 #endif
994 
995 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
996 {
997 	u32 high_crc = ~(u32)0;
998 	u32 low_crc = ~(u32)0;
999 	__le64 lenum;
1000 
1001 	lenum = cpu_to_le64(root_objectid);
1002 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1003 	lenum = cpu_to_le64(owner);
1004 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1005 	lenum = cpu_to_le64(offset);
1006 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1007 
1008 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1009 }
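
/*
 * Added editorial note: the value above is only a hash, so distinct
 * (root_objectid, owner, offset) triples can collide on the same key offset.
 * Lookups re-check the item contents with match_extent_data_ref(), and
 * insert_extent_data_ref() bumps key.offset past colliding items whenever
 * the insertion returns -EEXIST.
 */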
1010 
1011 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1012 				     struct btrfs_extent_data_ref *ref)
1013 {
1014 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1015 				    btrfs_extent_data_ref_objectid(leaf, ref),
1016 				    btrfs_extent_data_ref_offset(leaf, ref));
1017 }
1018 
1019 static int match_extent_data_ref(struct extent_buffer *leaf,
1020 				 struct btrfs_extent_data_ref *ref,
1021 				 u64 root_objectid, u64 owner, u64 offset)
1022 {
1023 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1024 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1025 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1026 		return 0;
1027 	return 1;
1028 }
1029 
1030 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1031 					   struct btrfs_root *root,
1032 					   struct btrfs_path *path,
1033 					   u64 bytenr, u64 parent,
1034 					   u64 root_objectid,
1035 					   u64 owner, u64 offset)
1036 {
1037 	struct btrfs_key key;
1038 	struct btrfs_extent_data_ref *ref;
1039 	struct extent_buffer *leaf;
1040 	u32 nritems;
1041 	int ret;
1042 	int recow;
1043 	int err = -ENOENT;
1044 
1045 	key.objectid = bytenr;
1046 	if (parent) {
1047 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1048 		key.offset = parent;
1049 	} else {
1050 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1051 		key.offset = hash_extent_data_ref(root_objectid,
1052 						  owner, offset);
1053 	}
1054 again:
1055 	recow = 0;
1056 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1057 	if (ret < 0) {
1058 		err = ret;
1059 		goto fail;
1060 	}
1061 
1062 	if (parent) {
1063 		if (!ret)
1064 			return 0;
1065 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1066 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1067 		btrfs_release_path(path);
1068 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1069 		if (ret < 0) {
1070 			err = ret;
1071 			goto fail;
1072 		}
1073 		if (!ret)
1074 			return 0;
1075 #endif
1076 		goto fail;
1077 	}
1078 
1079 	leaf = path->nodes[0];
1080 	nritems = btrfs_header_nritems(leaf);
1081 	while (1) {
1082 		if (path->slots[0] >= nritems) {
1083 			ret = btrfs_next_leaf(root, path);
1084 			if (ret < 0)
1085 				err = ret;
1086 			if (ret)
1087 				goto fail;
1088 
1089 			leaf = path->nodes[0];
1090 			nritems = btrfs_header_nritems(leaf);
1091 			recow = 1;
1092 		}
1093 
1094 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1095 		if (key.objectid != bytenr ||
1096 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1097 			goto fail;
1098 
1099 		ref = btrfs_item_ptr(leaf, path->slots[0],
1100 				     struct btrfs_extent_data_ref);
1101 
1102 		if (match_extent_data_ref(leaf, ref, root_objectid,
1103 					  owner, offset)) {
1104 			if (recow) {
1105 				btrfs_release_path(path);
1106 				goto again;
1107 			}
1108 			err = 0;
1109 			break;
1110 		}
1111 		path->slots[0]++;
1112 	}
1113 fail:
1114 	return err;
1115 }
1116 
1117 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1118 					   struct btrfs_root *root,
1119 					   struct btrfs_path *path,
1120 					   u64 bytenr, u64 parent,
1121 					   u64 root_objectid, u64 owner,
1122 					   u64 offset, int refs_to_add)
1123 {
1124 	struct btrfs_key key;
1125 	struct extent_buffer *leaf;
1126 	u32 size;
1127 	u32 num_refs;
1128 	int ret;
1129 
1130 	key.objectid = bytenr;
1131 	if (parent) {
1132 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1133 		key.offset = parent;
1134 		size = sizeof(struct btrfs_shared_data_ref);
1135 	} else {
1136 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1137 		key.offset = hash_extent_data_ref(root_objectid,
1138 						  owner, offset);
1139 		size = sizeof(struct btrfs_extent_data_ref);
1140 	}
1141 
1142 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1143 	if (ret && ret != -EEXIST)
1144 		goto fail;
1145 
1146 	leaf = path->nodes[0];
1147 	if (parent) {
1148 		struct btrfs_shared_data_ref *ref;
1149 		ref = btrfs_item_ptr(leaf, path->slots[0],
1150 				     struct btrfs_shared_data_ref);
1151 		if (ret == 0) {
1152 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1153 		} else {
1154 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1155 			num_refs += refs_to_add;
1156 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1157 		}
1158 	} else {
1159 		struct btrfs_extent_data_ref *ref;
1160 		while (ret == -EEXIST) {
1161 			ref = btrfs_item_ptr(leaf, path->slots[0],
1162 					     struct btrfs_extent_data_ref);
1163 			if (match_extent_data_ref(leaf, ref, root_objectid,
1164 						  owner, offset))
1165 				break;
1166 			btrfs_release_path(path);
1167 			key.offset++;
1168 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1169 						      size);
1170 			if (ret && ret != -EEXIST)
1171 				goto fail;
1172 
1173 			leaf = path->nodes[0];
1174 		}
1175 		ref = btrfs_item_ptr(leaf, path->slots[0],
1176 				     struct btrfs_extent_data_ref);
1177 		if (ret == 0) {
1178 			btrfs_set_extent_data_ref_root(leaf, ref,
1179 						       root_objectid);
1180 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1181 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1182 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1183 		} else {
1184 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1185 			num_refs += refs_to_add;
1186 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1187 		}
1188 	}
1189 	btrfs_mark_buffer_dirty(leaf);
1190 	ret = 0;
1191 fail:
1192 	btrfs_release_path(path);
1193 	return ret;
1194 }
1195 
1196 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1197 					   struct btrfs_root *root,
1198 					   struct btrfs_path *path,
1199 					   int refs_to_drop)
1200 {
1201 	struct btrfs_key key;
1202 	struct btrfs_extent_data_ref *ref1 = NULL;
1203 	struct btrfs_shared_data_ref *ref2 = NULL;
1204 	struct extent_buffer *leaf;
1205 	u32 num_refs = 0;
1206 	int ret = 0;
1207 
1208 	leaf = path->nodes[0];
1209 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1210 
1211 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1212 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1213 				      struct btrfs_extent_data_ref);
1214 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1215 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1216 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1217 				      struct btrfs_shared_data_ref);
1218 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1219 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1220 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1221 		struct btrfs_extent_ref_v0 *ref0;
1222 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1223 				      struct btrfs_extent_ref_v0);
1224 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1225 #endif
1226 	} else {
1227 		BUG();
1228 	}
1229 
1230 	BUG_ON(num_refs < refs_to_drop);
1231 	num_refs -= refs_to_drop;
1232 
1233 	if (num_refs == 0) {
1234 		ret = btrfs_del_item(trans, root, path);
1235 	} else {
1236 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1237 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1238 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1239 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1240 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1241 		else {
1242 			struct btrfs_extent_ref_v0 *ref0;
1243 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1244 					struct btrfs_extent_ref_v0);
1245 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1246 		}
1247 #endif
1248 		btrfs_mark_buffer_dirty(leaf);
1249 	}
1250 	return ret;
1251 }
1252 
1253 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1254 					  struct btrfs_path *path,
1255 					  struct btrfs_extent_inline_ref *iref)
1256 {
1257 	struct btrfs_key key;
1258 	struct extent_buffer *leaf;
1259 	struct btrfs_extent_data_ref *ref1;
1260 	struct btrfs_shared_data_ref *ref2;
1261 	u32 num_refs = 0;
1262 
1263 	leaf = path->nodes[0];
1264 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1265 	if (iref) {
1266 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1267 		    BTRFS_EXTENT_DATA_REF_KEY) {
1268 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1269 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1270 		} else {
1271 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1272 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1273 		}
1274 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1275 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1276 				      struct btrfs_extent_data_ref);
1277 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1278 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1279 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1280 				      struct btrfs_shared_data_ref);
1281 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1282 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1283 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1284 		struct btrfs_extent_ref_v0 *ref0;
1285 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1286 				      struct btrfs_extent_ref_v0);
1287 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1288 #endif
1289 	} else {
1290 		WARN_ON(1);
1291 	}
1292 	return num_refs;
1293 }
1294 
1295 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1296 					  struct btrfs_root *root,
1297 					  struct btrfs_path *path,
1298 					  u64 bytenr, u64 parent,
1299 					  u64 root_objectid)
1300 {
1301 	struct btrfs_key key;
1302 	int ret;
1303 
1304 	key.objectid = bytenr;
1305 	if (parent) {
1306 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1307 		key.offset = parent;
1308 	} else {
1309 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1310 		key.offset = root_objectid;
1311 	}
1312 
1313 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1314 	if (ret > 0)
1315 		ret = -ENOENT;
1316 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1317 	if (ret == -ENOENT && parent) {
1318 		btrfs_release_path(path);
1319 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1320 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1321 		if (ret > 0)
1322 			ret = -ENOENT;
1323 	}
1324 #endif
1325 	return ret;
1326 }
1327 
1328 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1329 					  struct btrfs_root *root,
1330 					  struct btrfs_path *path,
1331 					  u64 bytenr, u64 parent,
1332 					  u64 root_objectid)
1333 {
1334 	struct btrfs_key key;
1335 	int ret;
1336 
1337 	key.objectid = bytenr;
1338 	if (parent) {
1339 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1340 		key.offset = parent;
1341 	} else {
1342 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1343 		key.offset = root_objectid;
1344 	}
1345 
1346 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1347 	btrfs_release_path(path);
1348 	return ret;
1349 }
1350 
1351 static inline int extent_ref_type(u64 parent, u64 owner)
1352 {
1353 	int type;
1354 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1355 		if (parent > 0)
1356 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1357 		else
1358 			type = BTRFS_TREE_BLOCK_REF_KEY;
1359 	} else {
1360 		if (parent > 0)
1361 			type = BTRFS_SHARED_DATA_REF_KEY;
1362 		else
1363 			type = BTRFS_EXTENT_DATA_REF_KEY;
1364 	}
1365 	return type;
1366 }
1367 
1368 static int find_next_key(struct btrfs_path *path, int level,
1369 			 struct btrfs_key *key)
1370 
1371 {
1372 	for (; level < BTRFS_MAX_LEVEL; level++) {
1373 		if (!path->nodes[level])
1374 			break;
1375 		if (path->slots[level] + 1 >=
1376 		    btrfs_header_nritems(path->nodes[level]))
1377 			continue;
1378 		if (level == 0)
1379 			btrfs_item_key_to_cpu(path->nodes[level], key,
1380 					      path->slots[level] + 1);
1381 		else
1382 			btrfs_node_key_to_cpu(path->nodes[level], key,
1383 					      path->slots[level] + 1);
1384 		return 0;
1385 	}
1386 	return 1;
1387 }
1388 
1389 /*
1390  * look for inline back ref. if back ref is found, *ref_ret is set
1391  * to the address of inline back ref, and 0 is returned.
1392  *
1393  * if back ref isn't found, *ref_ret is set to the address where it
1394  * should be inserted, and -ENOENT is returned.
1395  *
1396  * if insert is true and there are too many inline back refs, the path
1397  * points to the extent item, and -EAGAIN is returned.
1398  *
1399  * NOTE: inline back refs are ordered in the same way that back ref
1400  *	 items in the tree are ordered.
1401  */
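/*
 * Added editorial note: the -EAGAIN case is handled by the callers further
 * down in this file; insert_inline_extent_backref() passes it through and
 * __btrfs_inc_extent_ref() then falls back to adding a separate keyed back
 * ref item instead of an inline one.
 */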
1402 static noinline_for_stack
1403 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1404 				 struct btrfs_root *root,
1405 				 struct btrfs_path *path,
1406 				 struct btrfs_extent_inline_ref **ref_ret,
1407 				 u64 bytenr, u64 num_bytes,
1408 				 u64 parent, u64 root_objectid,
1409 				 u64 owner, u64 offset, int insert)
1410 {
1411 	struct btrfs_key key;
1412 	struct extent_buffer *leaf;
1413 	struct btrfs_extent_item *ei;
1414 	struct btrfs_extent_inline_ref *iref;
1415 	u64 flags;
1416 	u64 item_size;
1417 	unsigned long ptr;
1418 	unsigned long end;
1419 	int extra_size;
1420 	int type;
1421 	int want;
1422 	int ret;
1423 	int err = 0;
1424 
1425 	key.objectid = bytenr;
1426 	key.type = BTRFS_EXTENT_ITEM_KEY;
1427 	key.offset = num_bytes;
1428 
1429 	want = extent_ref_type(parent, owner);
1430 	if (insert) {
1431 		extra_size = btrfs_extent_inline_ref_size(want);
1432 		path->keep_locks = 1;
1433 	} else
1434 		extra_size = -1;
1435 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1436 	if (ret < 0) {
1437 		err = ret;
1438 		goto out;
1439 	}
1440 	BUG_ON(ret);
1441 
1442 	leaf = path->nodes[0];
1443 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1444 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1445 	if (item_size < sizeof(*ei)) {
1446 		if (!insert) {
1447 			err = -ENOENT;
1448 			goto out;
1449 		}
1450 		ret = convert_extent_item_v0(trans, root, path, owner,
1451 					     extra_size);
1452 		if (ret < 0) {
1453 			err = ret;
1454 			goto out;
1455 		}
1456 		leaf = path->nodes[0];
1457 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1458 	}
1459 #endif
1460 	BUG_ON(item_size < sizeof(*ei));
1461 
1462 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1463 	flags = btrfs_extent_flags(leaf, ei);
1464 
1465 	ptr = (unsigned long)(ei + 1);
1466 	end = (unsigned long)ei + item_size;
1467 
1468 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1469 		ptr += sizeof(struct btrfs_tree_block_info);
1470 		BUG_ON(ptr > end);
1471 	} else {
1472 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1473 	}
1474 
1475 	err = -ENOENT;
1476 	while (1) {
1477 		if (ptr >= end) {
1478 			WARN_ON(ptr > end);
1479 			break;
1480 		}
1481 		iref = (struct btrfs_extent_inline_ref *)ptr;
1482 		type = btrfs_extent_inline_ref_type(leaf, iref);
1483 		if (want < type)
1484 			break;
1485 		if (want > type) {
1486 			ptr += btrfs_extent_inline_ref_size(type);
1487 			continue;
1488 		}
1489 
1490 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1491 			struct btrfs_extent_data_ref *dref;
1492 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1493 			if (match_extent_data_ref(leaf, dref, root_objectid,
1494 						  owner, offset)) {
1495 				err = 0;
1496 				break;
1497 			}
1498 			if (hash_extent_data_ref_item(leaf, dref) <
1499 			    hash_extent_data_ref(root_objectid, owner, offset))
1500 				break;
1501 		} else {
1502 			u64 ref_offset;
1503 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1504 			if (parent > 0) {
1505 				if (parent == ref_offset) {
1506 					err = 0;
1507 					break;
1508 				}
1509 				if (ref_offset < parent)
1510 					break;
1511 			} else {
1512 				if (root_objectid == ref_offset) {
1513 					err = 0;
1514 					break;
1515 				}
1516 				if (ref_offset < root_objectid)
1517 					break;
1518 			}
1519 		}
1520 		ptr += btrfs_extent_inline_ref_size(type);
1521 	}
1522 	if (err == -ENOENT && insert) {
1523 		if (item_size + extra_size >=
1524 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1525 			err = -EAGAIN;
1526 			goto out;
1527 		}
1528 		/*
1529 		 * To add a new inline back ref, we have to make sure
1530 		 * there is no corresponding back ref item.
1531 		 * For simplicity, we just do not add a new inline back
1532 		 * ref if there is any kind of item for this block.
1533 		 */
1534 		if (find_next_key(path, 0, &key) == 0 &&
1535 		    key.objectid == bytenr &&
1536 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1537 			err = -EAGAIN;
1538 			goto out;
1539 		}
1540 	}
1541 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1542 out:
1543 	if (insert) {
1544 		path->keep_locks = 0;
1545 		btrfs_unlock_up_safe(path, 1);
1546 	}
1547 	return err;
1548 }
1549 
1550 /*
1551  * helper to add new inline back ref
1552  */
1553 static noinline_for_stack
1554 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1555 				struct btrfs_root *root,
1556 				struct btrfs_path *path,
1557 				struct btrfs_extent_inline_ref *iref,
1558 				u64 parent, u64 root_objectid,
1559 				u64 owner, u64 offset, int refs_to_add,
1560 				struct btrfs_delayed_extent_op *extent_op)
1561 {
1562 	struct extent_buffer *leaf;
1563 	struct btrfs_extent_item *ei;
1564 	unsigned long ptr;
1565 	unsigned long end;
1566 	unsigned long item_offset;
1567 	u64 refs;
1568 	int size;
1569 	int type;
1570 	int ret;
1571 
1572 	leaf = path->nodes[0];
1573 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1574 	item_offset = (unsigned long)iref - (unsigned long)ei;
1575 
1576 	type = extent_ref_type(parent, owner);
1577 	size = btrfs_extent_inline_ref_size(type);
1578 
1579 	ret = btrfs_extend_item(trans, root, path, size);
1580 
1581 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1582 	refs = btrfs_extent_refs(leaf, ei);
1583 	refs += refs_to_add;
1584 	btrfs_set_extent_refs(leaf, ei, refs);
1585 	if (extent_op)
1586 		__run_delayed_extent_op(extent_op, leaf, ei);
1587 
1588 	ptr = (unsigned long)ei + item_offset;
1589 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1590 	if (ptr < end - size)
1591 		memmove_extent_buffer(leaf, ptr + size, ptr,
1592 				      end - size - ptr);
1593 
1594 	iref = (struct btrfs_extent_inline_ref *)ptr;
1595 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1596 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1597 		struct btrfs_extent_data_ref *dref;
1598 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1599 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1600 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1601 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1602 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1603 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1604 		struct btrfs_shared_data_ref *sref;
1605 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1606 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1607 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1608 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1609 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1610 	} else {
1611 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1612 	}
1613 	btrfs_mark_buffer_dirty(leaf);
1614 	return 0;
1615 }
1616 
1617 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1618 				 struct btrfs_root *root,
1619 				 struct btrfs_path *path,
1620 				 struct btrfs_extent_inline_ref **ref_ret,
1621 				 u64 bytenr, u64 num_bytes, u64 parent,
1622 				 u64 root_objectid, u64 owner, u64 offset)
1623 {
1624 	int ret;
1625 
1626 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1627 					   bytenr, num_bytes, parent,
1628 					   root_objectid, owner, offset, 0);
1629 	if (ret != -ENOENT)
1630 		return ret;
1631 
1632 	btrfs_release_path(path);
1633 	*ref_ret = NULL;
1634 
1635 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1636 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1637 					    root_objectid);
1638 	} else {
1639 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1640 					     root_objectid, owner, offset);
1641 	}
1642 	return ret;
1643 }
1644 
1645 /*
1646  * helper to update/remove inline back ref
1647  */
1648 static noinline_for_stack
1649 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1650 				 struct btrfs_root *root,
1651 				 struct btrfs_path *path,
1652 				 struct btrfs_extent_inline_ref *iref,
1653 				 int refs_to_mod,
1654 				 struct btrfs_delayed_extent_op *extent_op)
1655 {
1656 	struct extent_buffer *leaf;
1657 	struct btrfs_extent_item *ei;
1658 	struct btrfs_extent_data_ref *dref = NULL;
1659 	struct btrfs_shared_data_ref *sref = NULL;
1660 	unsigned long ptr;
1661 	unsigned long end;
1662 	u32 item_size;
1663 	int size;
1664 	int type;
1665 	int ret;
1666 	u64 refs;
1667 
1668 	leaf = path->nodes[0];
1669 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1670 	refs = btrfs_extent_refs(leaf, ei);
1671 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1672 	refs += refs_to_mod;
1673 	btrfs_set_extent_refs(leaf, ei, refs);
1674 	if (extent_op)
1675 		__run_delayed_extent_op(extent_op, leaf, ei);
1676 
1677 	type = btrfs_extent_inline_ref_type(leaf, iref);
1678 
1679 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1680 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1681 		refs = btrfs_extent_data_ref_count(leaf, dref);
1682 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1683 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1684 		refs = btrfs_shared_data_ref_count(leaf, sref);
1685 	} else {
1686 		refs = 1;
1687 		BUG_ON(refs_to_mod != -1);
1688 	}
1689 
1690 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1691 	refs += refs_to_mod;
1692 
1693 	if (refs > 0) {
1694 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1695 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1696 		else
1697 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1698 	} else {
1699 		size =  btrfs_extent_inline_ref_size(type);
1700 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1701 		ptr = (unsigned long)iref;
1702 		end = (unsigned long)ei + item_size;
1703 		if (ptr + size < end)
1704 			memmove_extent_buffer(leaf, ptr, ptr + size,
1705 					      end - ptr - size);
1706 		item_size -= size;
1707 		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1708 	}
1709 	btrfs_mark_buffer_dirty(leaf);
1710 	return 0;
1711 }
1712 
1713 static noinline_for_stack
1714 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1715 				 struct btrfs_root *root,
1716 				 struct btrfs_path *path,
1717 				 u64 bytenr, u64 num_bytes, u64 parent,
1718 				 u64 root_objectid, u64 owner,
1719 				 u64 offset, int refs_to_add,
1720 				 struct btrfs_delayed_extent_op *extent_op)
1721 {
1722 	struct btrfs_extent_inline_ref *iref;
1723 	int ret;
1724 
1725 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1726 					   bytenr, num_bytes, parent,
1727 					   root_objectid, owner, offset, 1);
1728 	if (ret == 0) {
1729 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1730 		ret = update_inline_extent_backref(trans, root, path, iref,
1731 						   refs_to_add, extent_op);
1732 	} else if (ret == -ENOENT) {
1733 		ret = setup_inline_extent_backref(trans, root, path, iref,
1734 						  parent, root_objectid,
1735 						  owner, offset, refs_to_add,
1736 						  extent_op);
1737 	}
1738 	return ret;
1739 }
1740 
1741 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1742 				 struct btrfs_root *root,
1743 				 struct btrfs_path *path,
1744 				 u64 bytenr, u64 parent, u64 root_objectid,
1745 				 u64 owner, u64 offset, int refs_to_add)
1746 {
1747 	int ret;
1748 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1749 		BUG_ON(refs_to_add != 1);
1750 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1751 					    parent, root_objectid);
1752 	} else {
1753 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1754 					     parent, root_objectid,
1755 					     owner, offset, refs_to_add);
1756 	}
1757 	return ret;
1758 }
1759 
1760 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1761 				 struct btrfs_root *root,
1762 				 struct btrfs_path *path,
1763 				 struct btrfs_extent_inline_ref *iref,
1764 				 int refs_to_drop, int is_data)
1765 {
1766 	int ret;
1767 
1768 	BUG_ON(!is_data && refs_to_drop != 1);
1769 	if (iref) {
1770 		ret = update_inline_extent_backref(trans, root, path, iref,
1771 						   -refs_to_drop, NULL);
1772 	} else if (is_data) {
1773 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1774 	} else {
1775 		ret = btrfs_del_item(trans, root, path);
1776 	}
1777 	return ret;
1778 }
1779 
1780 static int btrfs_issue_discard(struct block_device *bdev,
1781 				u64 start, u64 len)
1782 {
1783 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1784 }
1785 
1786 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1787 				u64 num_bytes, u64 *actual_bytes)
1788 {
1789 	int ret;
1790 	u64 discarded_bytes = 0;
1791 	struct btrfs_bio *bbio = NULL;
1792 
1793 
1794 	/* Tell the block device(s) that the sectors can be discarded */
1795 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1796 			      bytenr, &num_bytes, &bbio, 0);
1797 	if (!ret) {
1798 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1799 		int i;
1800 
1801 
1802 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1803 			if (!stripe->dev->can_discard)
1804 				continue;
1805 
1806 			ret = btrfs_issue_discard(stripe->dev->bdev,
1807 						  stripe->physical,
1808 						  stripe->length);
1809 			if (!ret)
1810 				discarded_bytes += stripe->length;
1811 			else if (ret != -EOPNOTSUPP)
1812 				break;
1813 
1814 			/*
1815 			 * Just in case we get back EOPNOTSUPP for some reason,
1816 			 * ignore the return value so we don't screw up
1817 			 * people calling discard_extent.
1818 			 */
1819 			ret = 0;
1820 		}
1821 		kfree(bbio);
1822 	}
1823 
1824 	if (actual_bytes)
1825 		*actual_bytes = discarded_bytes;
1826 
1827 
1828 	return ret;
1829 }
1830 
1831 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1832 			 struct btrfs_root *root,
1833 			 u64 bytenr, u64 num_bytes, u64 parent,
1834 			 u64 root_objectid, u64 owner, u64 offset)
1835 {
1836 	int ret;
1837 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1838 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1839 
1840 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1841 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1842 					parent, root_objectid, (int)owner,
1843 					BTRFS_ADD_DELAYED_REF, NULL);
1844 	} else {
1845 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1846 					parent, root_objectid, owner, offset,
1847 					BTRFS_ADD_DELAYED_REF, NULL);
1848 	}
1849 	return ret;
1850 }
1851 
1852 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1853 				  struct btrfs_root *root,
1854 				  u64 bytenr, u64 num_bytes,
1855 				  u64 parent, u64 root_objectid,
1856 				  u64 owner, u64 offset, int refs_to_add,
1857 				  struct btrfs_delayed_extent_op *extent_op)
1858 {
1859 	struct btrfs_path *path;
1860 	struct extent_buffer *leaf;
1861 	struct btrfs_extent_item *item;
1862 	u64 refs;
1863 	int ret;
1864 	int err = 0;
1865 
1866 	path = btrfs_alloc_path();
1867 	if (!path)
1868 		return -ENOMEM;
1869 
1870 	path->reada = 1;
1871 	path->leave_spinning = 1;
1872 	/* this will setup the path even if it fails to insert the back ref */
1873 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1874 					   path, bytenr, num_bytes, parent,
1875 					   root_objectid, owner, offset,
1876 					   refs_to_add, extent_op);
1877 	if (ret == 0)
1878 		goto out;
1879 
1880 	if (ret != -EAGAIN) {
1881 		err = ret;
1882 		goto out;
1883 	}
1884 
1885 	leaf = path->nodes[0];
1886 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1887 	refs = btrfs_extent_refs(leaf, item);
1888 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1889 	if (extent_op)
1890 		__run_delayed_extent_op(extent_op, leaf, item);
1891 
1892 	btrfs_mark_buffer_dirty(leaf);
1893 	btrfs_release_path(path);
1894 
1895 	path->reada = 1;
1896 	path->leave_spinning = 1;
1897 
1898 	/* now insert the actual backref */
1899 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1900 				    path, bytenr, parent, root_objectid,
1901 				    owner, offset, refs_to_add);
1902 	BUG_ON(ret);
1903 out:
1904 	btrfs_free_path(path);
1905 	return err;
1906 }
1907 
1908 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1909 				struct btrfs_root *root,
1910 				struct btrfs_delayed_ref_node *node,
1911 				struct btrfs_delayed_extent_op *extent_op,
1912 				int insert_reserved)
1913 {
1914 	int ret = 0;
1915 	struct btrfs_delayed_data_ref *ref;
1916 	struct btrfs_key ins;
1917 	u64 parent = 0;
1918 	u64 ref_root = 0;
1919 	u64 flags = 0;
1920 
1921 	ins.objectid = node->bytenr;
1922 	ins.offset = node->num_bytes;
1923 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1924 
1925 	ref = btrfs_delayed_node_to_data_ref(node);
1926 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1927 		parent = ref->parent;
1928 	else
1929 		ref_root = ref->root;
1930 
1931 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1932 		if (extent_op) {
1933 			BUG_ON(extent_op->update_key);
1934 			flags |= extent_op->flags_to_set;
1935 		}
1936 		ret = alloc_reserved_file_extent(trans, root,
1937 						 parent, ref_root, flags,
1938 						 ref->objectid, ref->offset,
1939 						 &ins, node->ref_mod);
1940 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1941 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1942 					     node->num_bytes, parent,
1943 					     ref_root, ref->objectid,
1944 					     ref->offset, node->ref_mod,
1945 					     extent_op);
1946 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1947 		ret = __btrfs_free_extent(trans, root, node->bytenr,
1948 					  node->num_bytes, parent,
1949 					  ref_root, ref->objectid,
1950 					  ref->offset, node->ref_mod,
1951 					  extent_op);
1952 	} else {
1953 		BUG();
1954 	}
1955 	return ret;
1956 }
1957 
1958 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1959 				    struct extent_buffer *leaf,
1960 				    struct btrfs_extent_item *ei)
1961 {
1962 	u64 flags = btrfs_extent_flags(leaf, ei);
1963 	if (extent_op->update_flags) {
1964 		flags |= extent_op->flags_to_set;
1965 		btrfs_set_extent_flags(leaf, ei, flags);
1966 	}
1967 
1968 	if (extent_op->update_key) {
1969 		struct btrfs_tree_block_info *bi;
1970 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1971 		bi = (struct btrfs_tree_block_info *)(ei + 1);
1972 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1973 	}
1974 }
1975 
1976 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1977 				 struct btrfs_root *root,
1978 				 struct btrfs_delayed_ref_node *node,
1979 				 struct btrfs_delayed_extent_op *extent_op)
1980 {
1981 	struct btrfs_key key;
1982 	struct btrfs_path *path;
1983 	struct btrfs_extent_item *ei;
1984 	struct extent_buffer *leaf;
1985 	u32 item_size;
1986 	int ret;
1987 	int err = 0;
1988 
1989 	path = btrfs_alloc_path();
1990 	if (!path)
1991 		return -ENOMEM;
1992 
1993 	key.objectid = node->bytenr;
1994 	key.type = BTRFS_EXTENT_ITEM_KEY;
1995 	key.offset = node->num_bytes;
1996 
1997 	path->reada = 1;
1998 	path->leave_spinning = 1;
1999 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2000 				path, 0, 1);
2001 	if (ret < 0) {
2002 		err = ret;
2003 		goto out;
2004 	}
2005 	if (ret > 0) {
2006 		err = -EIO;
2007 		goto out;
2008 	}
2009 
2010 	leaf = path->nodes[0];
2011 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2012 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2013 	if (item_size < sizeof(*ei)) {
2014 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2015 					     path, (u64)-1, 0);
2016 		if (ret < 0) {
2017 			err = ret;
2018 			goto out;
2019 		}
2020 		leaf = path->nodes[0];
2021 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2022 	}
2023 #endif
2024 	BUG_ON(item_size < sizeof(*ei));
2025 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2026 	__run_delayed_extent_op(extent_op, leaf, ei);
2027 
2028 	btrfs_mark_buffer_dirty(leaf);
2029 out:
2030 	btrfs_free_path(path);
2031 	return err;
2032 }
2033 
2034 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2035 				struct btrfs_root *root,
2036 				struct btrfs_delayed_ref_node *node,
2037 				struct btrfs_delayed_extent_op *extent_op,
2038 				int insert_reserved)
2039 {
2040 	int ret = 0;
2041 	struct btrfs_delayed_tree_ref *ref;
2042 	struct btrfs_key ins;
2043 	u64 parent = 0;
2044 	u64 ref_root = 0;
2045 
2046 	ins.objectid = node->bytenr;
2047 	ins.offset = node->num_bytes;
2048 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2049 
2050 	ref = btrfs_delayed_node_to_tree_ref(node);
2051 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2052 		parent = ref->parent;
2053 	else
2054 		ref_root = ref->root;
2055 
2056 	BUG_ON(node->ref_mod != 1);
2057 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2058 		BUG_ON(!extent_op || !extent_op->update_flags ||
2059 		       !extent_op->update_key);
2060 		ret = alloc_reserved_tree_block(trans, root,
2061 						parent, ref_root,
2062 						extent_op->flags_to_set,
2063 						&extent_op->key,
2064 						ref->level, &ins);
2065 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2066 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2067 					     node->num_bytes, parent, ref_root,
2068 					     ref->level, 0, 1, extent_op);
2069 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2070 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2071 					  node->num_bytes, parent, ref_root,
2072 					  ref->level, 0, 1, extent_op);
2073 	} else {
2074 		BUG();
2075 	}
2076 	return ret;
2077 }
2078 
2079 /* helper function to actually process a single delayed ref entry */
2080 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2081 			       struct btrfs_root *root,
2082 			       struct btrfs_delayed_ref_node *node,
2083 			       struct btrfs_delayed_extent_op *extent_op,
2084 			       int insert_reserved)
2085 {
2086 	int ret;
2087 	if (btrfs_delayed_ref_is_head(node)) {
2088 		struct btrfs_delayed_ref_head *head;
2089 		/*
2090 		 * we've hit the end of the chain and we were supposed
2091 		 * to insert this extent into the tree.  But, it got
2092 		 * deleted before we ever needed to insert it, so all
2093 		 * we have to do is clean up the accounting
2094 		 */
2095 		BUG_ON(extent_op);
2096 		head = btrfs_delayed_node_to_head(node);
2097 		if (insert_reserved) {
2098 			btrfs_pin_extent(root, node->bytenr,
2099 					 node->num_bytes, 1);
2100 			if (head->is_data) {
2101 				ret = btrfs_del_csums(trans, root,
2102 						      node->bytenr,
2103 						      node->num_bytes);
2104 				BUG_ON(ret);
2105 			}
2106 		}
2107 		mutex_unlock(&head->mutex);
2108 		return 0;
2109 	}
2110 
2111 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2112 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2113 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2114 					   insert_reserved);
2115 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2116 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2117 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2118 					   insert_reserved);
2119 	else
2120 		BUG();
2121 	return ret;
2122 }
2123 
2124 static noinline struct btrfs_delayed_ref_node *
2125 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2126 {
2127 	struct rb_node *node;
2128 	struct btrfs_delayed_ref_node *ref;
2129 	int action = BTRFS_ADD_DELAYED_REF;
2130 again:
2131 	/*
2132 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2133 	 * this prevents the ref count from going down to zero when
2134 	 * there are still pending delayed refs.
2135 	 */
2136 	node = rb_prev(&head->node.rb_node);
2137 	while (1) {
2138 		if (!node)
2139 			break;
2140 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2141 				rb_node);
2142 		if (ref->bytenr != head->node.bytenr)
2143 			break;
2144 		if (ref->action == action)
2145 			return ref;
2146 		node = rb_prev(node);
2147 	}
2148 	if (action == BTRFS_ADD_DELAYED_REF) {
2149 		action = BTRFS_DROP_DELAYED_REF;
2150 		goto again;
2151 	}
2152 	return NULL;
2153 }
2154 
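/*
 * Run the delayed refs for the ref heads on @cluster, one locked head at a
 * time.  The return value is the number of entries it dealt with;
 * btrfs_run_delayed_refs() uses it to count down toward its target.
 */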
2155 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2156 				       struct btrfs_root *root,
2157 				       struct list_head *cluster)
2158 {
2159 	struct btrfs_delayed_ref_root *delayed_refs;
2160 	struct btrfs_delayed_ref_node *ref;
2161 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2162 	struct btrfs_delayed_extent_op *extent_op;
2163 	int ret;
2164 	int count = 0;
2165 	int must_insert_reserved = 0;
2166 
2167 	delayed_refs = &trans->transaction->delayed_refs;
2168 	while (1) {
2169 		if (!locked_ref) {
2170 			/* pick a new head ref from the cluster list */
2171 			if (list_empty(cluster))
2172 				break;
2173 
2174 			locked_ref = list_entry(cluster->next,
2175 				     struct btrfs_delayed_ref_head, cluster);
2176 
2177 			/* grab the lock that says we are going to process
2178 			 * all the refs for this head */
2179 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2180 
2181 			/*
2182 			 * we may have dropped the spin lock to get the head
2183 			 * mutex lock, and that might have given someone else
2184 			 * time to free the head.  If that's true, it has been
2185 			 * removed from our list and we can move on.
2186 			 */
2187 			if (ret == -EAGAIN) {
2188 				locked_ref = NULL;
2189 				count++;
2190 				continue;
2191 			}
2192 		}
2193 
2194 		/*
2195 		 * record the must insert reserved flag before we
2196 		 * drop the spin lock.
2197 		 */
2198 		must_insert_reserved = locked_ref->must_insert_reserved;
2199 		locked_ref->must_insert_reserved = 0;
2200 
2201 		extent_op = locked_ref->extent_op;
2202 		locked_ref->extent_op = NULL;
2203 
2204 		/*
2205 		 * locked_ref is the head node, so we have to go one
2206 		 * node back for any delayed ref updates
2207 		 */
2208 		ref = select_delayed_ref(locked_ref);
2209 		if (!ref) {
2210 			/* All delayed refs have been processed.  Go ahead
2211 			 * and send the head node to run_one_delayed_ref,
2212 			 * so that any accounting fixes can happen
2213 			 */
2214 			ref = &locked_ref->node;
2215 
2216 			if (extent_op && must_insert_reserved) {
2217 				kfree(extent_op);
2218 				extent_op = NULL;
2219 			}
2220 
2221 			if (extent_op) {
2222 				spin_unlock(&delayed_refs->lock);
2223 
2224 				ret = run_delayed_extent_op(trans, root,
2225 							    ref, extent_op);
2226 				BUG_ON(ret);
2227 				kfree(extent_op);
2228 
2229 				cond_resched();
2230 				spin_lock(&delayed_refs->lock);
2231 				continue;
2232 			}
2233 
2234 			list_del_init(&locked_ref->cluster);
2235 			locked_ref = NULL;
2236 		}
2237 
2238 		ref->in_tree = 0;
2239 		rb_erase(&ref->rb_node, &delayed_refs->root);
2240 		delayed_refs->num_entries--;
2241 
2242 		spin_unlock(&delayed_refs->lock);
2243 
2244 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2245 					  must_insert_reserved);
2246 		BUG_ON(ret);
2247 
2248 		btrfs_put_delayed_ref(ref);
2249 		kfree(extent_op);
2250 		count++;
2251 
2252 		cond_resched();
2253 		spin_lock(&delayed_refs->lock);
2254 	}
2255 	return count;
2256 }
2257 
2258 /*
2259  * this starts processing the delayed reference count updates and
2260  * extent insertions we have queued up so far.  count can be
2261  * 0, which means to process everything in the tree at the start
2262  * of the run (but not newly added entries), or it can be some target
2263  * number you'd like to process.
2264  */
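/*
 * For example, btrfs_write_dirty_block_groups() below passes
 * (unsigned long)-1 so that every delayed ref queued so far is processed
 * before the block group items are written out.
 */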
2265 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2266 			   struct btrfs_root *root, unsigned long count)
2267 {
2268 	struct rb_node *node;
2269 	struct btrfs_delayed_ref_root *delayed_refs;
2270 	struct btrfs_delayed_ref_node *ref;
2271 	struct list_head cluster;
2272 	int ret;
2273 	int run_all = count == (unsigned long)-1;
2274 	int run_most = 0;
2275 
2276 	if (root == root->fs_info->extent_root)
2277 		root = root->fs_info->tree_root;
2278 
2279 	delayed_refs = &trans->transaction->delayed_refs;
2280 	INIT_LIST_HEAD(&cluster);
2281 again:
2282 	spin_lock(&delayed_refs->lock);
2283 	if (count == 0) {
2284 		count = delayed_refs->num_entries * 2;
2285 		run_most = 1;
2286 	}
2287 	while (1) {
2288 		if (!(run_all || run_most) &&
2289 		    delayed_refs->num_heads_ready < 64)
2290 			break;
2291 
2292 		/*
2293 		 * go find something we can process in the rbtree.  We start at
2294 		 * the beginning of the tree, and then build a cluster
2295 		 * of refs to process starting at the first one we are able to
2296 		 * lock
2297 		 */
2298 		ret = btrfs_find_ref_cluster(trans, &cluster,
2299 					     delayed_refs->run_delayed_start);
2300 		if (ret)
2301 			break;
2302 
2303 		ret = run_clustered_refs(trans, root, &cluster);
2304 		BUG_ON(ret < 0);
2305 
2306 		count -= min_t(unsigned long, ret, count);
2307 
2308 		if (count == 0)
2309 			break;
2310 	}
2311 
2312 	if (run_all) {
2313 		node = rb_first(&delayed_refs->root);
2314 		if (!node)
2315 			goto out;
2316 		count = (unsigned long)-1;
2317 
2318 		while (node) {
2319 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2320 				       rb_node);
2321 			if (btrfs_delayed_ref_is_head(ref)) {
2322 				struct btrfs_delayed_ref_head *head;
2323 
2324 				head = btrfs_delayed_node_to_head(ref);
2325 				atomic_inc(&ref->refs);
2326 
2327 				spin_unlock(&delayed_refs->lock);
2328 				/*
2329 				 * Mutex was contended, block until it's
2330 				 * released and try again
2331 				 */
2332 				mutex_lock(&head->mutex);
2333 				mutex_unlock(&head->mutex);
2334 
2335 				btrfs_put_delayed_ref(ref);
2336 				cond_resched();
2337 				goto again;
2338 			}
2339 			node = rb_next(node);
2340 		}
2341 		spin_unlock(&delayed_refs->lock);
2342 		schedule_timeout(1);
2343 		goto again;
2344 	}
2345 out:
2346 	spin_unlock(&delayed_refs->lock);
2347 	return 0;
2348 }
2349 
2350 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2351 				struct btrfs_root *root,
2352 				u64 bytenr, u64 num_bytes, u64 flags,
2353 				int is_data)
2354 {
2355 	struct btrfs_delayed_extent_op *extent_op;
2356 	int ret;
2357 
2358 	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2359 	if (!extent_op)
2360 		return -ENOMEM;
2361 
2362 	extent_op->flags_to_set = flags;
2363 	extent_op->update_flags = 1;
2364 	extent_op->update_key = 0;
2365 	extent_op->is_data = is_data ? 1 : 0;
2366 
2367 	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2368 	if (ret)
2369 		kfree(extent_op);
2370 	return ret;
2371 }
2372 
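/*
 * Check whether any delayed ref other than our own data ref is pending on
 * @bytenr.  Returns 0 if the only pending ref is a data ref from this root,
 * inode and offset, 1 if some other ref is pending, -ENOENT if there are no
 * pending delayed refs for this extent, and -EAGAIN if the head mutex was
 * contended and the caller should retry.
 */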
2373 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2374 				      struct btrfs_root *root,
2375 				      struct btrfs_path *path,
2376 				      u64 objectid, u64 offset, u64 bytenr)
2377 {
2378 	struct btrfs_delayed_ref_head *head;
2379 	struct btrfs_delayed_ref_node *ref;
2380 	struct btrfs_delayed_data_ref *data_ref;
2381 	struct btrfs_delayed_ref_root *delayed_refs;
2382 	struct rb_node *node;
2383 	int ret = 0;
2384 
2385 	ret = -ENOENT;
2386 	delayed_refs = &trans->transaction->delayed_refs;
2387 	spin_lock(&delayed_refs->lock);
2388 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2389 	if (!head)
2390 		goto out;
2391 
2392 	if (!mutex_trylock(&head->mutex)) {
2393 		atomic_inc(&head->node.refs);
2394 		spin_unlock(&delayed_refs->lock);
2395 
2396 		btrfs_release_path(path);
2397 
2398 		/*
2399 		 * Mutex was contended, block until it's released and let
2400 		 * caller try again
2401 		 */
2402 		mutex_lock(&head->mutex);
2403 		mutex_unlock(&head->mutex);
2404 		btrfs_put_delayed_ref(&head->node);
2405 		return -EAGAIN;
2406 	}
2407 
2408 	node = rb_prev(&head->node.rb_node);
2409 	if (!node)
2410 		goto out_unlock;
2411 
2412 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2413 
2414 	if (ref->bytenr != bytenr)
2415 		goto out_unlock;
2416 
2417 	ret = 1;
2418 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2419 		goto out_unlock;
2420 
2421 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2422 
2423 	node = rb_prev(node);
2424 	if (node) {
2425 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2426 		if (ref->bytenr == bytenr)
2427 			goto out_unlock;
2428 	}
2429 
2430 	if (data_ref->root != root->root_key.objectid ||
2431 	    data_ref->objectid != objectid || data_ref->offset != offset)
2432 		goto out_unlock;
2433 
2434 	ret = 0;
2435 out_unlock:
2436 	mutex_unlock(&head->mutex);
2437 out:
2438 	spin_unlock(&delayed_refs->lock);
2439 	return ret;
2440 }
2441 
2442 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2443 					struct btrfs_root *root,
2444 					struct btrfs_path *path,
2445 					u64 objectid, u64 offset, u64 bytenr)
2446 {
2447 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2448 	struct extent_buffer *leaf;
2449 	struct btrfs_extent_data_ref *ref;
2450 	struct btrfs_extent_inline_ref *iref;
2451 	struct btrfs_extent_item *ei;
2452 	struct btrfs_key key;
2453 	u32 item_size;
2454 	int ret;
2455 
2456 	key.objectid = bytenr;
2457 	key.offset = (u64)-1;
2458 	key.type = BTRFS_EXTENT_ITEM_KEY;
2459 
2460 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2461 	if (ret < 0)
2462 		goto out;
2463 	BUG_ON(ret == 0);
2464 
2465 	ret = -ENOENT;
2466 	if (path->slots[0] == 0)
2467 		goto out;
2468 
2469 	path->slots[0]--;
2470 	leaf = path->nodes[0];
2471 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2472 
2473 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2474 		goto out;
2475 
2476 	ret = 1;
2477 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2478 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2479 	if (item_size < sizeof(*ei)) {
2480 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2481 		goto out;
2482 	}
2483 #endif
2484 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2485 
2486 	if (item_size != sizeof(*ei) +
2487 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2488 		goto out;
2489 
2490 	if (btrfs_extent_generation(leaf, ei) <=
2491 	    btrfs_root_last_snapshot(&root->root_item))
2492 		goto out;
2493 
2494 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2495 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2496 	    BTRFS_EXTENT_DATA_REF_KEY)
2497 		goto out;
2498 
2499 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2500 	if (btrfs_extent_refs(leaf, ei) !=
2501 	    btrfs_extent_data_ref_count(leaf, ref) ||
2502 	    btrfs_extent_data_ref_root(leaf, ref) !=
2503 	    root->root_key.objectid ||
2504 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2505 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2506 		goto out;
2507 
2508 	ret = 0;
2509 out:
2510 	return ret;
2511 }
2512 
2513 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2514 			  struct btrfs_root *root,
2515 			  u64 objectid, u64 offset, u64 bytenr)
2516 {
2517 	struct btrfs_path *path;
2518 	int ret;
2519 	int ret2;
2520 
2521 	path = btrfs_alloc_path();
2522 	if (!path)
2523 		return -ENOENT;
2524 
2525 	do {
2526 		ret = check_committed_ref(trans, root, path, objectid,
2527 					  offset, bytenr);
2528 		if (ret && ret != -ENOENT)
2529 			goto out;
2530 
2531 		ret2 = check_delayed_ref(trans, root, path, objectid,
2532 					 offset, bytenr);
2533 	} while (ret2 == -EAGAIN);
2534 
2535 	if (ret2 && ret2 != -ENOENT) {
2536 		ret = ret2;
2537 		goto out;
2538 	}
2539 
2540 	if (ret != -ENOENT || ret2 != -ENOENT)
2541 		ret = 0;
2542 out:
2543 	btrfs_free_path(path);
2544 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2545 		WARN_ON(ret > 0);
2546 	return ret;
2547 }
2548 
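/*
 * Walk every pointer in @buf (file extent items in a leaf, block pointers
 * in a node) and add or drop one reference on each, depending on @inc.
 * btrfs_inc_ref() and btrfs_dec_ref() below are thin wrappers around this.
 */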
2549 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2550 			   struct btrfs_root *root,
2551 			   struct extent_buffer *buf,
2552 			   int full_backref, int inc)
2553 {
2554 	u64 bytenr;
2555 	u64 num_bytes;
2556 	u64 parent;
2557 	u64 ref_root;
2558 	u32 nritems;
2559 	struct btrfs_key key;
2560 	struct btrfs_file_extent_item *fi;
2561 	int i;
2562 	int level;
2563 	int ret = 0;
2564 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2565 			    u64, u64, u64, u64, u64, u64);
2566 
2567 	ref_root = btrfs_header_owner(buf);
2568 	nritems = btrfs_header_nritems(buf);
2569 	level = btrfs_header_level(buf);
2570 
2571 	if (!root->ref_cows && level == 0)
2572 		return 0;
2573 
2574 	if (inc)
2575 		process_func = btrfs_inc_extent_ref;
2576 	else
2577 		process_func = btrfs_free_extent;
2578 
2579 	if (full_backref)
2580 		parent = buf->start;
2581 	else
2582 		parent = 0;
2583 
2584 	for (i = 0; i < nritems; i++) {
2585 		if (level == 0) {
2586 			btrfs_item_key_to_cpu(buf, &key, i);
2587 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2588 				continue;
2589 			fi = btrfs_item_ptr(buf, i,
2590 					    struct btrfs_file_extent_item);
2591 			if (btrfs_file_extent_type(buf, fi) ==
2592 			    BTRFS_FILE_EXTENT_INLINE)
2593 				continue;
2594 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2595 			if (bytenr == 0)
2596 				continue;
2597 
2598 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2599 			key.offset -= btrfs_file_extent_offset(buf, fi);
2600 			ret = process_func(trans, root, bytenr, num_bytes,
2601 					   parent, ref_root, key.objectid,
2602 					   key.offset);
2603 			if (ret)
2604 				goto fail;
2605 		} else {
2606 			bytenr = btrfs_node_blockptr(buf, i);
2607 			num_bytes = btrfs_level_size(root, level - 1);
2608 			ret = process_func(trans, root, bytenr, num_bytes,
2609 					   parent, ref_root, level - 1, 0);
2610 			if (ret)
2611 				goto fail;
2612 		}
2613 	}
2614 	return 0;
2615 fail:
2616 	BUG();
2617 	return ret;
2618 }
2619 
2620 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2621 		  struct extent_buffer *buf, int full_backref)
2622 {
2623 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2624 }
2625 
2626 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2627 		  struct extent_buffer *buf, int full_backref)
2628 {
2629 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2630 }
2631 
2632 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2633 				 struct btrfs_root *root,
2634 				 struct btrfs_path *path,
2635 				 struct btrfs_block_group_cache *cache)
2636 {
2637 	int ret;
2638 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2639 	unsigned long bi;
2640 	struct extent_buffer *leaf;
2641 
2642 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2643 	if (ret < 0)
2644 		goto fail;
2645 	BUG_ON(ret);
2646 
2647 	leaf = path->nodes[0];
2648 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2649 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2650 	btrfs_mark_buffer_dirty(leaf);
2651 	btrfs_release_path(path);
2652 fail:
2653 	if (ret)
2654 		return ret;
2655 	return 0;
2656 
2657 }
2658 
2659 static struct btrfs_block_group_cache *
2660 next_block_group(struct btrfs_root *root,
2661 		 struct btrfs_block_group_cache *cache)
2662 {
2663 	struct rb_node *node;
2664 	spin_lock(&root->fs_info->block_group_cache_lock);
2665 	node = rb_next(&cache->cache_node);
2666 	btrfs_put_block_group(cache);
2667 	if (node) {
2668 		cache = rb_entry(node, struct btrfs_block_group_cache,
2669 				 cache_node);
2670 		btrfs_get_block_group(cache);
2671 	} else
2672 		cache = NULL;
2673 	spin_unlock(&root->fs_info->block_group_cache_lock);
2674 	return cache;
2675 }
2676 
2677 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2678 			    struct btrfs_trans_handle *trans,
2679 			    struct btrfs_path *path)
2680 {
2681 	struct btrfs_root *root = block_group->fs_info->tree_root;
2682 	struct inode *inode = NULL;
2683 	u64 alloc_hint = 0;
2684 	int dcs = BTRFS_DC_ERROR;
2685 	int num_pages = 0;
2686 	int retries = 0;
2687 	int ret = 0;
2688 
2689 	/*
2690 	 * If this block group is smaller than 100 megs, don't bother caching
2691 	 * its free space.
2692 	 */
2693 	if (block_group->key.offset < (100 * 1024 * 1024)) {
2694 		spin_lock(&block_group->lock);
2695 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2696 		spin_unlock(&block_group->lock);
2697 		return 0;
2698 	}
2699 
2700 again:
2701 	inode = lookup_free_space_inode(root, block_group, path);
2702 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2703 		ret = PTR_ERR(inode);
2704 		btrfs_release_path(path);
2705 		goto out;
2706 	}
2707 
2708 	if (IS_ERR(inode)) {
2709 		BUG_ON(retries);
2710 		retries++;
2711 
2712 		if (block_group->ro)
2713 			goto out_free;
2714 
2715 		ret = create_free_space_inode(root, trans, block_group, path);
2716 		if (ret)
2717 			goto out_free;
2718 		goto again;
2719 	}
2720 
2721 	/* We've already setup this transaction, go ahead and exit */
2722 	if (block_group->cache_generation == trans->transid &&
2723 	    i_size_read(inode)) {
2724 		dcs = BTRFS_DC_SETUP;
2725 		goto out_put;
2726 	}
2727 
2728 	/*
2729 	 * We want to set the generation to 0, that way if anything goes wrong
2730 	 * from here on out we know not to trust this cache when we load up next
2731 	 * time.
2732 	 */
2733 	BTRFS_I(inode)->generation = 0;
2734 	ret = btrfs_update_inode(trans, root, inode);
2735 	WARN_ON(ret);
2736 
2737 	if (i_size_read(inode) > 0) {
2738 		ret = btrfs_truncate_free_space_cache(root, trans, path,
2739 						      inode);
2740 		if (ret)
2741 			goto out_put;
2742 	}
2743 
2744 	spin_lock(&block_group->lock);
2745 	if (block_group->cached != BTRFS_CACHE_FINISHED) {
2746 		/* We're not cached, don't bother trying to write stuff out */
2747 		dcs = BTRFS_DC_WRITTEN;
2748 		spin_unlock(&block_group->lock);
2749 		goto out_put;
2750 	}
2751 	spin_unlock(&block_group->lock);
2752 
2753 	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2754 	if (!num_pages)
2755 		num_pages = 1;
2756 
2757 	/*
2758 	 * Just to make absolutely sure we have enough space, we're going to
2759 	 * preallocate 16 pages worth of space per gigabyte of block group.  In
2760 	 * practice we ought to use at most 8, but we need extra space so we can
2761 	 * add our header and have a terminator between the extents and the
2762 	 * bitmaps.
2763 	 */
2764 	num_pages *= 16;
2765 	num_pages *= PAGE_CACHE_SIZE;
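	/*
	 * For example, assuming 4K pages (PAGE_CACHE_SIZE), a 10GB block
	 * group ends up with 10 * 16 * 4K = 640K of cache space preallocated
	 * here.
	 */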
2766 
2767 	ret = btrfs_check_data_free_space(inode, num_pages);
2768 	if (ret)
2769 		goto out_put;
2770 
2771 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2772 					      num_pages, num_pages,
2773 					      &alloc_hint);
2774 	if (!ret)
2775 		dcs = BTRFS_DC_SETUP;
2776 	btrfs_free_reserved_data_space(inode, num_pages);
2777 
2778 out_put:
2779 	iput(inode);
2780 out_free:
2781 	btrfs_release_path(path);
2782 out:
2783 	spin_lock(&block_group->lock);
2784 	if (!ret)
2785 		block_group->cache_generation = trans->transid;
2786 	block_group->disk_cache_state = dcs;
2787 	spin_unlock(&block_group->lock);
2788 
2789 	return ret;
2790 }
2791 
2792 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2793 				   struct btrfs_root *root)
2794 {
2795 	struct btrfs_block_group_cache *cache;
2796 	int err = 0;
2797 	struct btrfs_path *path;
2798 	u64 last = 0;
2799 
2800 	path = btrfs_alloc_path();
2801 	if (!path)
2802 		return -ENOMEM;
2803 
2804 again:
2805 	while (1) {
2806 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2807 		while (cache) {
2808 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2809 				break;
2810 			cache = next_block_group(root, cache);
2811 		}
2812 		if (!cache) {
2813 			if (last == 0)
2814 				break;
2815 			last = 0;
2816 			continue;
2817 		}
2818 		err = cache_save_setup(cache, trans, path);
2819 		last = cache->key.objectid + cache->key.offset;
2820 		btrfs_put_block_group(cache);
2821 	}
2822 
2823 	while (1) {
2824 		if (last == 0) {
2825 			err = btrfs_run_delayed_refs(trans, root,
2826 						     (unsigned long)-1);
2827 			BUG_ON(err);
2828 		}
2829 
2830 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2831 		while (cache) {
2832 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2833 				btrfs_put_block_group(cache);
2834 				goto again;
2835 			}
2836 
2837 			if (cache->dirty)
2838 				break;
2839 			cache = next_block_group(root, cache);
2840 		}
2841 		if (!cache) {
2842 			if (last == 0)
2843 				break;
2844 			last = 0;
2845 			continue;
2846 		}
2847 
2848 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
2849 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2850 		cache->dirty = 0;
2851 		last = cache->key.objectid + cache->key.offset;
2852 
2853 		err = write_one_cache_group(trans, root, path, cache);
2854 		BUG_ON(err);
2855 		btrfs_put_block_group(cache);
2856 	}
2857 
2858 	while (1) {
2859 		/*
2860 		 * I don't think this is needed since we're just marking our
2861 		 * preallocated extent as written, but it can't hurt to do it
2862 		 * just in case.
2863 		 */
2864 		if (last == 0) {
2865 			err = btrfs_run_delayed_refs(trans, root,
2866 						     (unsigned long)-1);
2867 			BUG_ON(err);
2868 		}
2869 
2870 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
2871 		while (cache) {
2872 			/*
2873 			 * Really this shouldn't happen, but it could if we
2874 			 * couldn't write the entire preallocated extent and
2875 			 * splitting the extent resulted in a new block.
2876 			 */
2877 			if (cache->dirty) {
2878 				btrfs_put_block_group(cache);
2879 				goto again;
2880 			}
2881 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2882 				break;
2883 			cache = next_block_group(root, cache);
2884 		}
2885 		if (!cache) {
2886 			if (last == 0)
2887 				break;
2888 			last = 0;
2889 			continue;
2890 		}
2891 
2892 		btrfs_write_out_cache(root, trans, cache, path);
2893 
2894 		/*
2895 		 * If we didn't have an error then the cache state is still
2896 		 * NEED_WRITE, so we can set it to WRITTEN.
2897 		 */
2898 		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2899 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
2900 		last = cache->key.objectid + cache->key.offset;
2901 		btrfs_put_block_group(cache);
2902 	}
2903 
2904 	btrfs_free_path(path);
2905 	return 0;
2906 }
2907 
2908 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2909 {
2910 	struct btrfs_block_group_cache *block_group;
2911 	int readonly = 0;
2912 
2913 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2914 	if (!block_group || block_group->ro)
2915 		readonly = 1;
2916 	if (block_group)
2917 		btrfs_put_block_group(block_group);
2918 	return readonly;
2919 }
2920 
2921 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2922 			     u64 total_bytes, u64 bytes_used,
2923 			     struct btrfs_space_info **space_info)
2924 {
2925 	struct btrfs_space_info *found;
2926 	int i;
2927 	int factor;
2928 
2929 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2930 		     BTRFS_BLOCK_GROUP_RAID10))
2931 		factor = 2;
2932 	else
2933 		factor = 1;
2934 
2935 	found = __find_space_info(info, flags);
2936 	if (found) {
2937 		spin_lock(&found->lock);
2938 		found->total_bytes += total_bytes;
2939 		found->disk_total += total_bytes * factor;
2940 		found->bytes_used += bytes_used;
2941 		found->disk_used += bytes_used * factor;
2942 		found->full = 0;
2943 		spin_unlock(&found->lock);
2944 		*space_info = found;
2945 		return 0;
2946 	}
2947 	found = kzalloc(sizeof(*found), GFP_NOFS);
2948 	if (!found)
2949 		return -ENOMEM;
2950 
2951 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2952 		INIT_LIST_HEAD(&found->block_groups[i]);
2953 	init_rwsem(&found->groups_sem);
2954 	spin_lock_init(&found->lock);
2955 	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2956 				BTRFS_BLOCK_GROUP_SYSTEM |
2957 				BTRFS_BLOCK_GROUP_METADATA);
2958 	found->total_bytes = total_bytes;
2959 	found->disk_total = total_bytes * factor;
2960 	found->bytes_used = bytes_used;
2961 	found->disk_used = bytes_used * factor;
2962 	found->bytes_pinned = 0;
2963 	found->bytes_reserved = 0;
2964 	found->bytes_readonly = 0;
2965 	found->bytes_may_use = 0;
2966 	found->full = 0;
2967 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2968 	found->chunk_alloc = 0;
2969 	found->flush = 0;
2970 	init_waitqueue_head(&found->wait);
2971 	*space_info = found;
2972 	list_add_rcu(&found->list, &info->space_info);
2973 	return 0;
2974 }
2975 
2976 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2977 {
2978 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2979 				   BTRFS_BLOCK_GROUP_RAID1 |
2980 				   BTRFS_BLOCK_GROUP_RAID10 |
2981 				   BTRFS_BLOCK_GROUP_DUP);
2982 	if (extra_flags) {
2983 		if (flags & BTRFS_BLOCK_GROUP_DATA)
2984 			fs_info->avail_data_alloc_bits |= extra_flags;
2985 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
2986 			fs_info->avail_metadata_alloc_bits |= extra_flags;
2987 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2988 			fs_info->avail_system_alloc_bits |= extra_flags;
2989 	}
2990 }
2991 
2992 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2993 {
2994 	/*
2995 	 * we add in the count of missing devices because we want
2996 	 * to make sure that any RAID levels on a degraded FS
2997 	 * continue to be honored.
2998 	 */
2999 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3000 		root->fs_info->fs_devices->missing_devices;
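	/*
	 * e.g. a two-device RAID1 filesystem with one device missing still
	 * counts both devices here and keeps RAID1, while a genuinely
	 * single-device filesystem has RAID0/RAID1 stripped below.
	 */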
3001 
3002 	if (num_devices == 1)
3003 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3004 	if (num_devices < 4)
3005 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3006 
3007 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3008 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3009 		      BTRFS_BLOCK_GROUP_RAID10))) {
3010 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
3011 	}
3012 
3013 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3014 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3015 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3016 	}
3017 
3018 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3019 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3020 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
3021 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
3022 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3023 	return flags;
3024 }
3025 
3026 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3027 {
3028 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3029 		flags |= root->fs_info->avail_data_alloc_bits &
3030 			 root->fs_info->data_alloc_profile;
3031 	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3032 		flags |= root->fs_info->avail_system_alloc_bits &
3033 			 root->fs_info->system_alloc_profile;
3034 	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3035 		flags |= root->fs_info->avail_metadata_alloc_bits &
3036 			 root->fs_info->metadata_alloc_profile;
3037 	return btrfs_reduce_alloc_profile(root, flags);
3038 }
3039 
3040 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3041 {
3042 	u64 flags;
3043 
3044 	if (data)
3045 		flags = BTRFS_BLOCK_GROUP_DATA;
3046 	else if (root == root->fs_info->chunk_root)
3047 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3048 	else
3049 		flags = BTRFS_BLOCK_GROUP_METADATA;
3050 
3051 	return get_alloc_profile(root, flags);
3052 }
3053 
3054 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3055 {
3056 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3057 						       BTRFS_BLOCK_GROUP_DATA);
3058 }
3059 
3060 /*
3061  * This will check the space that the inode allocates from to make sure we have
3062  * enough space for the requested number of bytes.
3063  */
3064 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3065 {
3066 	struct btrfs_space_info *data_sinfo;
3067 	struct btrfs_root *root = BTRFS_I(inode)->root;
3068 	u64 used;
3069 	int ret = 0, committed = 0, alloc_chunk = 1;
3070 
3071 	/* make sure bytes are sectorsize aligned */
3072 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
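	/* e.g. with a 4K sectorsize, a 5000 byte request rounds up to 8192 */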
3073 
3074 	if (root == root->fs_info->tree_root ||
3075 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3076 		alloc_chunk = 0;
3077 		committed = 1;
3078 	}
3079 
3080 	data_sinfo = BTRFS_I(inode)->space_info;
3081 	if (!data_sinfo)
3082 		goto alloc;
3083 
3084 again:
3085 	/* make sure we have enough space to handle the data first */
3086 	spin_lock(&data_sinfo->lock);
3087 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3088 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3089 		data_sinfo->bytes_may_use;
3090 
3091 	if (used + bytes > data_sinfo->total_bytes) {
3092 		struct btrfs_trans_handle *trans;
3093 
3094 		/*
3095 		 * if we don't have enough free bytes in this space then we need
3096 		 * to alloc a new chunk.
3097 		 */
3098 		if (!data_sinfo->full && alloc_chunk) {
3099 			u64 alloc_target;
3100 
3101 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3102 			spin_unlock(&data_sinfo->lock);
3103 alloc:
3104 			alloc_target = btrfs_get_alloc_profile(root, 1);
3105 			trans = btrfs_join_transaction(root);
3106 			if (IS_ERR(trans))
3107 				return PTR_ERR(trans);
3108 
3109 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3110 					     bytes + 2 * 1024 * 1024,
3111 					     alloc_target,
3112 					     CHUNK_ALLOC_NO_FORCE);
3113 			btrfs_end_transaction(trans, root);
3114 			if (ret < 0) {
3115 				if (ret != -ENOSPC)
3116 					return ret;
3117 				else
3118 					goto commit_trans;
3119 			}
3120 
3121 			if (!data_sinfo) {
3122 				btrfs_set_inode_space_info(root, inode);
3123 				data_sinfo = BTRFS_I(inode)->space_info;
3124 			}
3125 			goto again;
3126 		}
3127 
3128 		/*
3129 		 * If we have fewer pinned bytes than we want to allocate then
3130 		 * don't bother committing the transaction; it won't help us.
3131 		 */
3132 		if (data_sinfo->bytes_pinned < bytes)
3133 			committed = 1;
3134 		spin_unlock(&data_sinfo->lock);
3135 
3136 		/* commit the current transaction and try again */
3137 commit_trans:
3138 		if (!committed &&
3139 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3140 			committed = 1;
3141 			trans = btrfs_join_transaction(root);
3142 			if (IS_ERR(trans))
3143 				return PTR_ERR(trans);
3144 			ret = btrfs_commit_transaction(trans, root);
3145 			if (ret)
3146 				return ret;
3147 			goto again;
3148 		}
3149 
3150 		return -ENOSPC;
3151 	}
3152 	data_sinfo->bytes_may_use += bytes;
3153 	spin_unlock(&data_sinfo->lock);
3154 
3155 	return 0;
3156 }
3157 
3158 /*
3159  * Called if we need to clear a data reservation for this inode.
3160  */
3161 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3162 {
3163 	struct btrfs_root *root = BTRFS_I(inode)->root;
3164 	struct btrfs_space_info *data_sinfo;
3165 
3166 	/* make sure bytes are sectorsize aligned */
3167 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3168 
3169 	data_sinfo = BTRFS_I(inode)->space_info;
3170 	spin_lock(&data_sinfo->lock);
3171 	data_sinfo->bytes_may_use -= bytes;
3172 	spin_unlock(&data_sinfo->lock);
3173 }
3174 
3175 static void force_metadata_allocation(struct btrfs_fs_info *info)
3176 {
3177 	struct list_head *head = &info->space_info;
3178 	struct btrfs_space_info *found;
3179 
3180 	rcu_read_lock();
3181 	list_for_each_entry_rcu(found, head, list) {
3182 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3183 			found->force_alloc = CHUNK_ALLOC_FORCE;
3184 	}
3185 	rcu_read_unlock();
3186 }
3187 
3188 static int should_alloc_chunk(struct btrfs_root *root,
3189 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
3190 			      int force)
3191 {
3192 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3193 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3194 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3195 	u64 thresh;
3196 
3197 	if (force == CHUNK_ALLOC_FORCE)
3198 		return 1;
3199 
3200 	/*
3201 	 * We need to take into account the global rsv because for all intents
3202 	 * and purposes it's used space.  Don't worry about locking the
3203 	 * global_rsv, it doesn't change except when the transaction commits.
3204 	 */
3205 	num_allocated += global_rsv->size;
3206 
3207 	/*
3208 	 * in limited mode, we want to have some free space up to
3209 	 * about 1% of the FS size.
3210 	 */
3211 	if (force == CHUNK_ALLOC_LIMITED) {
3212 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3213 		thresh = max_t(u64, 64 * 1024 * 1024,
3214 			       div_factor_fine(thresh, 1));
3215 
3216 		if (num_bytes - num_allocated < thresh)
3217 			return 1;
3218 	}
3219 
3220 	/*
3221 	 * we have two similar checks here, one based on a percentage
3222 	 * and one based on a hard number of 256MB.  The idea
3223 	 * is that if we have a good amount of free
3224 	 * room, don't allocate a chunk.  A good amount is
3225 	 * less than 80% of the chunks we have allocated being utilized,
3226 	 * or more than 256MB free
3227 	 */
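	/*
	 * For example, with 10GB of usable space the two checks below skip
	 * the allocation until less than 256MB would remain free after it,
	 * since the 80% threshold (8GB) is passed well before that point.
	 */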
3228 	if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3229 		return 0;
3230 
3231 	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3232 		return 0;
3233 
3234 	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3235 
3236 	/* 256MB or 5% of the FS */
3237 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3238 
3239 	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3240 		return 0;
3241 	return 1;
3242 }
3243 
3244 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3245 			  struct btrfs_root *extent_root, u64 alloc_bytes,
3246 			  u64 flags, int force)
3247 {
3248 	struct btrfs_space_info *space_info;
3249 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3250 	int wait_for_alloc = 0;
3251 	int ret = 0;
3252 
3253 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
3254 
3255 	space_info = __find_space_info(extent_root->fs_info, flags);
3256 	if (!space_info) {
3257 		ret = update_space_info(extent_root->fs_info, flags,
3258 					0, 0, &space_info);
3259 		BUG_ON(ret);
3260 	}
3261 	BUG_ON(!space_info);
3262 
3263 again:
3264 	spin_lock(&space_info->lock);
3265 	if (space_info->force_alloc)
3266 		force = space_info->force_alloc;
3267 	if (space_info->full) {
3268 		spin_unlock(&space_info->lock);
3269 		return 0;
3270 	}
3271 
3272 	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3273 		spin_unlock(&space_info->lock);
3274 		return 0;
3275 	} else if (space_info->chunk_alloc) {
3276 		wait_for_alloc = 1;
3277 	} else {
3278 		space_info->chunk_alloc = 1;
3279 	}
3280 
3281 	spin_unlock(&space_info->lock);
3282 
3283 	mutex_lock(&fs_info->chunk_mutex);
3284 
3285 	/*
3286 	 * The chunk_mutex is held throughout the entirety of a chunk
3287 	 * allocation, so once we've acquired the chunk_mutex we know that the
3288 	 * other guy is done and we need to recheck and see if we should
3289 	 * allocate.
3290 	 */
3291 	if (wait_for_alloc) {
3292 		mutex_unlock(&fs_info->chunk_mutex);
3293 		wait_for_alloc = 0;
3294 		goto again;
3295 	}
3296 
3297 	/*
3298 	 * If we have mixed data/metadata chunks we want to make sure we keep
3299 	 * allocating mixed chunks instead of individual chunks.
3300 	 */
3301 	if (btrfs_mixed_space_info(space_info))
3302 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3303 
3304 	/*
3305 	 * if we're doing a data chunk, go ahead and make sure that
3306 	 * we keep a reasonable number of metadata chunks allocated in the
3307 	 * FS as well.
3308 	 */
3309 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3310 		fs_info->data_chunk_allocations++;
3311 		if (!(fs_info->data_chunk_allocations %
3312 		      fs_info->metadata_ratio))
3313 			force_metadata_allocation(fs_info);
3314 	}
3315 
3316 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3317 	if (ret < 0 && ret != -ENOSPC)
3318 		goto out;
3319 
3320 	spin_lock(&space_info->lock);
3321 	if (ret)
3322 		space_info->full = 1;
3323 	else
3324 		ret = 1;
3325 
3326 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3327 	space_info->chunk_alloc = 0;
3328 	spin_unlock(&space_info->lock);
3329 out:
3330 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
3331 	return ret;
3332 }
3333 
3334 /*
3335  * shrink metadata reservation for delalloc
3336  */
3337 static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3338 			   bool wait_ordered)
3339 {
3340 	struct btrfs_block_rsv *block_rsv;
3341 	struct btrfs_space_info *space_info;
3342 	struct btrfs_trans_handle *trans;
3343 	u64 reserved;
3344 	u64 max_reclaim;
3345 	u64 reclaimed = 0;
3346 	long time_left;
3347 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3348 	int loops = 0;
3349 	unsigned long progress;
3350 
3351 	trans = (struct btrfs_trans_handle *)current->journal_info;
3352 	block_rsv = &root->fs_info->delalloc_block_rsv;
3353 	space_info = block_rsv->space_info;
3354 
3355 	smp_mb();
3356 	reserved = space_info->bytes_may_use;
3357 	progress = space_info->reservation_progress;
3358 
3359 	if (reserved == 0)
3360 		return 0;
3361 
3362 	smp_mb();
3363 	if (root->fs_info->delalloc_bytes == 0) {
3364 		if (trans)
3365 			return 0;
3366 		btrfs_wait_ordered_extents(root, 0, 0);
3367 		return 0;
3368 	}
3369 
3370 	max_reclaim = min(reserved, to_reclaim);
3371 	nr_pages = max_t(unsigned long, nr_pages,
3372 			 max_reclaim >> PAGE_CACHE_SHIFT);
3373 	while (loops < 1024) {
3374 		/* have the flusher threads jump in and do some IO */
3375 		smp_mb();
3376 		nr_pages = min_t(unsigned long, nr_pages,
3377 		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3378 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3379 						WB_REASON_FS_FREE_SPACE);
3380 
3381 		spin_lock(&space_info->lock);
3382 		if (reserved > space_info->bytes_may_use)
3383 			reclaimed += reserved - space_info->bytes_may_use;
3384 		reserved = space_info->bytes_may_use;
3385 		spin_unlock(&space_info->lock);
3386 
3387 		loops++;
3388 
3389 		if (reserved == 0 || reclaimed >= max_reclaim)
3390 			break;
3391 
3392 		if (trans && trans->transaction->blocked)
3393 			return -EAGAIN;
3394 
3395 		if (wait_ordered && !trans) {
3396 			btrfs_wait_ordered_extents(root, 0, 0);
3397 		} else {
3398 			time_left = schedule_timeout_interruptible(1);
3399 
3400 			/* We were interrupted, exit */
3401 			if (time_left)
3402 				break;
3403 		}
3404 
3405 		/* we've kicked the IO a few times, if anything has been freed,
3406 		 * exit.  There is no sense in looping here for a long time
3407 		 * when we really need to commit the transaction, or there are
3408 		 * just too many writers without enough free space
3409 		 */
3410 
3411 		if (loops > 3) {
3412 			smp_mb();
3413 			if (progress != space_info->reservation_progress)
3414 				break;
3415 		}
3416 
3417 	}
3418 
3419 	return reclaimed >= to_reclaim;
3420 }
3421 
3422 /**
3423  * may_commit_transaction - possibly commit the transaction if it's ok to
3424  * @root - the root we're allocating for
3425  * @bytes - the number of bytes we want to reserve
3426  * @force - force the commit
3427  *
3428  * This will check to make sure that committing the transaction will actually
3429  * get us somewhere and then commit the transaction if it does.  Otherwise it
3430  * will return -ENOSPC.
3431  */
3432 static int may_commit_transaction(struct btrfs_root *root,
3433 				  struct btrfs_space_info *space_info,
3434 				  u64 bytes, int force)
3435 {
3436 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3437 	struct btrfs_trans_handle *trans;
3438 
3439 	trans = (struct btrfs_trans_handle *)current->journal_info;
3440 	if (trans)
3441 		return -EAGAIN;
3442 
3443 	if (force)
3444 		goto commit;
3445 
3446 	/* See if there is enough pinned space to make this reservation */
3447 	spin_lock(&space_info->lock);
3448 	if (space_info->bytes_pinned >= bytes) {
3449 		spin_unlock(&space_info->lock);
3450 		goto commit;
3451 	}
3452 	spin_unlock(&space_info->lock);
3453 
3454 	/*
3455 	 * See if there is some space in the delayed insertion reservation for
3456 	 * this reservation.
3457 	 */
3458 	if (space_info != delayed_rsv->space_info)
3459 		return -ENOSPC;
3460 
3461 	spin_lock(&delayed_rsv->lock);
3462 	if (delayed_rsv->size < bytes) {
3463 		spin_unlock(&delayed_rsv->lock);
3464 		return -ENOSPC;
3465 	}
3466 	spin_unlock(&delayed_rsv->lock);
3467 
3468 commit:
3469 	trans = btrfs_join_transaction(root);
3470 	if (IS_ERR(trans))
3471 		return -ENOSPC;
3472 
3473 	return btrfs_commit_transaction(trans, root);
3474 }
3475 
3476 /**
3477  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3478  * @root - the root we're allocating for
3479  * @block_rsv - the block_rsv we're allocating for
3480  * @orig_bytes - the number of bytes we want
3481  * @flush - whether or not we can flush to make our reservation
3482  *
3483  * This will reserve orig_bytes number of bytes from the space info associated
3484  * with the block_rsv.  If there is not enough space it will make an attempt to
3485  * flush out space to make room.  It will do this by flushing delalloc if
3486  * possible or committing the transaction.  If flush is 0 then no attempts to
3487  * regain reservations will be made and this will fail if there is not enough
3488  * space already.
3489  */
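/*
 * Roughly: take the space_info lock, try to take the reservation outright
 * (allowing some overcommit based on unallocated chunk space), and if that
 * fails flush delalloc via shrink_delalloc() and retry a couple of times
 * before falling back to committing the transaction through
 * may_commit_transaction().
 */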
3490 static int reserve_metadata_bytes(struct btrfs_root *root,
3491 				  struct btrfs_block_rsv *block_rsv,
3492 				  u64 orig_bytes, int flush)
3493 {
3494 	struct btrfs_space_info *space_info = block_rsv->space_info;
3495 	u64 used;
3496 	u64 num_bytes = orig_bytes;
3497 	int retries = 0;
3498 	int ret = 0;
3499 	bool committed = false;
3500 	bool flushing = false;
3501 	bool wait_ordered = false;
3502 
3503 again:
3504 	ret = 0;
3505 	spin_lock(&space_info->lock);
3506 	/*
3507 	 * We only want to wait if somebody other than us is flushing and we are
3508 	 * actually allowed to flush.
3509 	 */
3510 	while (flush && !flushing && space_info->flush) {
3511 		spin_unlock(&space_info->lock);
3512 		/*
3513 		 * If we have a trans handle we can't wait because the flusher
3514 		 * may have to commit the transaction, which would mean we would
3515 		 * deadlock since we are waiting for the flusher to finish, but
3516 		 * hold the current transaction open.
3517 		 */
3518 		if (current->journal_info)
3519 			return -EAGAIN;
3520 		ret = wait_event_interruptible(space_info->wait,
3521 					       !space_info->flush);
3522 		/* Must have been interrupted, return */
3523 		if (ret)
3524 			return -EINTR;
3525 
3526 		spin_lock(&space_info->lock);
3527 	}
3528 
3529 	ret = -ENOSPC;
3530 	used = space_info->bytes_used + space_info->bytes_reserved +
3531 		space_info->bytes_pinned + space_info->bytes_readonly +
3532 		space_info->bytes_may_use;
3533 
3534 	/*
3535 	 * The idea here is that if we've not already over-reserved the block group
3536 	 * then we can go ahead and save our reservation first and then start
3537 	 * flushing if we need to.  Otherwise if we've already overcommitted
3538 	 * let's start flushing stuff first and then come back and try to make
3539 	 * our reservation.
3540 	 */
3541 	if (used <= space_info->total_bytes) {
3542 		if (used + orig_bytes <= space_info->total_bytes) {
3543 			space_info->bytes_may_use += orig_bytes;
3544 			ret = 0;
3545 		} else {
3546 			/*
3547 			 * Ok set num_bytes to orig_bytes since we aren't
3548 			 * overcommitted; this way we only try and reclaim what
3549 			 * we need.
3550 			 */
3551 			num_bytes = orig_bytes;
3552 		}
3553 	} else {
3554 		/*
3555 		 * Ok, we're overcommitted, so set num_bytes to the overcommitted
3556 		 * amount plus the amount of bytes that we need for this
3557 		 * reservation.
3558 		 */
3559 		wait_ordered = true;
3560 		num_bytes = used - space_info->total_bytes +
3561 			(orig_bytes * (retries + 1));
3562 	}
3563 
3564 	if (ret) {
3565 		u64 profile = btrfs_get_alloc_profile(root, 0);
3566 		u64 avail;
3567 
3568 		/*
3569 		 * If we have a lot of space that's pinned, don't bother doing
3570 		 * the overcommit dance yet and just commit the transaction.
3571 		 */
3572 		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3573 		do_div(avail, 10);
3574 		if (space_info->bytes_pinned >= avail && flush && !committed) {
3575 			space_info->flush = 1;
3576 			flushing = true;
3577 			spin_unlock(&space_info->lock);
3578 			ret = may_commit_transaction(root, space_info,
3579 						     orig_bytes, 1);
3580 			if (ret)
3581 				goto out;
3582 			committed = true;
3583 			goto again;
3584 		}
3585 
3586 		spin_lock(&root->fs_info->free_chunk_lock);
3587 		avail = root->fs_info->free_chunk_space;
3588 
3589 		/*
3590 		 * If we have dup, raid1 or raid10 then only half of the free
3591 		 * space is actually usable.
3592 		 */
3593 		if (profile & (BTRFS_BLOCK_GROUP_DUP |
3594 			       BTRFS_BLOCK_GROUP_RAID1 |
3595 			       BTRFS_BLOCK_GROUP_RAID10))
3596 			avail >>= 1;
3597 
3598 		/*
3599 		 * If we aren't flushing don't let us overcommit too much, say
3600 		 * 1/8th of the space.  If we can flush, let it overcommit up to
3601 		 * 1/2 of the space.
3602 		 */
3603 		if (flush)
3604 			avail >>= 3;
3605 		else
3606 			avail >>= 1;
3607 		spin_unlock(&root->fs_info->free_chunk_lock);
3608 
3609 		if (used + num_bytes < space_info->total_bytes + avail) {
3610 			space_info->bytes_may_use += orig_bytes;
3611 			ret = 0;
3612 		} else {
3613 			wait_ordered = true;
3614 		}
3615 	}
3616 
3617 	/*
3618 	 * Couldn't make our reservation, save our place so while we're trying
3619 	 * to reclaim space we can actually use it instead of somebody else
3620 	 * stealing it from us.
3621 	 */
3622 	if (ret && flush) {
3623 		flushing = true;
3624 		space_info->flush = 1;
3625 	}
3626 
3627 	spin_unlock(&space_info->lock);
3628 
3629 	if (!ret || !flush)
3630 		goto out;
3631 
3632 	/*
3633 	 * We do synchronous shrinking since we don't actually unreserve
3634 	 * metadata until after the IO is completed.
3635 	 */
3636 	ret = shrink_delalloc(root, num_bytes, wait_ordered);
3637 	if (ret < 0)
3638 		goto out;
3639 
3640 	ret = 0;
3641 
3642 	/*
3643 	 * So if we were overcommitted it's possible that somebody else flushed
3644 	 * out enough space and we simply didn't have enough space to reclaim,
3645 	 * so go back around and try again.
3646 	 */
3647 	if (retries < 2) {
3648 		wait_ordered = true;
3649 		retries++;
3650 		goto again;
3651 	}
3652 
3653 	ret = -ENOSPC;
3654 	if (committed)
3655 		goto out;
3656 
3657 	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3658 	if (!ret) {
3659 		committed = true;
3660 		goto again;
3661 	}
3662 
3663 out:
3664 	if (flushing) {
3665 		spin_lock(&space_info->lock);
3666 		space_info->flush = 0;
3667 		wake_up_all(&space_info->wait);
3668 		spin_unlock(&space_info->lock);
3669 	}
3670 	return ret;
3671 }
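
/*
 * Illustrative walk-through of the overcommit heuristic above (the numbers
 * are made up, not taken from any real filesystem):
 *
 *   free_chunk_space = 8GiB, profile has RAID1  ->  avail = 4GiB (halved)
 *   flush allowed                               ->  avail = 4GiB / 8 = 512MiB
 *
 * So with flushing allowed we let used + num_bytes exceed total_bytes by at
 * most 512MiB before falling back to shrinking delalloc or committing the
 * transaction.
 */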
3672 
3673 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3674 					     struct btrfs_root *root)
3675 {
3676 	struct btrfs_block_rsv *block_rsv = NULL;
3677 
3678 	if (root->ref_cows || root == root->fs_info->csum_root)
3679 		block_rsv = trans->block_rsv;
3680 
3681 	if (!block_rsv)
3682 		block_rsv = root->block_rsv;
3683 
3684 	if (!block_rsv)
3685 		block_rsv = &root->fs_info->empty_block_rsv;
3686 
3687 	return block_rsv;
3688 }
3689 
3690 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3691 			       u64 num_bytes)
3692 {
3693 	int ret = -ENOSPC;
3694 	spin_lock(&block_rsv->lock);
3695 	if (block_rsv->reserved >= num_bytes) {
3696 		block_rsv->reserved -= num_bytes;
3697 		if (block_rsv->reserved < block_rsv->size)
3698 			block_rsv->full = 0;
3699 		ret = 0;
3700 	}
3701 	spin_unlock(&block_rsv->lock);
3702 	return ret;
3703 }
3704 
3705 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3706 				u64 num_bytes, int update_size)
3707 {
3708 	spin_lock(&block_rsv->lock);
3709 	block_rsv->reserved += num_bytes;
3710 	if (update_size)
3711 		block_rsv->size += num_bytes;
3712 	else if (block_rsv->reserved >= block_rsv->size)
3713 		block_rsv->full = 1;
3714 	spin_unlock(&block_rsv->lock);
3715 }
3716 
3717 static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3718 				    struct btrfs_block_rsv *dest, u64 num_bytes)
3719 {
3720 	struct btrfs_space_info *space_info = block_rsv->space_info;
3721 
3722 	spin_lock(&block_rsv->lock);
3723 	if (num_bytes == (u64)-1)
3724 		num_bytes = block_rsv->size;
3725 	block_rsv->size -= num_bytes;
3726 	if (block_rsv->reserved >= block_rsv->size) {
3727 		num_bytes = block_rsv->reserved - block_rsv->size;
3728 		block_rsv->reserved = block_rsv->size;
3729 		block_rsv->full = 1;
3730 	} else {
3731 		num_bytes = 0;
3732 	}
3733 	spin_unlock(&block_rsv->lock);
3734 
3735 	if (num_bytes > 0) {
3736 		if (dest) {
3737 			spin_lock(&dest->lock);
3738 			if (!dest->full) {
3739 				u64 bytes_to_add;
3740 
3741 				bytes_to_add = dest->size - dest->reserved;
3742 				bytes_to_add = min(num_bytes, bytes_to_add);
3743 				dest->reserved += bytes_to_add;
3744 				if (dest->reserved >= dest->size)
3745 					dest->full = 1;
3746 				num_bytes -= bytes_to_add;
3747 			}
3748 			spin_unlock(&dest->lock);
3749 		}
3750 		if (num_bytes) {
3751 			spin_lock(&space_info->lock);
3752 			space_info->bytes_may_use -= num_bytes;
3753 			space_info->reservation_progress++;
3754 			spin_unlock(&space_info->lock);
3755 		}
3756 	}
3757 }
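
/*
 * Example of the release path above (illustrative numbers): suppose
 * rsv->size = 8MiB, rsv->reserved = 8MiB and we release 3MiB.  size drops to
 * 5MiB, leaving 3MiB of excess reserved bytes; if the destination rsv
 * (usually the global one) is 1MiB short of full, 1MiB tops it up and the
 * remaining 2MiB goes back to the space_info by dropping bytes_may_use.
 */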
3758 
3759 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3760 				   struct btrfs_block_rsv *dst, u64 num_bytes)
3761 {
3762 	int ret;
3763 
3764 	ret = block_rsv_use_bytes(src, num_bytes);
3765 	if (ret)
3766 		return ret;
3767 
3768 	block_rsv_add_bytes(dst, num_bytes, 1);
3769 	return 0;
3770 }
3771 
3772 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3773 {
3774 	memset(rsv, 0, sizeof(*rsv));
3775 	spin_lock_init(&rsv->lock);
3776 }
3777 
3778 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3779 {
3780 	struct btrfs_block_rsv *block_rsv;
3781 	struct btrfs_fs_info *fs_info = root->fs_info;
3782 
3783 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3784 	if (!block_rsv)
3785 		return NULL;
3786 
3787 	btrfs_init_block_rsv(block_rsv);
3788 	block_rsv->space_info = __find_space_info(fs_info,
3789 						  BTRFS_BLOCK_GROUP_METADATA);
3790 	return block_rsv;
3791 }
3792 
3793 void btrfs_free_block_rsv(struct btrfs_root *root,
3794 			  struct btrfs_block_rsv *rsv)
3795 {
3796 	btrfs_block_rsv_release(root, rsv, (u64)-1);
3797 	kfree(rsv);
3798 }
3799 
3800 int btrfs_block_rsv_add(struct btrfs_root *root,
3801 			struct btrfs_block_rsv *block_rsv,
3802 			u64 num_bytes)
3803 {
3804 	int ret;
3805 
3806 	if (num_bytes == 0)
3807 		return 0;
3808 
3809 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3810 	if (!ret) {
3811 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
3812 		return 0;
3813 	}
3814 
3815 	return ret;
3816 }
3817 
3818 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3819 				struct btrfs_block_rsv *block_rsv,
3820 				u64 num_bytes)
3821 {
3822 	int ret;
3823 
3824 	if (num_bytes == 0)
3825 		return 0;
3826 
3827 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
3828 	if (!ret) {
3829 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
3830 		return 0;
3831 	}
3832 
3833 	return ret;
3834 }
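
/*
 * Illustrative sketch (guarded out, not part of the real code): the typical
 * lifecycle of a block reservation using the helpers above.  The function
 * name and the item count are hypothetical.
 */
#if 0
static int example_reserve_two_items(struct btrfs_root *root)
{
	struct btrfs_block_rsv *rsv;
	u64 num_bytes;
	int ret;

	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;

	/* enough space to COW the tree for two items */
	num_bytes = btrfs_calc_trans_metadata_size(root, 2);
	ret = btrfs_block_rsv_add(root, rsv, num_bytes);
	if (ret)
		goto out;

	/* ... do the work that consumes the reservation ... */

out:
	/* give back whatever is left and free the rsv */
	btrfs_free_block_rsv(root, rsv);
	return ret;
}
#endif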
3835 
3836 int btrfs_block_rsv_check(struct btrfs_root *root,
3837 			  struct btrfs_block_rsv *block_rsv, int min_factor)
3838 {
3839 	u64 num_bytes = 0;
3840 	int ret = -ENOSPC;
3841 
3842 	if (!block_rsv)
3843 		return 0;
3844 
3845 	spin_lock(&block_rsv->lock);
3846 	num_bytes = div_factor(block_rsv->size, min_factor);
3847 	if (block_rsv->reserved >= num_bytes)
3848 		ret = 0;
3849 	spin_unlock(&block_rsv->lock);
3850 
3851 	return ret;
3852 }
3853 
3854 int btrfs_block_rsv_refill(struct btrfs_root *root,
3855 			  struct btrfs_block_rsv *block_rsv,
3856 			  u64 min_reserved)
3857 {
3858 	u64 num_bytes = 0;
3859 	int ret = -ENOSPC;
3860 
3861 	if (!block_rsv)
3862 		return 0;
3863 
3864 	spin_lock(&block_rsv->lock);
3865 	num_bytes = min_reserved;
3866 	if (block_rsv->reserved >= num_bytes)
3867 		ret = 0;
3868 	else
3869 		num_bytes -= block_rsv->reserved;
3870 	spin_unlock(&block_rsv->lock);
3871 
3872 	if (!ret)
3873 		return 0;
3874 
3875 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3876 	if (!ret) {
3877 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
3878 		return 0;
3879 	}
3880 
3881 	return ret;
3882 }
3883 
3884 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3885 			    struct btrfs_block_rsv *dst_rsv,
3886 			    u64 num_bytes)
3887 {
3888 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3889 }
3890 
3891 void btrfs_block_rsv_release(struct btrfs_root *root,
3892 			     struct btrfs_block_rsv *block_rsv,
3893 			     u64 num_bytes)
3894 {
3895 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3896 	if (global_rsv->full || global_rsv == block_rsv ||
3897 	    block_rsv->space_info != global_rsv->space_info)
3898 		global_rsv = NULL;
3899 	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3900 }
3901 
3902 /*
3903  * helper to calculate size of global block reservation.
3904  * the desired value is sum of space used by extent tree,
3905  * checksum tree and root tree
3906  */
3907 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3908 {
3909 	struct btrfs_space_info *sinfo;
3910 	u64 num_bytes;
3911 	u64 meta_used;
3912 	u64 data_used;
3913 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3914 
3915 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3916 	spin_lock(&sinfo->lock);
3917 	data_used = sinfo->bytes_used;
3918 	spin_unlock(&sinfo->lock);
3919 
3920 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3921 	spin_lock(&sinfo->lock);
3922 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3923 		data_used = 0;
3924 	meta_used = sinfo->bytes_used;
3925 	spin_unlock(&sinfo->lock);
3926 
3927 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3928 		    csum_size * 2;
3929 	num_bytes += div64_u64(data_used + meta_used, 50);
3930 
3931 	if (num_bytes * 3 > meta_used)
3932 		num_bytes = div64_u64(meta_used, 3);
3933 
3934 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3935 }
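
/*
 * Rough worked example for the sizing above (illustrative numbers only,
 * assuming 4KiB blocks and 4-byte crc32c checksums):
 *
 *   data_used = 100GiB, meta_used = 4GiB
 *   csum part : (100GiB >> 12) * 4 * 2   ~= 200MiB
 *   1/50 part : (100GiB + 4GiB) / 50     ~= 2.1GiB
 *   total     : ~2.3GiB, but 3 * 2.3GiB > meta_used, so the result is capped
 *               at meta_used / 3 ~= 1.3GiB and then rounded up to a multiple
 *               of leafsize << 10.
 */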
3936 
3937 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3938 {
3939 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3940 	struct btrfs_space_info *sinfo = block_rsv->space_info;
3941 	u64 num_bytes;
3942 
3943 	num_bytes = calc_global_metadata_size(fs_info);
3944 
3945 	spin_lock(&block_rsv->lock);
3946 	spin_lock(&sinfo->lock);
3947 
3948 	block_rsv->size = num_bytes;
3949 
3950 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3951 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
3952 		    sinfo->bytes_may_use;
3953 
3954 	if (sinfo->total_bytes > num_bytes) {
3955 		num_bytes = sinfo->total_bytes - num_bytes;
3956 		block_rsv->reserved += num_bytes;
3957 		sinfo->bytes_may_use += num_bytes;
3958 	}
3959 
3960 	if (block_rsv->reserved >= block_rsv->size) {
3961 		num_bytes = block_rsv->reserved - block_rsv->size;
3962 		sinfo->bytes_may_use -= num_bytes;
3963 		sinfo->reservation_progress++;
3964 		block_rsv->reserved = block_rsv->size;
3965 		block_rsv->full = 1;
3966 	}
3967 
3968 	spin_unlock(&sinfo->lock);
3969 	spin_unlock(&block_rsv->lock);
3970 }
3971 
3972 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3973 {
3974 	struct btrfs_space_info *space_info;
3975 
3976 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3977 	fs_info->chunk_block_rsv.space_info = space_info;
3978 
3979 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3980 	fs_info->global_block_rsv.space_info = space_info;
3981 	fs_info->delalloc_block_rsv.space_info = space_info;
3982 	fs_info->trans_block_rsv.space_info = space_info;
3983 	fs_info->empty_block_rsv.space_info = space_info;
3984 	fs_info->delayed_block_rsv.space_info = space_info;
3985 
3986 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3987 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3988 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3989 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3990 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3991 
3992 	update_global_block_rsv(fs_info);
3993 }
3994 
3995 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3996 {
3997 	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3998 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3999 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4000 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4001 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4002 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4003 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4004 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4005 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4006 }
4007 
4008 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4009 				  struct btrfs_root *root)
4010 {
4011 	if (!trans->bytes_reserved)
4012 		return;
4013 
4014 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4015 	trans->bytes_reserved = 0;
4016 }
4017 
4018 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4019 				  struct inode *inode)
4020 {
4021 	struct btrfs_root *root = BTRFS_I(inode)->root;
4022 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4023 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4024 
4025 	/*
4026 	 * We need to hold space in order to delete our orphan item once we've
4027 	 * added it, so this takes the reservation now and releases it later,
4028 	 * when we are truly done with the orphan item.
4029 	 */
4030 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4031 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4032 }
4033 
4034 void btrfs_orphan_release_metadata(struct inode *inode)
4035 {
4036 	struct btrfs_root *root = BTRFS_I(inode)->root;
4037 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4038 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4039 }
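
/*
 * Illustrative sketch (guarded out, not part of the real code): the two
 * orphan helpers above are meant to be paired around the lifetime of an
 * orphan item.  The caller shown here is hypothetical.
 */
#if 0
static int example_orphan_flow(struct btrfs_trans_handle *trans,
			       struct inode *inode)
{
	int ret;

	/* hold space now so we can delete the orphan item later */
	ret = btrfs_orphan_reserve_metadata(trans, inode);
	if (ret)
		return ret;

	/* ... add the orphan item and do the truncate/unlink work ... */

	/* the orphan item is gone, release the held space */
	btrfs_orphan_release_metadata(inode);
	return 0;
}
#endif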
4040 
4041 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4042 				struct btrfs_pending_snapshot *pending)
4043 {
4044 	struct btrfs_root *root = pending->root;
4045 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4046 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4047 	/*
4048 	 * two for root back/forward refs, two for directory entries
4049 	 * and one for root of the snapshot.
4050 	 */
4051 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4052 	dst_rsv->space_info = src_rsv->space_info;
4053 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4054 }
4055 
4056 /**
4057  * drop_outstanding_extent - drop an outstanding extent
4058  * @inode: the inode we're dropping the extent for
4059  *
4060  * This is called when we are freeing up an outstanding extent, either called
4061  * after an error or after an extent is written.  This will return the number of
4062  * reserved extents that need to be freed.  This must be called with
4063  * BTRFS_I(inode)->lock held.
4064  */
4065 static unsigned drop_outstanding_extent(struct inode *inode)
4066 {
4067 	unsigned dropped_extents = 0;
4068 
4069 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4070 	BTRFS_I(inode)->outstanding_extents--;
4071 
4072 	/*
4073 	 * If we have at least as many outstanding extents as we have reserved
4074 	 * then we need to leave the reserved extents count alone.
4075 	 */
4076 	if (BTRFS_I(inode)->outstanding_extents >=
4077 	    BTRFS_I(inode)->reserved_extents)
4078 		return 0;
4079 
4080 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4081 		BTRFS_I(inode)->outstanding_extents;
4082 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4083 	return dropped_extents;
4084 }
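
/*
 * Example of the accounting above (illustrative): if an inode had
 * reserved_extents = 5 and outstanding_extents drops from 4 to 3, then
 * 5 - 3 = 2 reserved extents are no longer needed; reserved_extents becomes
 * 3 and the function returns 2 so the caller can release that much metadata.
 */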
4085 
4086 /**
4087  * calc_csum_metadata_size - return the amount of metadata space that must be
4088  *	reserved/freed for the given bytes.
4089  * @inode: the inode we're manipulating
4090  * @num_bytes: the number of bytes in question
4091  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4092  *
4093  * This adjusts the number of csum_bytes in the inode and then returns the
4094  * correct amount of metadata that must either be reserved or freed.  We
4095  * calculate how many checksums we can fit into one leaf and then divide the
4096  * number of bytes that will need to be checksummed by this value to figure out
4097  * how many checksums will be required.  If we are adding bytes then the number
4098  * may go up and we will return the number of additional bytes that must be
4099  * reserved.  If it is going down we will return the number of bytes that must
4100  * be freed.
4101  *
4102  * This must be called with BTRFS_I(inode)->lock held.
4103  */
4104 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4105 				   int reserve)
4106 {
4107 	struct btrfs_root *root = BTRFS_I(inode)->root;
4108 	u64 csum_size;
4109 	int num_csums_per_leaf;
4110 	int num_csums;
4111 	int old_csums;
4112 
4113 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4114 	    BTRFS_I(inode)->csum_bytes == 0)
4115 		return 0;
4116 
4117 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4118 	if (reserve)
4119 		BTRFS_I(inode)->csum_bytes += num_bytes;
4120 	else
4121 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4122 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4123 	num_csums_per_leaf = (int)div64_u64(csum_size,
4124 					    sizeof(struct btrfs_csum_item) +
4125 					    sizeof(struct btrfs_disk_key));
4126 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4127 	num_csums = num_csums + num_csums_per_leaf - 1;
4128 	num_csums = num_csums / num_csums_per_leaf;
4129 
4130 	old_csums = old_csums + num_csums_per_leaf - 1;
4131 	old_csums = old_csums / num_csums_per_leaf;
4132 
4133 	/* No change, no need to reserve more */
4134 	if (old_csums == num_csums)
4135 		return 0;
4136 
4137 	if (reserve)
4138 		return btrfs_calc_trans_metadata_size(root,
4139 						      num_csums - old_csums);
4140 
4141 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4142 }
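
/*
 * Worked example for the checksum accounting above (illustrative numbers;
 * the real csums-per-leaf value depends on the leaf and item sizes): with
 * 4KiB sectors and, say, ~100 csum entries fitting in a leaf, growing
 * csum_bytes from 0 to 1MiB means 256 checksums, i.e. ceil(256 / 100) = 3
 * leaves versus 0 before, so we reserve
 * btrfs_calc_trans_metadata_size(root, 3).
 */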
4143 
4144 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4145 {
4146 	struct btrfs_root *root = BTRFS_I(inode)->root;
4147 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4148 	u64 to_reserve = 0;
4149 	unsigned nr_extents = 0;
4150 	int flush = 1;
4151 	int ret;
4152 
4153 	if (btrfs_is_free_space_inode(root, inode))
4154 		flush = 0;
4155 
4156 	if (flush && btrfs_transaction_in_commit(root->fs_info))
4157 		schedule_timeout(1);
4158 
4159 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4160 
4161 	spin_lock(&BTRFS_I(inode)->lock);
4162 	BTRFS_I(inode)->outstanding_extents++;
4163 
4164 	if (BTRFS_I(inode)->outstanding_extents >
4165 	    BTRFS_I(inode)->reserved_extents) {
4166 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4167 			BTRFS_I(inode)->reserved_extents;
4168 		BTRFS_I(inode)->reserved_extents += nr_extents;
4169 
4170 		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4171 	}
4172 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4173 	spin_unlock(&BTRFS_I(inode)->lock);
4174 
4175 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4176 	if (ret) {
4177 		u64 to_free = 0;
4178 		unsigned dropped;
4179 
4180 		spin_lock(&BTRFS_I(inode)->lock);
4181 		dropped = drop_outstanding_extent(inode);
4182 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4183 		spin_unlock(&BTRFS_I(inode)->lock);
4184 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4185 
4186 		/*
4187 		 * Somebody could have come in and twiddled with the
4188 		 * reservation, so if we have to free more than we would have
4189 		 * reserved from this reservation go ahead and release those
4190 		 * bytes.
4191 		 */
4192 		to_free -= to_reserve;
4193 		if (to_free)
4194 			btrfs_block_rsv_release(root, block_rsv, to_free);
4195 		return ret;
4196 	}
4197 
4198 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4199 
4200 	return 0;
4201 }
4202 
4203 /**
4204  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4205  * @inode: the inode to release the reservation for
4206  * @num_bytes: the number of bytes we're releasing
4207  *
4208  * This will release the metadata reservation for an inode.  This can be called
4209  * once we complete IO for a given set of bytes to release their metadata
4210  * reservations.
4211  */
4212 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4213 {
4214 	struct btrfs_root *root = BTRFS_I(inode)->root;
4215 	u64 to_free = 0;
4216 	unsigned dropped;
4217 
4218 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4219 	spin_lock(&BTRFS_I(inode)->lock);
4220 	dropped = drop_outstanding_extent(inode);
4221 
4222 	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4223 	spin_unlock(&BTRFS_I(inode)->lock);
4224 	if (dropped > 0)
4225 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4226 
4227 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4228 				to_free);
4229 }
4230 
4231 /**
4232  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4233  * @inode: inode we're writing to
4234  * @num_bytes: the number of bytes we want to allocate
4235  *
4236  * This will do the following things
4237  *
4238  * o reserve space in the data space info for num_bytes
4239  * o reserve space in the metadata space info based on number of outstanding
4240  *   extents and how much csums will be needed
4241  * o add to the inodes ->delalloc_bytes
4242  * o add it to the fs_info's delalloc inodes list.
4243  *
4244  * This will return 0 for success and -ENOSPC if there is no space left.
4245  */
4246 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4247 {
4248 	int ret;
4249 
4250 	ret = btrfs_check_data_free_space(inode, num_bytes);
4251 	if (ret)
4252 		return ret;
4253 
4254 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4255 	if (ret) {
4256 		btrfs_free_reserved_data_space(inode, num_bytes);
4257 		return ret;
4258 	}
4259 
4260 	return 0;
4261 }
4262 
4263 /**
4264  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4265  * @inode: inode we're releasing space for
4266  * @num_bytes: the number of bytes we want to free up
4267  *
4268  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4269  * called in the case that we don't need the metadata AND data reservations
4270  * anymore, for example if there is an error or we insert an inline extent.
4271  *
4272  * This function will release the metadata space that was not used and will
4273  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4274  * list if there are no delalloc bytes left.
4275  */
4276 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4277 {
4278 	btrfs_delalloc_release_metadata(inode, num_bytes);
4279 	btrfs_free_reserved_data_space(inode, num_bytes);
4280 }
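
/*
 * Illustrative sketch (guarded out, not part of the real code): how a
 * buffered-write style caller pairs the delalloc helpers above.  The
 * function name is hypothetical; only the reserve/release pairing matters.
 */
#if 0
static int example_delalloc_write(struct inode *inode, u64 len)
{
	int ret;

	/* reserve the data space and the matching metadata up front */
	ret = btrfs_delalloc_reserve_space(inode, len);
	if (ret)
		return ret;

	/* ... dirty the pages and set up the delalloc extent state ... */
	ret = 0;	/* stands in for the real page work */
	if (ret) {
		/* nothing was written, undo both reservations */
		btrfs_delalloc_release_space(inode, len);
		return ret;
	}

	/*
	 * On success the data reservation is consumed by the write and the
	 * metadata half is released later, via
	 * btrfs_delalloc_release_metadata(), once the IO completes.
	 */
	return 0;
}
#endif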
4281 
4282 static int update_block_group(struct btrfs_trans_handle *trans,
4283 			      struct btrfs_root *root,
4284 			      u64 bytenr, u64 num_bytes, int alloc)
4285 {
4286 	struct btrfs_block_group_cache *cache = NULL;
4287 	struct btrfs_fs_info *info = root->fs_info;
4288 	u64 total = num_bytes;
4289 	u64 old_val;
4290 	u64 byte_in_group;
4291 	int factor;
4292 
4293 	/* block accounting for super block */
4294 	spin_lock(&info->delalloc_lock);
4295 	old_val = btrfs_super_bytes_used(info->super_copy);
4296 	if (alloc)
4297 		old_val += num_bytes;
4298 	else
4299 		old_val -= num_bytes;
4300 	btrfs_set_super_bytes_used(info->super_copy, old_val);
4301 	spin_unlock(&info->delalloc_lock);
4302 
4303 	while (total) {
4304 		cache = btrfs_lookup_block_group(info, bytenr);
4305 		if (!cache)
4306 			return -1;
4307 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4308 				    BTRFS_BLOCK_GROUP_RAID1 |
4309 				    BTRFS_BLOCK_GROUP_RAID10))
4310 			factor = 2;
4311 		else
4312 			factor = 1;
4313 		/*
4314 		 * If this block group has free space cache written out, we
4315 		 * need to make sure to load it if we are removing space.  This
4316 		 * is because we need the unpinning stage to actually add the
4317 		 * space back to the block group, otherwise we will leak space.
4318 		 */
4319 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
4320 			cache_block_group(cache, trans, NULL, 1);
4321 
4322 		byte_in_group = bytenr - cache->key.objectid;
4323 		WARN_ON(byte_in_group > cache->key.offset);
4324 
4325 		spin_lock(&cache->space_info->lock);
4326 		spin_lock(&cache->lock);
4327 
4328 		if (btrfs_test_opt(root, SPACE_CACHE) &&
4329 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
4330 			cache->disk_cache_state = BTRFS_DC_CLEAR;
4331 
4332 		cache->dirty = 1;
4333 		old_val = btrfs_block_group_used(&cache->item);
4334 		num_bytes = min(total, cache->key.offset - byte_in_group);
4335 		if (alloc) {
4336 			old_val += num_bytes;
4337 			btrfs_set_block_group_used(&cache->item, old_val);
4338 			cache->reserved -= num_bytes;
4339 			cache->space_info->bytes_reserved -= num_bytes;
4340 			cache->space_info->bytes_used += num_bytes;
4341 			cache->space_info->disk_used += num_bytes * factor;
4342 			spin_unlock(&cache->lock);
4343 			spin_unlock(&cache->space_info->lock);
4344 		} else {
4345 			old_val -= num_bytes;
4346 			btrfs_set_block_group_used(&cache->item, old_val);
4347 			cache->pinned += num_bytes;
4348 			cache->space_info->bytes_pinned += num_bytes;
4349 			cache->space_info->bytes_used -= num_bytes;
4350 			cache->space_info->disk_used -= num_bytes * factor;
4351 			spin_unlock(&cache->lock);
4352 			spin_unlock(&cache->space_info->lock);
4353 
4354 			set_extent_dirty(info->pinned_extents,
4355 					 bytenr, bytenr + num_bytes - 1,
4356 					 GFP_NOFS | __GFP_NOFAIL);
4357 		}
4358 		btrfs_put_block_group(cache);
4359 		total -= num_bytes;
4360 		bytenr += num_bytes;
4361 	}
4362 	return 0;
4363 }
4364 
4365 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4366 {
4367 	struct btrfs_block_group_cache *cache;
4368 	u64 bytenr;
4369 
4370 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4371 	if (!cache)
4372 		return 0;
4373 
4374 	bytenr = cache->key.objectid;
4375 	btrfs_put_block_group(cache);
4376 
4377 	return bytenr;
4378 }
4379 
4380 static int pin_down_extent(struct btrfs_root *root,
4381 			   struct btrfs_block_group_cache *cache,
4382 			   u64 bytenr, u64 num_bytes, int reserved)
4383 {
4384 	spin_lock(&cache->space_info->lock);
4385 	spin_lock(&cache->lock);
4386 	cache->pinned += num_bytes;
4387 	cache->space_info->bytes_pinned += num_bytes;
4388 	if (reserved) {
4389 		cache->reserved -= num_bytes;
4390 		cache->space_info->bytes_reserved -= num_bytes;
4391 	}
4392 	spin_unlock(&cache->lock);
4393 	spin_unlock(&cache->space_info->lock);
4394 
4395 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4396 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4397 	return 0;
4398 }
4399 
4400 /*
4401  * this function must be called within transaction
4402  */
4403 int btrfs_pin_extent(struct btrfs_root *root,
4404 		     u64 bytenr, u64 num_bytes, int reserved)
4405 {
4406 	struct btrfs_block_group_cache *cache;
4407 
4408 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4409 	BUG_ON(!cache);
4410 
4411 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4412 
4413 	btrfs_put_block_group(cache);
4414 	return 0;
4415 }
4416 
4417 /*
4418  * this function must be called within transaction
4419  */
4420 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4421 				    struct btrfs_root *root,
4422 				    u64 bytenr, u64 num_bytes)
4423 {
4424 	struct btrfs_block_group_cache *cache;
4425 
4426 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4427 	BUG_ON(!cache);
4428 
4429 	/*
4430 	 * pull in the free space cache (if any) so that our pin
4431 	 * removes the free space from the cache.  We have load_only set
4432 	 * to one because the slow code to read in the free extents does check
4433 	 * the pinned extents.
4434 	 */
4435 	cache_block_group(cache, trans, root, 1);
4436 
4437 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
4438 
4439 	/* remove us from the free space cache (if we're there at all) */
4440 	btrfs_remove_free_space(cache, bytenr, num_bytes);
4441 	btrfs_put_block_group(cache);
4442 	return 0;
4443 }
4444 
4445 /**
4446  * btrfs_update_reserved_bytes - update the block_group and space info counters
4447  * @cache:	The cache we are manipulating
4448  * @num_bytes:	The number of bytes in question
4449  * @reserve:	One of the reservation enums
4450  *
4451  * This is called by the allocator when it reserves space, or by somebody who is
4452  * freeing space that was never actually used on disk.  For example if you
4453  * reserve some space for a new leaf in transaction A and before transaction A
4454  * commits you free that leaf, you call this with reserve set to 0 in order to
4455  * clear the reservation.
4456  *
4457  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4458  * ENOSPC accounting.  For data we handle the reservation through clearing the
4459  * delalloc bits in the io_tree.  We have to do this since we could end up
4460  * allocating less disk space for the amount of data we have reserved in the
4461  * case of compression.
4462  *
4463  * If this is a reservation and the block group has become read only we cannot
4464  * make the reservation and return -EAGAIN, otherwise this function always
4465  * succeeds.
4466  */
4467 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4468 				       u64 num_bytes, int reserve)
4469 {
4470 	struct btrfs_space_info *space_info = cache->space_info;
4471 	int ret = 0;
4472 	spin_lock(&space_info->lock);
4473 	spin_lock(&cache->lock);
4474 	if (reserve != RESERVE_FREE) {
4475 		if (cache->ro) {
4476 			ret = -EAGAIN;
4477 		} else {
4478 			cache->reserved += num_bytes;
4479 			space_info->bytes_reserved += num_bytes;
4480 			if (reserve == RESERVE_ALLOC) {
4481 				BUG_ON(space_info->bytes_may_use < num_bytes);
4482 				space_info->bytes_may_use -= num_bytes;
4483 			}
4484 		}
4485 	} else {
4486 		if (cache->ro)
4487 			space_info->bytes_readonly += num_bytes;
4488 		cache->reserved -= num_bytes;
4489 		space_info->bytes_reserved -= num_bytes;
4490 		space_info->reservation_progress++;
4491 	}
4492 	spin_unlock(&cache->lock);
4493 	spin_unlock(&space_info->lock);
4494 	return ret;
4495 }
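
/*
 * Example of the reservation enums in action (illustrative): delalloc
 * reserves 1MiB of metadata, which sits in space_info->bytes_may_use.  When
 * the allocator later picks a block group for it, RESERVE_ALLOC moves that
 * 1MiB from bytes_may_use into bytes_reserved; if the allocation is
 * abandoned before the space is ever used, RESERVE_FREE drops it from
 * bytes_reserved again.
 */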
4496 
4497 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4498 				struct btrfs_root *root)
4499 {
4500 	struct btrfs_fs_info *fs_info = root->fs_info;
4501 	struct btrfs_caching_control *next;
4502 	struct btrfs_caching_control *caching_ctl;
4503 	struct btrfs_block_group_cache *cache;
4504 
4505 	down_write(&fs_info->extent_commit_sem);
4506 
4507 	list_for_each_entry_safe(caching_ctl, next,
4508 				 &fs_info->caching_block_groups, list) {
4509 		cache = caching_ctl->block_group;
4510 		if (block_group_cache_done(cache)) {
4511 			cache->last_byte_to_unpin = (u64)-1;
4512 			list_del_init(&caching_ctl->list);
4513 			put_caching_control(caching_ctl);
4514 		} else {
4515 			cache->last_byte_to_unpin = caching_ctl->progress;
4516 		}
4517 	}
4518 
4519 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4520 		fs_info->pinned_extents = &fs_info->freed_extents[1];
4521 	else
4522 		fs_info->pinned_extents = &fs_info->freed_extents[0];
4523 
4524 	up_write(&fs_info->extent_commit_sem);
4525 
4526 	update_global_block_rsv(fs_info);
4527 	return 0;
4528 }
4529 
4530 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4531 {
4532 	struct btrfs_fs_info *fs_info = root->fs_info;
4533 	struct btrfs_block_group_cache *cache = NULL;
4534 	u64 len;
4535 
4536 	while (start <= end) {
4537 		if (!cache ||
4538 		    start >= cache->key.objectid + cache->key.offset) {
4539 			if (cache)
4540 				btrfs_put_block_group(cache);
4541 			cache = btrfs_lookup_block_group(fs_info, start);
4542 			BUG_ON(!cache);
4543 		}
4544 
4545 		len = cache->key.objectid + cache->key.offset - start;
4546 		len = min(len, end + 1 - start);
4547 
4548 		if (start < cache->last_byte_to_unpin) {
4549 			len = min(len, cache->last_byte_to_unpin - start);
4550 			btrfs_add_free_space(cache, start, len);
4551 		}
4552 
4553 		start += len;
4554 
4555 		spin_lock(&cache->space_info->lock);
4556 		spin_lock(&cache->lock);
4557 		cache->pinned -= len;
4558 		cache->space_info->bytes_pinned -= len;
4559 		if (cache->ro)
4560 			cache->space_info->bytes_readonly += len;
4561 		spin_unlock(&cache->lock);
4562 		spin_unlock(&cache->space_info->lock);
4563 	}
4564 
4565 	if (cache)
4566 		btrfs_put_block_group(cache);
4567 	return 0;
4568 }
4569 
4570 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4571 			       struct btrfs_root *root)
4572 {
4573 	struct btrfs_fs_info *fs_info = root->fs_info;
4574 	struct extent_io_tree *unpin;
4575 	u64 start;
4576 	u64 end;
4577 	int ret;
4578 
4579 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4580 		unpin = &fs_info->freed_extents[1];
4581 	else
4582 		unpin = &fs_info->freed_extents[0];
4583 
4584 	while (1) {
4585 		ret = find_first_extent_bit(unpin, 0, &start, &end,
4586 					    EXTENT_DIRTY);
4587 		if (ret)
4588 			break;
4589 
4590 		if (btrfs_test_opt(root, DISCARD))
4591 			ret = btrfs_discard_extent(root, start,
4592 						   end + 1 - start, NULL);
4593 
4594 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
4595 		unpin_extent_range(root, start, end);
4596 		cond_resched();
4597 	}
4598 
4599 	return 0;
4600 }
4601 
4602 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4603 				struct btrfs_root *root,
4604 				u64 bytenr, u64 num_bytes, u64 parent,
4605 				u64 root_objectid, u64 owner_objectid,
4606 				u64 owner_offset, int refs_to_drop,
4607 				struct btrfs_delayed_extent_op *extent_op)
4608 {
4609 	struct btrfs_key key;
4610 	struct btrfs_path *path;
4611 	struct btrfs_fs_info *info = root->fs_info;
4612 	struct btrfs_root *extent_root = info->extent_root;
4613 	struct extent_buffer *leaf;
4614 	struct btrfs_extent_item *ei;
4615 	struct btrfs_extent_inline_ref *iref;
4616 	int ret;
4617 	int is_data;
4618 	int extent_slot = 0;
4619 	int found_extent = 0;
4620 	int num_to_del = 1;
4621 	u32 item_size;
4622 	u64 refs;
4623 
4624 	path = btrfs_alloc_path();
4625 	if (!path)
4626 		return -ENOMEM;
4627 
4628 	path->reada = 1;
4629 	path->leave_spinning = 1;
4630 
4631 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4632 	BUG_ON(!is_data && refs_to_drop != 1);
4633 
4634 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
4635 				    bytenr, num_bytes, parent,
4636 				    root_objectid, owner_objectid,
4637 				    owner_offset);
4638 	if (ret == 0) {
4639 		extent_slot = path->slots[0];
4640 		while (extent_slot >= 0) {
4641 			btrfs_item_key_to_cpu(path->nodes[0], &key,
4642 					      extent_slot);
4643 			if (key.objectid != bytenr)
4644 				break;
4645 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4646 			    key.offset == num_bytes) {
4647 				found_extent = 1;
4648 				break;
4649 			}
4650 			if (path->slots[0] - extent_slot > 5)
4651 				break;
4652 			extent_slot--;
4653 		}
4654 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4655 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4656 		if (found_extent && item_size < sizeof(*ei))
4657 			found_extent = 0;
4658 #endif
4659 		if (!found_extent) {
4660 			BUG_ON(iref);
4661 			ret = remove_extent_backref(trans, extent_root, path,
4662 						    NULL, refs_to_drop,
4663 						    is_data);
4664 			BUG_ON(ret);
4665 			btrfs_release_path(path);
4666 			path->leave_spinning = 1;
4667 
4668 			key.objectid = bytenr;
4669 			key.type = BTRFS_EXTENT_ITEM_KEY;
4670 			key.offset = num_bytes;
4671 
4672 			ret = btrfs_search_slot(trans, extent_root,
4673 						&key, path, -1, 1);
4674 			if (ret) {
4675 				printk(KERN_ERR "umm, got %d back from search"
4676 				       ", was looking for %llu\n", ret,
4677 				       (unsigned long long)bytenr);
4678 				if (ret > 0)
4679 					btrfs_print_leaf(extent_root,
4680 							 path->nodes[0]);
4681 			}
4682 			BUG_ON(ret);
4683 			extent_slot = path->slots[0];
4684 		}
4685 	} else {
4686 		btrfs_print_leaf(extent_root, path->nodes[0]);
4687 		WARN_ON(1);
4688 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4689 		       "parent %llu root %llu  owner %llu offset %llu\n",
4690 		       (unsigned long long)bytenr,
4691 		       (unsigned long long)parent,
4692 		       (unsigned long long)root_objectid,
4693 		       (unsigned long long)owner_objectid,
4694 		       (unsigned long long)owner_offset);
4695 	}
4696 
4697 	leaf = path->nodes[0];
4698 	item_size = btrfs_item_size_nr(leaf, extent_slot);
4699 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4700 	if (item_size < sizeof(*ei)) {
4701 		BUG_ON(found_extent || extent_slot != path->slots[0]);
4702 		ret = convert_extent_item_v0(trans, extent_root, path,
4703 					     owner_objectid, 0);
4704 		BUG_ON(ret < 0);
4705 
4706 		btrfs_release_path(path);
4707 		path->leave_spinning = 1;
4708 
4709 		key.objectid = bytenr;
4710 		key.type = BTRFS_EXTENT_ITEM_KEY;
4711 		key.offset = num_bytes;
4712 
4713 		ret = btrfs_search_slot(trans, extent_root, &key, path,
4714 					-1, 1);
4715 		if (ret) {
4716 			printk(KERN_ERR "umm, got %d back from search"
4717 			       ", was looking for %llu\n", ret,
4718 			       (unsigned long long)bytenr);
4719 			btrfs_print_leaf(extent_root, path->nodes[0]);
4720 		}
4721 		BUG_ON(ret);
4722 		extent_slot = path->slots[0];
4723 		leaf = path->nodes[0];
4724 		item_size = btrfs_item_size_nr(leaf, extent_slot);
4725 	}
4726 #endif
4727 	BUG_ON(item_size < sizeof(*ei));
4728 	ei = btrfs_item_ptr(leaf, extent_slot,
4729 			    struct btrfs_extent_item);
4730 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4731 		struct btrfs_tree_block_info *bi;
4732 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4733 		bi = (struct btrfs_tree_block_info *)(ei + 1);
4734 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4735 	}
4736 
4737 	refs = btrfs_extent_refs(leaf, ei);
4738 	BUG_ON(refs < refs_to_drop);
4739 	refs -= refs_to_drop;
4740 
4741 	if (refs > 0) {
4742 		if (extent_op)
4743 			__run_delayed_extent_op(extent_op, leaf, ei);
4744 		/*
4745 		 * In the case of inline back ref, reference count will
4746 		 * be updated by remove_extent_backref
4747 		 */
4748 		if (iref) {
4749 			BUG_ON(!found_extent);
4750 		} else {
4751 			btrfs_set_extent_refs(leaf, ei, refs);
4752 			btrfs_mark_buffer_dirty(leaf);
4753 		}
4754 		if (found_extent) {
4755 			ret = remove_extent_backref(trans, extent_root, path,
4756 						    iref, refs_to_drop,
4757 						    is_data);
4758 			BUG_ON(ret);
4759 		}
4760 	} else {
4761 		if (found_extent) {
4762 			BUG_ON(is_data && refs_to_drop !=
4763 			       extent_data_ref_count(root, path, iref));
4764 			if (iref) {
4765 				BUG_ON(path->slots[0] != extent_slot);
4766 			} else {
4767 				BUG_ON(path->slots[0] != extent_slot + 1);
4768 				path->slots[0] = extent_slot;
4769 				num_to_del = 2;
4770 			}
4771 		}
4772 
4773 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4774 				      num_to_del);
4775 		BUG_ON(ret);
4776 		btrfs_release_path(path);
4777 
4778 		if (is_data) {
4779 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4780 			BUG_ON(ret);
4781 		} else {
4782 			invalidate_mapping_pages(info->btree_inode->i_mapping,
4783 			     bytenr >> PAGE_CACHE_SHIFT,
4784 			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4785 		}
4786 
4787 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4788 		BUG_ON(ret);
4789 	}
4790 	btrfs_free_path(path);
4791 	return ret;
4792 }
4793 
4794 /*
4795  * when we free a block, it is possible (and likely) that we free the last
4796  * delayed ref for that extent as well.  This searches the delayed ref tree for
4797  * a given extent, and if there are no other delayed refs to be processed, it
4798  * removes it from the tree.
4799  */
4800 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4801 				      struct btrfs_root *root, u64 bytenr)
4802 {
4803 	struct btrfs_delayed_ref_head *head;
4804 	struct btrfs_delayed_ref_root *delayed_refs;
4805 	struct btrfs_delayed_ref_node *ref;
4806 	struct rb_node *node;
4807 	int ret = 0;
4808 
4809 	delayed_refs = &trans->transaction->delayed_refs;
4810 	spin_lock(&delayed_refs->lock);
4811 	head = btrfs_find_delayed_ref_head(trans, bytenr);
4812 	if (!head)
4813 		goto out;
4814 
4815 	node = rb_prev(&head->node.rb_node);
4816 	if (!node)
4817 		goto out;
4818 
4819 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4820 
4821 	/* there are still entries for this ref, we can't drop it */
4822 	if (ref->bytenr == bytenr)
4823 		goto out;
4824 
4825 	if (head->extent_op) {
4826 		if (!head->must_insert_reserved)
4827 			goto out;
4828 		kfree(head->extent_op);
4829 		head->extent_op = NULL;
4830 	}
4831 
4832 	/*
4833 	 * Waiting for the lock here would deadlock.  If someone else has it
4834 	 * locked, they are already in the process of dropping it anyway.
4835 	 */
4836 	if (!mutex_trylock(&head->mutex))
4837 		goto out;
4838 
4839 	/*
4840 	 * at this point we have a head with no other entries.  Go
4841 	 * ahead and process it.
4842 	 */
4843 	head->node.in_tree = 0;
4844 	rb_erase(&head->node.rb_node, &delayed_refs->root);
4845 
4846 	delayed_refs->num_entries--;
4847 
4848 	/*
4849 	 * we don't take a ref on the node because we're removing it from the
4850 	 * tree, so we just steal the ref the tree was holding.
4851 	 */
4852 	delayed_refs->num_heads--;
4853 	if (list_empty(&head->cluster))
4854 		delayed_refs->num_heads_ready--;
4855 
4856 	list_del_init(&head->cluster);
4857 	spin_unlock(&delayed_refs->lock);
4858 
4859 	BUG_ON(head->extent_op);
4860 	if (head->must_insert_reserved)
4861 		ret = 1;
4862 
4863 	mutex_unlock(&head->mutex);
4864 	btrfs_put_delayed_ref(&head->node);
4865 	return ret;
4866 out:
4867 	spin_unlock(&delayed_refs->lock);
4868 	return 0;
4869 }
4870 
4871 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4872 			   struct btrfs_root *root,
4873 			   struct extent_buffer *buf,
4874 			   u64 parent, int last_ref)
4875 {
4876 	struct btrfs_block_group_cache *cache = NULL;
4877 	int ret;
4878 
4879 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4880 		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4881 						parent, root->root_key.objectid,
4882 						btrfs_header_level(buf),
4883 						BTRFS_DROP_DELAYED_REF, NULL);
4884 		BUG_ON(ret);
4885 	}
4886 
4887 	if (!last_ref)
4888 		return;
4889 
4890 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4891 
4892 	if (btrfs_header_generation(buf) == trans->transid) {
4893 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4894 			ret = check_ref_cleanup(trans, root, buf->start);
4895 			if (!ret)
4896 				goto out;
4897 		}
4898 
4899 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4900 			pin_down_extent(root, cache, buf->start, buf->len, 1);
4901 			goto out;
4902 		}
4903 
4904 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4905 
4906 		btrfs_add_free_space(cache, buf->start, buf->len);
4907 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4908 	}
4909 out:
4910 	/*
4911 	 * We're deleting the buffer; clear the corrupt flag since it doesn't
4912 	 * matter anymore.
4913 	 */
4914 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4915 	btrfs_put_block_group(cache);
4916 }
4917 
4918 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4919 		      struct btrfs_root *root,
4920 		      u64 bytenr, u64 num_bytes, u64 parent,
4921 		      u64 root_objectid, u64 owner, u64 offset)
4922 {
4923 	int ret;
4924 
4925 	/*
4926 	 * tree log blocks never actually go into the extent allocation
4927 	 * tree, just update pinning info and exit early.
4928 	 */
4929 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4930 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4931 		/* unlocks the pinned mutex */
4932 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
4933 		ret = 0;
4934 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4935 		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4936 					parent, root_objectid, (int)owner,
4937 					BTRFS_DROP_DELAYED_REF, NULL);
4938 		BUG_ON(ret);
4939 	} else {
4940 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4941 					parent, root_objectid, owner,
4942 					offset, BTRFS_DROP_DELAYED_REF, NULL);
4943 		BUG_ON(ret);
4944 	}
4945 	return ret;
4946 }
4947 
4948 static u64 stripe_align(struct btrfs_root *root, u64 val)
4949 {
4950 	u64 mask = ((u64)root->stripesize - 1);
4951 	u64 ret = (val + mask) & ~mask;
4952 	return ret;
4953 }
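
/*
 * Example (illustrative): with a 64KiB stripesize, mask = 0xffff, so a val
 * of 0x21000 (132KiB) becomes (0x21000 + 0xffff) & ~0xffff = 0x30000, i.e.
 * the start is rounded up to the next stripe boundary.
 */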
4954 
4955 /*
4956  * when we wait for progress in the block group caching, its because
4957  * our allocation attempt failed at least once.  So, we must sleep
4958  * and let some progress happen before we try again.
4959  *
4960  * This function will sleep at least once waiting for new free space to
4961  * show up, and then it will check the block group free space numbers
4962  * for our min num_bytes.  Another option is to have it go ahead
4963  * and look in the rbtree for a free extent of a given size, but this
4964  * is a good start.
4965  */
4966 static noinline int
4967 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4968 				u64 num_bytes)
4969 {
4970 	struct btrfs_caching_control *caching_ctl;
4971 	DEFINE_WAIT(wait);
4972 
4973 	caching_ctl = get_caching_control(cache);
4974 	if (!caching_ctl)
4975 		return 0;
4976 
4977 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4978 		   (cache->free_space_ctl->free_space >= num_bytes));
4979 
4980 	put_caching_control(caching_ctl);
4981 	return 0;
4982 }
4983 
4984 static noinline int
4985 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4986 {
4987 	struct btrfs_caching_control *caching_ctl;
4988 	DEFINE_WAIT(wait);
4989 
4990 	caching_ctl = get_caching_control(cache);
4991 	if (!caching_ctl)
4992 		return 0;
4993 
4994 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
4995 
4996 	put_caching_control(caching_ctl);
4997 	return 0;
4998 }
4999 
5000 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5001 {
5002 	int index;
5003 	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
5004 		index = 0;
5005 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
5006 		index = 1;
5007 	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
5008 		index = 2;
5009 	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
5010 		index = 3;
5011 	else
5012 		index = 4;
5013 	return index;
5014 }
5015 
5016 enum btrfs_loop_type {
5017 	LOOP_FIND_IDEAL = 0,
5018 	LOOP_CACHING_NOWAIT = 1,
5019 	LOOP_CACHING_WAIT = 2,
5020 	LOOP_ALLOC_CHUNK = 3,
5021 	LOOP_NO_EMPTY_SIZE = 4,
5022 };
5023 
5024 /*
5025  * walks the btree of allocated extents and find a hole of a given size.
5026  * The key ins is changed to record the hole:
5027  * ins->objectid == block start
5028  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5029  * ins->offset == number of blocks
5030  * Any available blocks before search_start are skipped.
5031  */
5032 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5033 				     struct btrfs_root *orig_root,
5034 				     u64 num_bytes, u64 empty_size,
5035 				     u64 search_start, u64 search_end,
5036 				     u64 hint_byte, struct btrfs_key *ins,
5037 				     u64 data)
5038 {
5039 	int ret = 0;
5040 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5041 	struct btrfs_free_cluster *last_ptr = NULL;
5042 	struct btrfs_block_group_cache *block_group = NULL;
5043 	int empty_cluster = 2 * 1024 * 1024;
5044 	int allowed_chunk_alloc = 0;
5045 	int done_chunk_alloc = 0;
5046 	struct btrfs_space_info *space_info;
5047 	int last_ptr_loop = 0;
5048 	int loop = 0;
5049 	int index = 0;
5050 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5051 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5052 	bool found_uncached_bg = false;
5053 	bool failed_cluster_refill = false;
5054 	bool failed_alloc = false;
5055 	bool use_cluster = true;
5056 	bool have_caching_bg = false;
5057 	u64 ideal_cache_percent = 0;
5058 	u64 ideal_cache_offset = 0;
5059 
5060 	WARN_ON(num_bytes < root->sectorsize);
5061 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5062 	ins->objectid = 0;
5063 	ins->offset = 0;
5064 
5065 	space_info = __find_space_info(root->fs_info, data);
5066 	if (!space_info) {
5067 		printk(KERN_ERR "No space info for %llu\n", data);
5068 		return -ENOSPC;
5069 	}
5070 
5071 	/*
5072 	 * If the space info is for both data and metadata it means we have a
5073 	 * small filesystem and we can't use the clustering stuff.
5074 	 */
5075 	if (btrfs_mixed_space_info(space_info))
5076 		use_cluster = false;
5077 
5078 	if (orig_root->ref_cows || empty_size)
5079 		allowed_chunk_alloc = 1;
5080 
5081 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5082 		last_ptr = &root->fs_info->meta_alloc_cluster;
5083 		if (!btrfs_test_opt(root, SSD))
5084 			empty_cluster = 64 * 1024;
5085 	}
5086 
5087 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5088 	    btrfs_test_opt(root, SSD)) {
5089 		last_ptr = &root->fs_info->data_alloc_cluster;
5090 	}
5091 
5092 	if (last_ptr) {
5093 		spin_lock(&last_ptr->lock);
5094 		if (last_ptr->block_group)
5095 			hint_byte = last_ptr->window_start;
5096 		spin_unlock(&last_ptr->lock);
5097 	}
5098 
5099 	search_start = max(search_start, first_logical_byte(root, 0));
5100 	search_start = max(search_start, hint_byte);
5101 
5102 	if (!last_ptr)
5103 		empty_cluster = 0;
5104 
5105 	if (search_start == hint_byte) {
5106 ideal_cache:
5107 		block_group = btrfs_lookup_block_group(root->fs_info,
5108 						       search_start);
5109 		/*
5110 		 * we don't want to use the block group if it doesn't match our
5111 		 * allocation bits, or if it's not cached.
5112 		 *
5113 		 * However if we are re-searching with an ideal block group
5114 		 * picked out then we don't care that the block group is cached.
5115 		 */
5116 		if (block_group && block_group_bits(block_group, data) &&
5117 		    (block_group->cached != BTRFS_CACHE_NO ||
5118 		     search_start == ideal_cache_offset)) {
5119 			down_read(&space_info->groups_sem);
5120 			if (list_empty(&block_group->list) ||
5121 			    block_group->ro) {
5122 				/*
5123 				 * someone is removing this block group,
5124 				 * we can't jump into the have_block_group
5125 				 * target because our list pointers are not
5126 				 * valid
5127 				 */
5128 				btrfs_put_block_group(block_group);
5129 				up_read(&space_info->groups_sem);
5130 			} else {
5131 				index = get_block_group_index(block_group);
5132 				goto have_block_group;
5133 			}
5134 		} else if (block_group) {
5135 			btrfs_put_block_group(block_group);
5136 		}
5137 	}
5138 search:
5139 	have_caching_bg = false;
5140 	down_read(&space_info->groups_sem);
5141 	list_for_each_entry(block_group, &space_info->block_groups[index],
5142 			    list) {
5143 		u64 offset;
5144 		int cached;
5145 
5146 		btrfs_get_block_group(block_group);
5147 		search_start = block_group->key.objectid;
5148 
5149 		/*
5150 		 * this can happen if we end up cycling through all the
5151 		 * raid types, but we want to make sure we only allocate
5152 		 * for the proper type.
5153 		 */
5154 		if (!block_group_bits(block_group, data)) {
5155 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
5156 				    BTRFS_BLOCK_GROUP_RAID1 |
5157 				    BTRFS_BLOCK_GROUP_RAID10;
5158 
5159 			/*
5160 			 * if they asked for extra copies and this block group
5161 			 * doesn't provide them, bail.  This does allow us to
5162 			 * fill raid0 from raid1.
5163 			 */
5164 			if ((data & extra) && !(block_group->flags & extra))
5165 				goto loop;
5166 		}
5167 
5168 have_block_group:
5169 		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
5170 			u64 free_percent;
5171 
5172 			ret = cache_block_group(block_group, trans,
5173 						orig_root, 1);
5174 			if (block_group->cached == BTRFS_CACHE_FINISHED)
5175 				goto have_block_group;
5176 
5177 			free_percent = btrfs_block_group_used(&block_group->item);
5178 			free_percent *= 100;
5179 			free_percent = div64_u64(free_percent,
5180 						 block_group->key.offset);
5181 			free_percent = 100 - free_percent;
5182 			if (free_percent > ideal_cache_percent &&
5183 			    likely(!block_group->ro)) {
5184 				ideal_cache_offset = block_group->key.objectid;
5185 				ideal_cache_percent = free_percent;
5186 			}
5187 
5188 			/*
5189 			 * The caching workers are limited to 2 threads, so we
5190 			 * can queue as much work as we care to.
5191 			 */
5192 			if (loop > LOOP_FIND_IDEAL) {
5193 				ret = cache_block_group(block_group, trans,
5194 							orig_root, 0);
5195 				BUG_ON(ret);
5196 			}
5197 			found_uncached_bg = true;
5198 
5199 			/*
5200 			 * If loop is set for cached only, try the next block
5201 			 * group.
5202 			 */
5203 			if (loop == LOOP_FIND_IDEAL)
5204 				goto loop;
5205 		}
5206 
5207 		cached = block_group_cache_done(block_group);
5208 		if (unlikely(!cached))
5209 			found_uncached_bg = true;
5210 
5211 		if (unlikely(block_group->ro))
5212 			goto loop;
5213 
5214 		spin_lock(&block_group->free_space_ctl->tree_lock);
5215 		if (cached &&
5216 		    block_group->free_space_ctl->free_space <
5217 		    num_bytes + empty_size) {
5218 			spin_unlock(&block_group->free_space_ctl->tree_lock);
5219 			goto loop;
5220 		}
5221 		spin_unlock(&block_group->free_space_ctl->tree_lock);
5222 
5223 		/*
5224 		 * OK, we want to try to use the cluster allocator, so let's
5225 		 * look there, unless we are on LOOP_NO_EMPTY_SIZE.  By that
5226 		 * point we will have tried the cluster allocator plenty of
5227 		 * times and not found anything, so we are likely way too
5228 		 * fragmented for the clustering stuff to find anything; let's
5229 		 * just skip it and let the allocator find whatever block it
5230 		 * can find on its own.
5231 		 */
5232 		if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
5233 			/*
5234 			 * the refill lock keeps out other
5235 			 * people trying to start a new cluster
5236 			 */
5237 			spin_lock(&last_ptr->refill_lock);
5238 			if (last_ptr->block_group &&
5239 			    (last_ptr->block_group->ro ||
5240 			    !block_group_bits(last_ptr->block_group, data))) {
5241 				offset = 0;
5242 				goto refill_cluster;
5243 			}
5244 
5245 			offset = btrfs_alloc_from_cluster(block_group, last_ptr,
5246 						 num_bytes, search_start);
5247 			if (offset) {
5248 				/* we have a block, we're done */
5249 				spin_unlock(&last_ptr->refill_lock);
5250 				goto checks;
5251 			}
5252 
5253 			spin_lock(&last_ptr->lock);
5254 			/*
5255 			 * whoops, this cluster doesn't actually point to
5256 			 * this block group.  Get a ref on the block
5257 			 * group it does point to and try again.
5258 			 */
5259 			if (!last_ptr_loop && last_ptr->block_group &&
5260 			    last_ptr->block_group != block_group &&
5261 			    index <=
5262 				 get_block_group_index(last_ptr->block_group)) {
5263 
5264 				btrfs_put_block_group(block_group);
5265 				block_group = last_ptr->block_group;
5266 				btrfs_get_block_group(block_group);
5267 				spin_unlock(&last_ptr->lock);
5268 				spin_unlock(&last_ptr->refill_lock);
5269 
5270 				last_ptr_loop = 1;
5271 				search_start = block_group->key.objectid;
5272 				/*
5273 				 * we know this block group is properly
5274 				 * in the list because
5275 				 * btrfs_remove_block_group drops the
5276 				 * cluster before it removes the block
5277 				 * group from the list
5278 				 */
5279 				goto have_block_group;
5280 			}
5281 			spin_unlock(&last_ptr->lock);
5282 refill_cluster:
5283 			/*
5284 			 * this cluster didn't work out, free it and
5285 			 * start over
5286 			 */
5287 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5288 
5289 			last_ptr_loop = 0;
5290 
5291 			/* allocate a cluster in this block group */
5292 			ret = btrfs_find_space_cluster(trans, root,
5293 					       block_group, last_ptr,
5294 					       offset, num_bytes,
5295 					       empty_cluster + empty_size);
5296 			if (ret == 0) {
5297 				/*
5298 				 * now pull our allocation out of this
5299 				 * cluster
5300 				 */
5301 				offset = btrfs_alloc_from_cluster(block_group,
5302 						  last_ptr, num_bytes,
5303 						  search_start);
5304 				if (offset) {
5305 					/* we found one, proceed */
5306 					spin_unlock(&last_ptr->refill_lock);
5307 					goto checks;
5308 				}
5309 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
5310 				   && !failed_cluster_refill) {
5311 				spin_unlock(&last_ptr->refill_lock);
5312 
5313 				failed_cluster_refill = true;
5314 				wait_block_group_cache_progress(block_group,
5315 				       num_bytes + empty_cluster + empty_size);
5316 				goto have_block_group;
5317 			}
5318 
5319 			/*
5320 			 * at this point we either didn't find a cluster
5321 			 * or we weren't able to allocate a block from our
5322 			 * cluster.  Free the cluster we've been trying
5323 			 * to use, and go to the next block group
5324 			 */
5325 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
5326 			spin_unlock(&last_ptr->refill_lock);
5327 			goto loop;
5328 		}
5329 
5330 		offset = btrfs_find_space_for_alloc(block_group, search_start,
5331 						    num_bytes, empty_size);
5332 		/*
5333 		 * If we didn't find a chunk, and we haven't failed on this
5334 		 * block group before, and this block group is in the middle of
5335 		 * caching and we are ok with waiting, then go ahead and wait
5336 		 * for progress to be made, and set failed_alloc to true.
5337 		 *
5338 		 * If failed_alloc is true then we've already waited on this
5339 		 * block group once and should move on to the next block group.
5340 		 */
5341 		if (!offset && !failed_alloc && !cached &&
5342 		    loop > LOOP_CACHING_NOWAIT) {
5343 			wait_block_group_cache_progress(block_group,
5344 						num_bytes + empty_size);
5345 			failed_alloc = true;
5346 			goto have_block_group;
5347 		} else if (!offset) {
5348 			if (!cached)
5349 				have_caching_bg = true;
5350 			goto loop;
5351 		}
5352 checks:
5353 		search_start = stripe_align(root, offset);
5354 		/* move on to the next group */
5355 		if (search_start + num_bytes >= search_end) {
5356 			btrfs_add_free_space(block_group, offset, num_bytes);
5357 			goto loop;
5358 		}
5359 
5360 		/* move on to the next group */
5361 		if (search_start + num_bytes >
5362 		    block_group->key.objectid + block_group->key.offset) {
5363 			btrfs_add_free_space(block_group, offset, num_bytes);
5364 			goto loop;
5365 		}
5366 
5367 		ins->objectid = search_start;
5368 		ins->offset = num_bytes;
5369 
5370 		if (offset < search_start)
5371 			btrfs_add_free_space(block_group, offset,
5372 					     search_start - offset);
5373 		BUG_ON(offset > search_start);
5374 
5375 		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5376 						  alloc_type);
5377 		if (ret == -EAGAIN) {
5378 			btrfs_add_free_space(block_group, offset, num_bytes);
5379 			goto loop;
5380 		}
5381 
5382 		/* we are all good, let's return */
5383 		ins->objectid = search_start;
5384 		ins->offset = num_bytes;
5385 
5386 		if (offset < search_start)
5387 			btrfs_add_free_space(block_group, offset,
5388 					     search_start - offset);
5389 		BUG_ON(offset > search_start);
5390 		btrfs_put_block_group(block_group);
5391 		break;
5392 loop:
5393 		failed_cluster_refill = false;
5394 		failed_alloc = false;
5395 		BUG_ON(index != get_block_group_index(block_group));
5396 		btrfs_put_block_group(block_group);
5397 	}
5398 	up_read(&space_info->groups_sem);
5399 
5400 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5401 		goto search;
5402 
5403 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5404 		goto search;
5405 
5406 	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
5407 	 *			them to make caching progress.  Also
5408 	 *			determine the best possible bg to cache
5409 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5410 	 *			caching kthreads as we move along
5411 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5412 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5413 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5414 	 *			again
5415 	 */
5416 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5417 		index = 0;
5418 		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5419 			found_uncached_bg = false;
5420 			loop++;
5421 			if (!ideal_cache_percent)
5422 				goto search;
5423 
5424 			/*
5425 			 * 1 of the following 2 things have happened so far
5426 			 * 1 of the following 2 things has happened so far
5427 			 * 1) We found an ideal block group for caching that
5428 			 * is mostly full and will cache quickly, so we might
5429 			 * as well wait for it.
5430 			 *
5431 			 * 2) We searched for cached only and we didn't find
5432 			 * anything, and we didn't start any caching kthreads
5433 			 * either, so chances are we will loop through and
5434 			 * start a couple caching kthreads, and then come back
5435 			 * around and just wait for them.  This will be slower
5436 			 * because we will have 2 caching kthreads reading at
5437 			 * the same time when we could have just started one
5438 			 * and waited for it to get far enough to give us an
5439 			 * allocation, so go ahead and go to the wait caching
5440 			 * loop.
5441 			 */
5442 			loop = LOOP_CACHING_WAIT;
5443 			search_start = ideal_cache_offset;
5444 			ideal_cache_percent = 0;
5445 			goto ideal_cache;
5446 		} else if (loop == LOOP_FIND_IDEAL) {
5447 			/*
5448 			 * Didn't find an uncached bg, wait on anything we find
5449 			 * next.
5450 			 */
5451 			loop = LOOP_CACHING_WAIT;
5452 			goto search;
5453 		}
5454 
5455 		loop++;
5456 
5457 		if (loop == LOOP_ALLOC_CHUNK) {
5458 		       if (allowed_chunk_alloc) {
5459 				ret = do_chunk_alloc(trans, root, num_bytes +
5460 						     2 * 1024 * 1024, data,
5461 						     CHUNK_ALLOC_LIMITED);
5462 				allowed_chunk_alloc = 0;
5463 				if (ret == 1)
5464 					done_chunk_alloc = 1;
5465 			} else if (!done_chunk_alloc &&
5466 				   space_info->force_alloc ==
5467 				   CHUNK_ALLOC_NO_FORCE) {
5468 				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5469 			}
5470 
5471 		       /*
5472 			* We didn't allocate a chunk, go ahead and drop the
5473 			* empty size and loop again.
5474 			*/
5475 		       if (!done_chunk_alloc)
5476 			       loop = LOOP_NO_EMPTY_SIZE;
5477 		}
5478 
5479 		if (loop == LOOP_NO_EMPTY_SIZE) {
5480 			empty_size = 0;
5481 			empty_cluster = 0;
5482 		}
5483 
5484 		goto search;
5485 	} else if (!ins->objectid) {
5486 		ret = -ENOSPC;
5487 	} else if (ins->objectid) {
5488 		ret = 0;
5489 	}
5490 
5491 	return ret;
5492 }
5493 
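/*
 * debugging helper: print the usage counters of a space_info and, if
 * dump_block_groups is set, of every block group it contains.
 */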
5494 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5495 			    int dump_block_groups)
5496 {
5497 	struct btrfs_block_group_cache *cache;
5498 	int index = 0;
5499 
5500 	spin_lock(&info->lock);
5501 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5502 	       (unsigned long long)info->flags,
5503 	       (unsigned long long)(info->total_bytes - info->bytes_used -
5504 				    info->bytes_pinned - info->bytes_reserved -
5505 				    info->bytes_readonly),
5506 	       (info->full) ? "" : "not ");
5507 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5508 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
5509 	       (unsigned long long)info->total_bytes,
5510 	       (unsigned long long)info->bytes_used,
5511 	       (unsigned long long)info->bytes_pinned,
5512 	       (unsigned long long)info->bytes_reserved,
5513 	       (unsigned long long)info->bytes_may_use,
5514 	       (unsigned long long)info->bytes_readonly);
5515 	spin_unlock(&info->lock);
5516 
5517 	if (!dump_block_groups)
5518 		return;
5519 
5520 	down_read(&info->groups_sem);
5521 again:
5522 	list_for_each_entry(cache, &info->block_groups[index], list) {
5523 		spin_lock(&cache->lock);
5524 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5525 		       "%llu pinned %llu reserved\n",
5526 		       (unsigned long long)cache->key.objectid,
5527 		       (unsigned long long)cache->key.offset,
5528 		       (unsigned long long)btrfs_block_group_used(&cache->item),
5529 		       (unsigned long long)cache->pinned,
5530 		       (unsigned long long)cache->reserved);
5531 		btrfs_dump_free_space(cache, bytes);
5532 		spin_unlock(&cache->lock);
5533 	}
5534 	if (++index < BTRFS_NR_RAID_TYPES)
5535 		goto again;
5536 	up_read(&info->groups_sem);
5537 }
5538 
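/*
 * reserve an extent of at least min_alloc_size bytes, preferring num_bytes.
 * On ENOSPC the requested size is halved (kept sector aligned and no smaller
 * than min_alloc_size) and a chunk allocation is forced before retrying.
 * The resulting extent is returned through ins.
 */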
5539 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5540 			 struct btrfs_root *root,
5541 			 u64 num_bytes, u64 min_alloc_size,
5542 			 u64 empty_size, u64 hint_byte,
5543 			 u64 search_end, struct btrfs_key *ins,
5544 			 u64 data)
5545 {
5546 	int ret;
5547 	u64 search_start = 0;
5548 
5549 	data = btrfs_get_alloc_profile(root, data);
5550 again:
5551 	/*
5552 	 * the only place that sets empty_size is btrfs_realloc_node, which
5553 	 * is not called recursively on allocations
5554 	 */
5555 	if (empty_size || root->ref_cows)
5556 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5557 				     num_bytes + 2 * 1024 * 1024, data,
5558 				     CHUNK_ALLOC_NO_FORCE);
5559 
5560 	WARN_ON(num_bytes < root->sectorsize);
5561 	ret = find_free_extent(trans, root, num_bytes, empty_size,
5562 			       search_start, search_end, hint_byte,
5563 			       ins, data);
5564 
5565 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5566 		num_bytes = num_bytes >> 1;
5567 		num_bytes = num_bytes & ~(root->sectorsize - 1);
5568 		num_bytes = max(num_bytes, min_alloc_size);
5569 		do_chunk_alloc(trans, root->fs_info->extent_root,
5570 			       num_bytes, data, CHUNK_ALLOC_FORCE);
5571 		goto again;
5572 	}
5573 	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5574 		struct btrfs_space_info *sinfo;
5575 
5576 		sinfo = __find_space_info(root->fs_info, data);
5577 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
5578 		       "wanted %llu\n", (unsigned long long)data,
5579 		       (unsigned long long)num_bytes);
5580 		dump_space_info(sinfo, num_bytes, 1);
5581 	}
5582 
5583 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5584 
5585 	return ret;
5586 }
5587 
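/*
 * give a reserved but unused extent back to its block group: discard it if
 * the DISCARD mount option is set, then either pin it down or return it to
 * the free space cache and drop the reservation.
 */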
5588 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5589 					u64 start, u64 len, int pin)
5590 {
5591 	struct btrfs_block_group_cache *cache;
5592 	int ret = 0;
5593 
5594 	cache = btrfs_lookup_block_group(root->fs_info, start);
5595 	if (!cache) {
5596 		printk(KERN_ERR "Unable to find block group for %llu\n",
5597 		       (unsigned long long)start);
5598 		return -ENOSPC;
5599 	}
5600 
5601 	if (btrfs_test_opt(root, DISCARD))
5602 		ret = btrfs_discard_extent(root, start, len, NULL);
5603 
5604 	if (pin)
5605 		pin_down_extent(root, cache, start, len, 1);
5606 	else {
5607 		btrfs_add_free_space(cache, start, len);
5608 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5609 	}
5610 	btrfs_put_block_group(cache);
5611 
5612 	trace_btrfs_reserved_extent_free(root, start, len);
5613 
5614 	return ret;
5615 }
5616 
5617 int btrfs_free_reserved_extent(struct btrfs_root *root,
5618 					u64 start, u64 len)
5619 {
5620 	return __btrfs_free_reserved_extent(root, start, len, 0);
5621 }
5622 
5623 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5624 				       u64 start, u64 len)
5625 {
5626 	return __btrfs_free_reserved_extent(root, start, len, 1);
5627 }
5628 
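/*
 * insert the extent item for a newly allocated data extent, together with
 * an inline backref (shared if parent is set, keyed otherwise), and update
 * the block group accounting.
 */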
5629 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5630 				      struct btrfs_root *root,
5631 				      u64 parent, u64 root_objectid,
5632 				      u64 flags, u64 owner, u64 offset,
5633 				      struct btrfs_key *ins, int ref_mod)
5634 {
5635 	int ret;
5636 	struct btrfs_fs_info *fs_info = root->fs_info;
5637 	struct btrfs_extent_item *extent_item;
5638 	struct btrfs_extent_inline_ref *iref;
5639 	struct btrfs_path *path;
5640 	struct extent_buffer *leaf;
5641 	int type;
5642 	u32 size;
5643 
5644 	if (parent > 0)
5645 		type = BTRFS_SHARED_DATA_REF_KEY;
5646 	else
5647 		type = BTRFS_EXTENT_DATA_REF_KEY;
5648 
5649 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5650 
5651 	path = btrfs_alloc_path();
5652 	if (!path)
5653 		return -ENOMEM;
5654 
5655 	path->leave_spinning = 1;
5656 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5657 				      ins, size);
5658 	BUG_ON(ret);
5659 
5660 	leaf = path->nodes[0];
5661 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
5662 				     struct btrfs_extent_item);
5663 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5664 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5665 	btrfs_set_extent_flags(leaf, extent_item,
5666 			       flags | BTRFS_EXTENT_FLAG_DATA);
5667 
5668 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5669 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
5670 	if (parent > 0) {
5671 		struct btrfs_shared_data_ref *ref;
5672 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
5673 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5674 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5675 	} else {
5676 		struct btrfs_extent_data_ref *ref;
5677 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5678 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5679 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5680 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5681 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5682 	}
5683 
5684 	btrfs_mark_buffer_dirty(path->nodes[0]);
5685 	btrfs_free_path(path);
5686 
5687 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5688 	if (ret) {
5689 		printk(KERN_ERR "btrfs update block group failed for %llu "
5690 		       "%llu\n", (unsigned long long)ins->objectid,
5691 		       (unsigned long long)ins->offset);
5692 		BUG();
5693 	}
5694 	return ret;
5695 }
5696 
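/*
 * insert the extent item for a newly allocated tree block, including the
 * tree block info (key and level) and an inline backref, and update the
 * block group accounting.
 */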
5697 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5698 				     struct btrfs_root *root,
5699 				     u64 parent, u64 root_objectid,
5700 				     u64 flags, struct btrfs_disk_key *key,
5701 				     int level, struct btrfs_key *ins)
5702 {
5703 	int ret;
5704 	struct btrfs_fs_info *fs_info = root->fs_info;
5705 	struct btrfs_extent_item *extent_item;
5706 	struct btrfs_tree_block_info *block_info;
5707 	struct btrfs_extent_inline_ref *iref;
5708 	struct btrfs_path *path;
5709 	struct extent_buffer *leaf;
5710 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5711 
5712 	path = btrfs_alloc_path();
5713 	if (!path)
5714 		return -ENOMEM;
5715 
5716 	path->leave_spinning = 1;
5717 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5718 				      ins, size);
5719 	BUG_ON(ret);
5720 
5721 	leaf = path->nodes[0];
5722 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
5723 				     struct btrfs_extent_item);
5724 	btrfs_set_extent_refs(leaf, extent_item, 1);
5725 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5726 	btrfs_set_extent_flags(leaf, extent_item,
5727 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5728 	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5729 
5730 	btrfs_set_tree_block_key(leaf, block_info, key);
5731 	btrfs_set_tree_block_level(leaf, block_info, level);
5732 
5733 	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5734 	if (parent > 0) {
5735 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5736 		btrfs_set_extent_inline_ref_type(leaf, iref,
5737 						 BTRFS_SHARED_BLOCK_REF_KEY);
5738 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5739 	} else {
5740 		btrfs_set_extent_inline_ref_type(leaf, iref,
5741 						 BTRFS_TREE_BLOCK_REF_KEY);
5742 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5743 	}
5744 
5745 	btrfs_mark_buffer_dirty(leaf);
5746 	btrfs_free_path(path);
5747 
5748 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5749 	if (ret) {
5750 		printk(KERN_ERR "btrfs update block group failed for %llu "
5751 		       "%llu\n", (unsigned long long)ins->objectid,
5752 		       (unsigned long long)ins->offset);
5753 		BUG();
5754 	}
5755 	return ret;
5756 }
5757 
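/*
 * record a newly allocated data extent by queueing a delayed ref; the
 * extent item itself is inserted when the delayed refs are run.
 */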
5758 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5759 				     struct btrfs_root *root,
5760 				     u64 root_objectid, u64 owner,
5761 				     u64 offset, struct btrfs_key *ins)
5762 {
5763 	int ret;
5764 
5765 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5766 
5767 	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5768 					 0, root_objectid, owner, offset,
5769 					 BTRFS_ADD_DELAYED_EXTENT, NULL);
5770 	return ret;
5771 }
5772 
5773 /*
5774  * this is used by the tree logging recovery code.  It records that
5775  * an extent has been allocated and makes sure to clear the free
5776  * space cache bits as well
5777  */
5778 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5779 				   struct btrfs_root *root,
5780 				   u64 root_objectid, u64 owner, u64 offset,
5781 				   struct btrfs_key *ins)
5782 {
5783 	int ret;
5784 	struct btrfs_block_group_cache *block_group;
5785 	struct btrfs_caching_control *caching_ctl;
5786 	u64 start = ins->objectid;
5787 	u64 num_bytes = ins->offset;
5788 
5789 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5790 	cache_block_group(block_group, trans, NULL, 0);
5791 	caching_ctl = get_caching_control(block_group);
5792 
5793 	if (!caching_ctl) {
5794 		BUG_ON(!block_group_cache_done(block_group));
5795 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
5796 		BUG_ON(ret);
5797 	} else {
5798 		mutex_lock(&caching_ctl->mutex);
5799 
5800 		if (start >= caching_ctl->progress) {
5801 			ret = add_excluded_extent(root, start, num_bytes);
5802 			BUG_ON(ret);
5803 		} else if (start + num_bytes <= caching_ctl->progress) {
5804 			ret = btrfs_remove_free_space(block_group,
5805 						      start, num_bytes);
5806 			BUG_ON(ret);
5807 		} else {
5808 			num_bytes = caching_ctl->progress - start;
5809 			ret = btrfs_remove_free_space(block_group,
5810 						      start, num_bytes);
5811 			BUG_ON(ret);
5812 
5813 			start = caching_ctl->progress;
5814 			num_bytes = ins->objectid + ins->offset -
5815 				    caching_ctl->progress;
5816 			ret = add_excluded_extent(root, start, num_bytes);
5817 			BUG_ON(ret);
5818 		}
5819 
5820 		mutex_unlock(&caching_ctl->mutex);
5821 		put_caching_control(caching_ctl);
5822 	}
5823 
5824 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5825 					  RESERVE_ALLOC_NO_ACCOUNT);
5826 	BUG_ON(ret);
5827 	btrfs_put_block_group(block_group);
5828 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5829 					 0, owner, offset, ins, 1);
5830 	return ret;
5831 }
5832 
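/*
 * set up the extent buffer for a freshly allocated tree block: stamp the
 * new generation, lock the buffer, clean it and mark the range dirty in
 * the proper dirty_pages tree (log trees alternate between the DIRTY and
 * NEW bits so two log transactions can coexist).  The buffer is returned
 * locked.
 */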
5833 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5834 					    struct btrfs_root *root,
5835 					    u64 bytenr, u32 blocksize,
5836 					    int level)
5837 {
5838 	struct extent_buffer *buf;
5839 
5840 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5841 	if (!buf)
5842 		return ERR_PTR(-ENOMEM);
5843 	btrfs_set_header_generation(buf, trans->transid);
5844 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5845 	btrfs_tree_lock(buf);
5846 	clean_tree_block(trans, root, buf);
5847 
5848 	btrfs_set_lock_blocking(buf);
5849 	btrfs_set_buffer_uptodate(buf);
5850 
5851 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5852 		/*
5853 		 * we allow two log transactions at a time, use different
5854 		 * EXTENT bits to differentiate dirty pages.
5855 		 */
5856 		if (root->log_transid % 2 == 0)
5857 			set_extent_dirty(&root->dirty_log_pages, buf->start,
5858 					buf->start + buf->len - 1, GFP_NOFS);
5859 		else
5860 			set_extent_new(&root->dirty_log_pages, buf->start,
5861 					buf->start + buf->len - 1, GFP_NOFS);
5862 	} else {
5863 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5864 			 buf->start + buf->len - 1, GFP_NOFS);
5865 	}
5866 	trans->blocks_used++;
5867 	/* this returns a buffer locked for blocking */
5868 	return buf;
5869 }
5870 
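/*
 * pick the block reservation backing a tree block allocation and consume
 * blocksize bytes from it, falling back to a fresh reservation or to the
 * global reserve when the preferred rsv runs dry.  Returns an ERR_PTR on
 * failure.
 */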
5871 static struct btrfs_block_rsv *
5872 use_block_rsv(struct btrfs_trans_handle *trans,
5873 	      struct btrfs_root *root, u32 blocksize)
5874 {
5875 	struct btrfs_block_rsv *block_rsv;
5876 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5877 	int ret;
5878 
5879 	block_rsv = get_block_rsv(trans, root);
5880 
5881 	if (block_rsv->size == 0) {
5882 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5883 		/*
5884 		 * If we couldn't reserve metadata bytes try and use some from
5885 		 * the global reserve.
5886 		 */
5887 		if (ret && block_rsv != global_rsv) {
5888 			ret = block_rsv_use_bytes(global_rsv, blocksize);
5889 			if (!ret)
5890 				return global_rsv;
5891 			return ERR_PTR(ret);
5892 		} else if (ret) {
5893 			return ERR_PTR(ret);
5894 		}
5895 		return block_rsv;
5896 	}
5897 
5898 	ret = block_rsv_use_bytes(block_rsv, blocksize);
5899 	if (!ret)
5900 		return block_rsv;
5901 	if (ret) {
5902 		static DEFINE_RATELIMIT_STATE(_rs,
5903 				DEFAULT_RATELIMIT_INTERVAL,
5904 				/*DEFAULT_RATELIMIT_BURST*/ 2);
5905 		if (__ratelimit(&_rs)) {
5906 			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5907 			WARN_ON(1);
5908 		}
5909 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5910 		if (!ret) {
5911 			return block_rsv;
5912 		} else if (ret && block_rsv != global_rsv) {
5913 			ret = block_rsv_use_bytes(global_rsv, blocksize);
5914 			if (!ret)
5915 				return global_rsv;
5916 		}
5917 	}
5918 
5919 	return ERR_PTR(-ENOSPC);
5920 }
5921 
5922 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5923 {
5924 	block_rsv_add_bytes(block_rsv, blocksize, 0);
5925 	block_rsv_release_bytes(block_rsv, NULL, 0);
5926 }
5927 
5928 /*
5929  * finds a free extent and does all the dirty work required for allocation.
5930  * returns the key for the extent through ins, and the tree buffer for the
5931  * first block of the extent as the return value.
5932  *
5933  * returns the locked tree buffer or an ERR_PTR on failure.
5934  */
5935 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5936 					struct btrfs_root *root, u32 blocksize,
5937 					u64 parent, u64 root_objectid,
5938 					struct btrfs_disk_key *key, int level,
5939 					u64 hint, u64 empty_size)
5940 {
5941 	struct btrfs_key ins;
5942 	struct btrfs_block_rsv *block_rsv;
5943 	struct extent_buffer *buf;
5944 	u64 flags = 0;
5945 	int ret;
5946 
5947 
5948 	block_rsv = use_block_rsv(trans, root, blocksize);
5949 	if (IS_ERR(block_rsv))
5950 		return ERR_CAST(block_rsv);
5951 
5952 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5953 				   empty_size, hint, (u64)-1, &ins, 0);
5954 	if (ret) {
5955 		unuse_block_rsv(block_rsv, blocksize);
5956 		return ERR_PTR(ret);
5957 	}
5958 
5959 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5960 				    blocksize, level);
5961 	BUG_ON(IS_ERR(buf));
5962 
5963 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5964 		if (parent == 0)
5965 			parent = ins.objectid;
5966 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5967 	} else
5968 		BUG_ON(parent > 0);
5969 
5970 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5971 		struct btrfs_delayed_extent_op *extent_op;
5972 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5973 		BUG_ON(!extent_op);
5974 		if (key)
5975 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
5976 		else
5977 			memset(&extent_op->key, 0, sizeof(extent_op->key));
5978 		extent_op->flags_to_set = flags;
5979 		extent_op->update_key = 1;
5980 		extent_op->update_flags = 1;
5981 		extent_op->is_data = 0;
5982 
5983 		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5984 					ins.offset, parent, root_objectid,
5985 					level, BTRFS_ADD_DELAYED_EXTENT,
5986 					extent_op);
5987 		BUG_ON(ret);
5988 	}
5989 	return buf;
5990 }
5991 
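/*
 * state for the tree walk used when dropping a snapshot: per-level ref
 * counts and flags, the current stage (DROP_REFERENCE or UPDATE_BACKREF),
 * the progress key used when updating backrefs, and the readahead window.
 */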
5992 struct walk_control {
5993 	u64 refs[BTRFS_MAX_LEVEL];
5994 	u64 flags[BTRFS_MAX_LEVEL];
5995 	struct btrfs_key update_progress;
5996 	int stage;
5997 	int level;
5998 	int shared_level;
5999 	int update_ref;
6000 	int keep_locks;
6001 	int reada_slot;
6002 	int reada_count;
6003 };
6004 
6005 #define DROP_REFERENCE	1
6006 #define UPDATE_BACKREF	2
6007 
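/*
 * issue readahead for the lower level blocks pointed to by the current
 * node, skipping blocks the walk is going to skip anyway.  The readahead
 * window grows and shrinks based on how far the walk has advanced.
 */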
6008 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6009 				     struct btrfs_root *root,
6010 				     struct walk_control *wc,
6011 				     struct btrfs_path *path)
6012 {
6013 	u64 bytenr;
6014 	u64 generation;
6015 	u64 refs;
6016 	u64 flags;
6017 	u32 nritems;
6018 	u32 blocksize;
6019 	struct btrfs_key key;
6020 	struct extent_buffer *eb;
6021 	int ret;
6022 	int slot;
6023 	int nread = 0;
6024 
6025 	if (path->slots[wc->level] < wc->reada_slot) {
6026 		wc->reada_count = wc->reada_count * 2 / 3;
6027 		wc->reada_count = max(wc->reada_count, 2);
6028 	} else {
6029 		wc->reada_count = wc->reada_count * 3 / 2;
6030 		wc->reada_count = min_t(int, wc->reada_count,
6031 					BTRFS_NODEPTRS_PER_BLOCK(root));
6032 	}
6033 
6034 	eb = path->nodes[wc->level];
6035 	nritems = btrfs_header_nritems(eb);
6036 	blocksize = btrfs_level_size(root, wc->level - 1);
6037 
6038 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6039 		if (nread >= wc->reada_count)
6040 			break;
6041 
6042 		cond_resched();
6043 		bytenr = btrfs_node_blockptr(eb, slot);
6044 		generation = btrfs_node_ptr_generation(eb, slot);
6045 
6046 		if (slot == path->slots[wc->level])
6047 			goto reada;
6048 
6049 		if (wc->stage == UPDATE_BACKREF &&
6050 		    generation <= root->root_key.offset)
6051 			continue;
6052 
6053 		/* We don't lock the tree block, it's OK to be racy here */
6054 		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6055 					       &refs, &flags);
6056 		BUG_ON(ret);
6057 		BUG_ON(refs == 0);
6058 
6059 		if (wc->stage == DROP_REFERENCE) {
6060 			if (refs == 1)
6061 				goto reada;
6062 
6063 			if (wc->level == 1 &&
6064 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6065 				continue;
6066 			if (!wc->update_ref ||
6067 			    generation <= root->root_key.offset)
6068 				continue;
6069 			btrfs_node_key_to_cpu(eb, &key, slot);
6070 			ret = btrfs_comp_cpu_keys(&key,
6071 						  &wc->update_progress);
6072 			if (ret < 0)
6073 				continue;
6074 		} else {
6075 			if (wc->level == 1 &&
6076 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6077 				continue;
6078 		}
6079 reada:
6080 		ret = readahead_tree_block(root, bytenr, blocksize,
6081 					   generation);
6082 		if (ret)
6083 			break;
6084 		nread++;
6085 	}
6086 	wc->reada_slot = slot;
6087 }
6088 
6089 /*
6090  * helper to process a tree block while walking down the tree.
6091  *
6092  * when wc->stage == UPDATE_BACKREF, this function updates
6093  * back refs for pointers in the block.
6094  *
6095  * NOTE: return value 1 means we should stop walking down.
6096  */
6097 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6098 				   struct btrfs_root *root,
6099 				   struct btrfs_path *path,
6100 				   struct walk_control *wc, int lookup_info)
6101 {
6102 	int level = wc->level;
6103 	struct extent_buffer *eb = path->nodes[level];
6104 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6105 	int ret;
6106 
6107 	if (wc->stage == UPDATE_BACKREF &&
6108 	    btrfs_header_owner(eb) != root->root_key.objectid)
6109 		return 1;
6110 
6111 	/*
6112 	 * when the reference count of a tree block is 1, it won't increase
6113 	 * again. once the full backref flag is set, we never clear it.
6114 	 */
6115 	if (lookup_info &&
6116 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6117 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6118 		BUG_ON(!path->locks[level]);
6119 		ret = btrfs_lookup_extent_info(trans, root,
6120 					       eb->start, eb->len,
6121 					       &wc->refs[level],
6122 					       &wc->flags[level]);
6123 		BUG_ON(ret);
6124 		BUG_ON(wc->refs[level] == 0);
6125 	}
6126 
6127 	if (wc->stage == DROP_REFERENCE) {
6128 		if (wc->refs[level] > 1)
6129 			return 1;
6130 
6131 		if (path->locks[level] && !wc->keep_locks) {
6132 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6133 			path->locks[level] = 0;
6134 		}
6135 		return 0;
6136 	}
6137 
6138 	/* wc->stage == UPDATE_BACKREF */
6139 	if (!(wc->flags[level] & flag)) {
6140 		BUG_ON(!path->locks[level]);
6141 		ret = btrfs_inc_ref(trans, root, eb, 1);
6142 		BUG_ON(ret);
6143 		ret = btrfs_dec_ref(trans, root, eb, 0);
6144 		BUG_ON(ret);
6145 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6146 						  eb->len, flag, 0);
6147 		BUG_ON(ret);
6148 		wc->flags[level] |= flag;
6149 	}
6150 
6151 	/*
6152 	 * the block is shared by multiple trees, so it's not good to
6153 	 * keep the tree lock
6154 	 */
6155 	if (path->locks[level] && level > 0) {
6156 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6157 		path->locks[level] = 0;
6158 	}
6159 	return 0;
6160 }
6161 
6162 /*
6163  * helper to process a tree block pointer.
6164  *
6165  * when wc->stage == DROP_REFERENCE, this function checks
6166  * reference count of the block pointed to. if the block
6167  * is shared and we need update back refs for the subtree
6168  * rooted at the block, this function changes wc->stage to
6169  * UPDATE_BACKREF. if the block is shared and there is no
6170  * need to update backrefs, this function drops the reference
6171  * to the block.
6172  *
6173  * NOTE: return value 1 means we should stop walking down.
6174  */
6175 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6176 				 struct btrfs_root *root,
6177 				 struct btrfs_path *path,
6178 				 struct walk_control *wc, int *lookup_info)
6179 {
6180 	u64 bytenr;
6181 	u64 generation;
6182 	u64 parent;
6183 	u32 blocksize;
6184 	struct btrfs_key key;
6185 	struct extent_buffer *next;
6186 	int level = wc->level;
6187 	int reada = 0;
6188 	int ret = 0;
6189 
6190 	generation = btrfs_node_ptr_generation(path->nodes[level],
6191 					       path->slots[level]);
6192 	/*
6193 	 * if the lower level block was created before the snapshot
6194 	 * was created, we know there is no need to update back refs
6195 	 * for the subtree
6196 	 */
6197 	if (wc->stage == UPDATE_BACKREF &&
6198 	    generation <= root->root_key.offset) {
6199 		*lookup_info = 1;
6200 		return 1;
6201 	}
6202 
6203 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6204 	blocksize = btrfs_level_size(root, level - 1);
6205 
6206 	next = btrfs_find_tree_block(root, bytenr, blocksize);
6207 	if (!next) {
6208 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6209 		if (!next)
6210 			return -ENOMEM;
6211 		reada = 1;
6212 	}
6213 	btrfs_tree_lock(next);
6214 	btrfs_set_lock_blocking(next);
6215 
6216 	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6217 				       &wc->refs[level - 1],
6218 				       &wc->flags[level - 1]);
6219 	BUG_ON(ret);
6220 	BUG_ON(wc->refs[level - 1] == 0);
6221 	*lookup_info = 0;
6222 
6223 	if (wc->stage == DROP_REFERENCE) {
6224 		if (wc->refs[level - 1] > 1) {
6225 			if (level == 1 &&
6226 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6227 				goto skip;
6228 
6229 			if (!wc->update_ref ||
6230 			    generation <= root->root_key.offset)
6231 				goto skip;
6232 
6233 			btrfs_node_key_to_cpu(path->nodes[level], &key,
6234 					      path->slots[level]);
6235 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6236 			if (ret < 0)
6237 				goto skip;
6238 
6239 			wc->stage = UPDATE_BACKREF;
6240 			wc->shared_level = level - 1;
6241 		}
6242 	} else {
6243 		if (level == 1 &&
6244 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6245 			goto skip;
6246 	}
6247 
6248 	if (!btrfs_buffer_uptodate(next, generation)) {
6249 		btrfs_tree_unlock(next);
6250 		free_extent_buffer(next);
6251 		next = NULL;
6252 		*lookup_info = 1;
6253 	}
6254 
6255 	if (!next) {
6256 		if (reada && level == 1)
6257 			reada_walk_down(trans, root, wc, path);
6258 		next = read_tree_block(root, bytenr, blocksize, generation);
6259 		if (!next)
6260 			return -EIO;
6261 		btrfs_tree_lock(next);
6262 		btrfs_set_lock_blocking(next);
6263 	}
6264 
6265 	level--;
6266 	BUG_ON(level != btrfs_header_level(next));
6267 	path->nodes[level] = next;
6268 	path->slots[level] = 0;
6269 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6270 	wc->level = level;
6271 	if (wc->level == 1)
6272 		wc->reada_slot = 0;
6273 	return 0;
6274 skip:
6275 	wc->refs[level - 1] = 0;
6276 	wc->flags[level - 1] = 0;
6277 	if (wc->stage == DROP_REFERENCE) {
6278 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6279 			parent = path->nodes[level]->start;
6280 		} else {
6281 			BUG_ON(root->root_key.objectid !=
6282 			       btrfs_header_owner(path->nodes[level]));
6283 			parent = 0;
6284 		}
6285 
6286 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6287 					root->root_key.objectid, level - 1, 0);
6288 		BUG_ON(ret);
6289 	}
6290 	btrfs_tree_unlock(next);
6291 	free_extent_buffer(next);
6292 	*lookup_info = 1;
6293 	return 1;
6294 }
6295 
6296 /*
6297  * helper to process a tree block while walking up the tree.
6298  *
6299  * when wc->stage == DROP_REFERENCE, this function drops
6300  * reference count on the block.
6301  *
6302  * when wc->stage == UPDATE_BACKREF, this function changes
6303  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6304  * to UPDATE_BACKREF previously while processing the block.
6305  *
6306  * NOTE: return value 1 means we should stop walking up.
6307  */
6308 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6309 				 struct btrfs_root *root,
6310 				 struct btrfs_path *path,
6311 				 struct walk_control *wc)
6312 {
6313 	int ret;
6314 	int level = wc->level;
6315 	struct extent_buffer *eb = path->nodes[level];
6316 	u64 parent = 0;
6317 
6318 	if (wc->stage == UPDATE_BACKREF) {
6319 		BUG_ON(wc->shared_level < level);
6320 		if (level < wc->shared_level)
6321 			goto out;
6322 
6323 		ret = find_next_key(path, level + 1, &wc->update_progress);
6324 		if (ret > 0)
6325 			wc->update_ref = 0;
6326 
6327 		wc->stage = DROP_REFERENCE;
6328 		wc->shared_level = -1;
6329 		path->slots[level] = 0;
6330 
6331 		/*
6332 		 * check reference count again if the block isn't locked.
6333 		 * we should start walking down the tree again if reference
6334 		 * count is one.
6335 		 */
6336 		if (!path->locks[level]) {
6337 			BUG_ON(level == 0);
6338 			btrfs_tree_lock(eb);
6339 			btrfs_set_lock_blocking(eb);
6340 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6341 
6342 			ret = btrfs_lookup_extent_info(trans, root,
6343 						       eb->start, eb->len,
6344 						       &wc->refs[level],
6345 						       &wc->flags[level]);
6346 			BUG_ON(ret);
6347 			BUG_ON(wc->refs[level] == 0);
6348 			if (wc->refs[level] == 1) {
6349 				btrfs_tree_unlock_rw(eb, path->locks[level]);
6350 				return 1;
6351 			}
6352 		}
6353 	}
6354 
6355 	/* wc->stage == DROP_REFERENCE */
6356 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6357 
6358 	if (wc->refs[level] == 1) {
6359 		if (level == 0) {
6360 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6361 				ret = btrfs_dec_ref(trans, root, eb, 1);
6362 			else
6363 				ret = btrfs_dec_ref(trans, root, eb, 0);
6364 			BUG_ON(ret);
6365 		}
6366 		/* make block locked assertion in clean_tree_block happy */
6367 		if (!path->locks[level] &&
6368 		    btrfs_header_generation(eb) == trans->transid) {
6369 			btrfs_tree_lock(eb);
6370 			btrfs_set_lock_blocking(eb);
6371 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6372 		}
6373 		clean_tree_block(trans, root, eb);
6374 	}
6375 
6376 	if (eb == root->node) {
6377 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6378 			parent = eb->start;
6379 		else
6380 			BUG_ON(root->root_key.objectid !=
6381 			       btrfs_header_owner(eb));
6382 	} else {
6383 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6384 			parent = path->nodes[level + 1]->start;
6385 		else
6386 			BUG_ON(root->root_key.objectid !=
6387 			       btrfs_header_owner(path->nodes[level + 1]));
6388 	}
6389 
6390 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6391 out:
6392 	wc->refs[level] = 0;
6393 	wc->flags[level] = 0;
6394 	return 0;
6395 }
6396 
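/*
 * walk down the tree, processing blocks with walk_down_proc/do_walk_down
 * until we hit a leaf, a block we should not descend into, or run out of
 * slots at the current level.
 */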
6397 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6398 				   struct btrfs_root *root,
6399 				   struct btrfs_path *path,
6400 				   struct walk_control *wc)
6401 {
6402 	int level = wc->level;
6403 	int lookup_info = 1;
6404 	int ret;
6405 
6406 	while (level >= 0) {
6407 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
6408 		if (ret > 0)
6409 			break;
6410 
6411 		if (level == 0)
6412 			break;
6413 
6414 		if (path->slots[level] >=
6415 		    btrfs_header_nritems(path->nodes[level]))
6416 			break;
6417 
6418 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
6419 		if (ret > 0) {
6420 			path->slots[level]++;
6421 			continue;
6422 		} else if (ret < 0)
6423 			return ret;
6424 		level = wc->level;
6425 	}
6426 	return 0;
6427 }
6428 
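/*
 * walk back up the tree, advancing to the next slot where possible and
 * otherwise processing the node with walk_up_proc and dropping its lock
 * and reference.  Returns 1 once everything below max_level is done.
 */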
6429 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6430 				 struct btrfs_root *root,
6431 				 struct btrfs_path *path,
6432 				 struct walk_control *wc, int max_level)
6433 {
6434 	int level = wc->level;
6435 	int ret;
6436 
6437 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6438 	while (level < max_level && path->nodes[level]) {
6439 		wc->level = level;
6440 		if (path->slots[level] + 1 <
6441 		    btrfs_header_nritems(path->nodes[level])) {
6442 			path->slots[level]++;
6443 			return 0;
6444 		} else {
6445 			ret = walk_up_proc(trans, root, path, wc);
6446 			if (ret > 0)
6447 				return 0;
6448 
6449 			if (path->locks[level]) {
6450 				btrfs_tree_unlock_rw(path->nodes[level],
6451 						     path->locks[level]);
6452 				path->locks[level] = 0;
6453 			}
6454 			free_extent_buffer(path->nodes[level]);
6455 			path->nodes[level] = NULL;
6456 			level++;
6457 		}
6458 	}
6459 	return 1;
6460 }
6461 
6462 /*
6463  * drop a subvolume tree.
6464  *
6465  * this function traverses the tree freeing any blocks that are only
6466  * referenced by the tree.
6467  *
6468  * when a shared tree block is found, this function decreases its
6469  * reference count by one. if update_ref is true, this function
6470  * also makes sure backrefs for the shared block and all lower level
6471  * blocks are properly updated.
6472  */
6473 void btrfs_drop_snapshot(struct btrfs_root *root,
6474 			 struct btrfs_block_rsv *block_rsv, int update_ref)
6475 {
6476 	struct btrfs_path *path;
6477 	struct btrfs_trans_handle *trans;
6478 	struct btrfs_root *tree_root = root->fs_info->tree_root;
6479 	struct btrfs_root_item *root_item = &root->root_item;
6480 	struct walk_control *wc;
6481 	struct btrfs_key key;
6482 	int err = 0;
6483 	int ret;
6484 	int level;
6485 
6486 	path = btrfs_alloc_path();
6487 	if (!path) {
6488 		err = -ENOMEM;
6489 		goto out;
6490 	}
6491 
6492 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6493 	if (!wc) {
6494 		btrfs_free_path(path);
6495 		err = -ENOMEM;
6496 		goto out;
6497 	}
6498 
6499 	trans = btrfs_start_transaction(tree_root, 0);
6500 	BUG_ON(IS_ERR(trans));
6501 
6502 	if (block_rsv)
6503 		trans->block_rsv = block_rsv;
6504 
6505 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6506 		level = btrfs_header_level(root->node);
6507 		path->nodes[level] = btrfs_lock_root_node(root);
6508 		btrfs_set_lock_blocking(path->nodes[level]);
6509 		path->slots[level] = 0;
6510 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6511 		memset(&wc->update_progress, 0,
6512 		       sizeof(wc->update_progress));
6513 	} else {
6514 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6515 		memcpy(&wc->update_progress, &key,
6516 		       sizeof(wc->update_progress));
6517 
6518 		level = root_item->drop_level;
6519 		BUG_ON(level == 0);
6520 		path->lowest_level = level;
6521 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6522 		path->lowest_level = 0;
6523 		if (ret < 0) {
6524 			err = ret;
6525 			goto out_free;
6526 		}
6527 		WARN_ON(ret > 0);
6528 
6529 		/*
6530 		 * unlock our path, this is safe because only this
6531 		 * function is allowed to delete this snapshot
6532 		 */
6533 		btrfs_unlock_up_safe(path, 0);
6534 
6535 		level = btrfs_header_level(root->node);
6536 		while (1) {
6537 			btrfs_tree_lock(path->nodes[level]);
6538 			btrfs_set_lock_blocking(path->nodes[level]);
6539 
6540 			ret = btrfs_lookup_extent_info(trans, root,
6541 						path->nodes[level]->start,
6542 						path->nodes[level]->len,
6543 						&wc->refs[level],
6544 						&wc->flags[level]);
6545 			BUG_ON(ret);
6546 			BUG_ON(wc->refs[level] == 0);
6547 
6548 			if (level == root_item->drop_level)
6549 				break;
6550 
6551 			btrfs_tree_unlock(path->nodes[level]);
6552 			WARN_ON(wc->refs[level] != 1);
6553 			level--;
6554 		}
6555 	}
6556 
6557 	wc->level = level;
6558 	wc->shared_level = -1;
6559 	wc->stage = DROP_REFERENCE;
6560 	wc->update_ref = update_ref;
6561 	wc->keep_locks = 0;
6562 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6563 
6564 	while (1) {
6565 		ret = walk_down_tree(trans, root, path, wc);
6566 		if (ret < 0) {
6567 			err = ret;
6568 			break;
6569 		}
6570 
6571 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6572 		if (ret < 0) {
6573 			err = ret;
6574 			break;
6575 		}
6576 
6577 		if (ret > 0) {
6578 			BUG_ON(wc->stage != DROP_REFERENCE);
6579 			break;
6580 		}
6581 
6582 		if (wc->stage == DROP_REFERENCE) {
6583 			level = wc->level;
6584 			btrfs_node_key(path->nodes[level],
6585 				       &root_item->drop_progress,
6586 				       path->slots[level]);
6587 			root_item->drop_level = level;
6588 		}
6589 
6590 		BUG_ON(wc->level == 0);
6591 		if (btrfs_should_end_transaction(trans, tree_root)) {
6592 			ret = btrfs_update_root(trans, tree_root,
6593 						&root->root_key,
6594 						root_item);
6595 			BUG_ON(ret);
6596 
6597 			btrfs_end_transaction_throttle(trans, tree_root);
6598 			trans = btrfs_start_transaction(tree_root, 0);
6599 			BUG_ON(IS_ERR(trans));
6600 			if (block_rsv)
6601 				trans->block_rsv = block_rsv;
6602 		}
6603 	}
6604 	btrfs_release_path(path);
6605 	BUG_ON(err);
6606 
6607 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
6608 	BUG_ON(ret);
6609 
6610 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6611 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6612 					   NULL, NULL);
6613 		BUG_ON(ret < 0);
6614 		if (ret > 0) {
6615 			/* if we fail to delete the orphan item this time
6616 			 * around, it'll get picked up the next time.
6617 			 *
6618 			 * The most common failure here is just -ENOENT.
6619 			 */
6620 			btrfs_del_orphan_item(trans, tree_root,
6621 					      root->root_key.objectid);
6622 		}
6623 	}
6624 
6625 	if (root->in_radix) {
6626 		btrfs_free_fs_root(tree_root->fs_info, root);
6627 	} else {
6628 		free_extent_buffer(root->node);
6629 		free_extent_buffer(root->commit_root);
6630 		kfree(root);
6631 	}
6632 out_free:
6633 	btrfs_end_transaction_throttle(trans, tree_root);
6634 	kfree(wc);
6635 	btrfs_free_path(path);
6636 out:
6637 	if (err)
6638 		btrfs_std_error(root->fs_info, err);
6639 	return;
6640 }
6641 
6642 /*
6643  * drop subtree rooted at tree block 'node'.
6644  *
6645  * NOTE: this function will unlock and release tree block 'node'
6646  */
6647 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6648 			struct btrfs_root *root,
6649 			struct extent_buffer *node,
6650 			struct extent_buffer *parent)
6651 {
6652 	struct btrfs_path *path;
6653 	struct walk_control *wc;
6654 	int level;
6655 	int parent_level;
6656 	int ret = 0;
6657 	int wret;
6658 
6659 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6660 
6661 	path = btrfs_alloc_path();
6662 	if (!path)
6663 		return -ENOMEM;
6664 
6665 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6666 	if (!wc) {
6667 		btrfs_free_path(path);
6668 		return -ENOMEM;
6669 	}
6670 
6671 	btrfs_assert_tree_locked(parent);
6672 	parent_level = btrfs_header_level(parent);
6673 	extent_buffer_get(parent);
6674 	path->nodes[parent_level] = parent;
6675 	path->slots[parent_level] = btrfs_header_nritems(parent);
6676 
6677 	btrfs_assert_tree_locked(node);
6678 	level = btrfs_header_level(node);
6679 	path->nodes[level] = node;
6680 	path->slots[level] = 0;
6681 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6682 
6683 	wc->refs[parent_level] = 1;
6684 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6685 	wc->level = level;
6686 	wc->shared_level = -1;
6687 	wc->stage = DROP_REFERENCE;
6688 	wc->update_ref = 0;
6689 	wc->keep_locks = 1;
6690 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6691 
6692 	while (1) {
6693 		wret = walk_down_tree(trans, root, path, wc);
6694 		if (wret < 0) {
6695 			ret = wret;
6696 			break;
6697 		}
6698 
6699 		wret = walk_up_tree(trans, root, path, wc, parent_level);
6700 		if (wret < 0)
6701 			ret = wret;
6702 		if (wret != 0)
6703 			break;
6704 	}
6705 
6706 	kfree(wc);
6707 	btrfs_free_path(path);
6708 	return ret;
6709 }
6710 
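/*
 * compute the block group profile to use when relocating chunks: on a
 * single device filesystem mirroring degrades to DUP and raid0 to single,
 * while on a multi device filesystem DUP is upgraded to raid1 and single
 * to raid0.  Missing devices are counted so degraded RAID levels are kept.
 */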
6711 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6712 {
6713 	u64 num_devices;
6714 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6715 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6716 
6717 	/*
6718 	 * we add in the count of missing devices because we want
6719 	 * to make sure that any RAID levels on a degraded FS
6720 	 * continue to be honored.
6721 	 */
6722 	num_devices = root->fs_info->fs_devices->rw_devices +
6723 		root->fs_info->fs_devices->missing_devices;
6724 
6725 	if (num_devices == 1) {
6726 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6727 		stripped = flags & ~stripped;
6728 
6729 		/* turn raid0 into single device chunks */
6730 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
6731 			return stripped;
6732 
6733 		/* turn mirroring into duplication */
6734 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6735 			     BTRFS_BLOCK_GROUP_RAID10))
6736 			return stripped | BTRFS_BLOCK_GROUP_DUP;
6737 		return flags;
6738 	} else {
6739 		/* they already had raid on here, just return */
6740 		if (flags & stripped)
6741 			return flags;
6742 
6743 		stripped |= BTRFS_BLOCK_GROUP_DUP;
6744 		stripped = flags & ~stripped;
6745 
6746 		/* switch duplicated blocks with raid1 */
6747 		if (flags & BTRFS_BLOCK_GROUP_DUP)
6748 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
6749 
6750 		/* turn single device chunks into raid0 */
6751 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
6752 	}
6753 	return flags;
6754 }
6755 
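/*
 * try to mark a block group read-only.  Fails with -ENOSPC unless enough
 * allocatable space would remain in the space_info (at least 1MB for
 * metadata and system groups, unless force is set).
 */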
6756 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6757 {
6758 	struct btrfs_space_info *sinfo = cache->space_info;
6759 	u64 num_bytes;
6760 	u64 min_allocable_bytes;
6761 	int ret = -ENOSPC;
6762 
6763 
6764 	/*
6765 	 * We need some metadata space and system metadata space for
6766 	 * allocating chunks in some corner cases, so keep a minimum amount
6767 	 * of allocatable space unless we are forced to set it read-only.
6768 	 */
6769 	if ((sinfo->flags &
6770 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6771 	    !force)
6772 		min_allocable_bytes = 1 * 1024 * 1024;
6773 	else
6774 		min_allocable_bytes = 0;
6775 
6776 	spin_lock(&sinfo->lock);
6777 	spin_lock(&cache->lock);
6778 
6779 	if (cache->ro) {
6780 		ret = 0;
6781 		goto out;
6782 	}
6783 
6784 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6785 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
6786 
6787 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6788 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6789 	    min_allocable_bytes <= sinfo->total_bytes) {
6790 		sinfo->bytes_readonly += num_bytes;
6791 		cache->ro = 1;
6792 		ret = 0;
6793 	}
6794 out:
6795 	spin_unlock(&cache->lock);
6796 	spin_unlock(&sinfo->lock);
6797 	return ret;
6798 }
6799 
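/*
 * make a block group read-only.  If the space_info can't spare the group's
 * free space, a new chunk is allocated and the attempt is retried.
 */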
6800 int btrfs_set_block_group_ro(struct btrfs_root *root,
6801 			     struct btrfs_block_group_cache *cache)
6802 
6803 {
6804 	struct btrfs_trans_handle *trans;
6805 	u64 alloc_flags;
6806 	int ret;
6807 
6808 	BUG_ON(cache->ro);
6809 
6810 	trans = btrfs_join_transaction(root);
6811 	BUG_ON(IS_ERR(trans));
6812 
6813 	alloc_flags = update_block_group_flags(root, cache->flags);
6814 	if (alloc_flags != cache->flags)
6815 		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6816 			       CHUNK_ALLOC_FORCE);
6817 
6818 	ret = set_block_group_ro(cache, 0);
6819 	if (!ret)
6820 		goto out;
6821 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
6822 	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6823 			     CHUNK_ALLOC_FORCE);
6824 	if (ret < 0)
6825 		goto out;
6826 	ret = set_block_group_ro(cache, 0);
6827 out:
6828 	btrfs_end_transaction(trans, root);
6829 	return ret;
6830 }
6831 
6832 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6833 			    struct btrfs_root *root, u64 type)
6834 {
6835 	u64 alloc_flags = get_alloc_profile(root, type);
6836 	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6837 			      CHUNK_ALLOC_FORCE);
6838 }
6839 
6840 /*
6841  * helper to account the unused space of all the readonly block groups in the
6842  * list. takes mirrors into account.
6843  */
6844 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6845 {
6846 	struct btrfs_block_group_cache *block_group;
6847 	u64 free_bytes = 0;
6848 	int factor;
6849 
6850 	list_for_each_entry(block_group, groups_list, list) {
6851 		spin_lock(&block_group->lock);
6852 
6853 		if (!block_group->ro) {
6854 			spin_unlock(&block_group->lock);
6855 			continue;
6856 		}
6857 
6858 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6859 					  BTRFS_BLOCK_GROUP_RAID10 |
6860 					  BTRFS_BLOCK_GROUP_DUP))
6861 			factor = 2;
6862 		else
6863 			factor = 1;
6864 
6865 		free_bytes += (block_group->key.offset -
6866 			       btrfs_block_group_used(&block_group->item)) *
6867 			       factor;
6868 
6869 		spin_unlock(&block_group->lock);
6870 	}
6871 
6872 	return free_bytes;
6873 }
6874 
6875 /*
6876  * helper to account the unused space of all the readonly block groups in the
6877  * space_info. takes mirrors into account.
6878  */
6879 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6880 {
6881 	int i;
6882 	u64 free_bytes = 0;
6883 
6884 	spin_lock(&sinfo->lock);
6885 
6886 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6887 		if (!list_empty(&sinfo->block_groups[i]))
6888 			free_bytes += __btrfs_get_ro_block_group_free_space(
6889 						&sinfo->block_groups[i]);
6890 
6891 	spin_unlock(&sinfo->lock);
6892 
6893 	return free_bytes;
6894 }
6895 
6896 int btrfs_set_block_group_rw(struct btrfs_root *root,
6897 			      struct btrfs_block_group_cache *cache)
6898 {
6899 	struct btrfs_space_info *sinfo = cache->space_info;
6900 	u64 num_bytes;
6901 
6902 	BUG_ON(!cache->ro);
6903 
6904 	spin_lock(&sinfo->lock);
6905 	spin_lock(&cache->lock);
6906 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6907 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
6908 	sinfo->bytes_readonly -= num_bytes;
6909 	cache->ro = 0;
6910 	spin_unlock(&cache->lock);
6911 	spin_unlock(&sinfo->lock);
6912 	return 0;
6913 }
6914 
6915 /*
6916  * checks to see if it's even possible to relocate this block group.
6917  *
6918  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
6919  * ok to go ahead and try.
6920  */
6921 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6922 {
6923 	struct btrfs_block_group_cache *block_group;
6924 	struct btrfs_space_info *space_info;
6925 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6926 	struct btrfs_device *device;
6927 	u64 min_free;
6928 	u64 dev_min = 1;
6929 	u64 dev_nr = 0;
6930 	int index;
6931 	int full = 0;
6932 	int ret = 0;
6933 
6934 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6935 
6936 	/* odd, couldn't find the block group, leave it alone */
6937 	if (!block_group)
6938 		return -1;
6939 
6940 	min_free = btrfs_block_group_used(&block_group->item);
6941 
6942 	/* no bytes used, we're good */
6943 	if (!min_free)
6944 		goto out;
6945 
6946 	space_info = block_group->space_info;
6947 	spin_lock(&space_info->lock);
6948 
6949 	full = space_info->full;
6950 
6951 	/*
6952 	 * if this is the last block group we have in this space, we can't
6953 	 * relocate it unless we're able to allocate a new chunk below.
6954 	 *
6955 	 * Otherwise, we need to make sure we have room in the space to handle
6956 	 * all of the extents from this block group.  If we can, we're good
6957 	 */
6958 	if ((space_info->total_bytes != block_group->key.offset) &&
6959 	    (space_info->bytes_used + space_info->bytes_reserved +
6960 	     space_info->bytes_pinned + space_info->bytes_readonly +
6961 	     min_free < space_info->total_bytes)) {
6962 		spin_unlock(&space_info->lock);
6963 		goto out;
6964 	}
6965 	spin_unlock(&space_info->lock);
6966 
6967 	/*
6968 	 * ok we don't have enough space, but maybe we have free space on our
6969 	 * devices to allocate new chunks for relocation, so loop through our
6970 	 * alloc devices and guess if we have enough space.  However, if we
6971 	 * were marked as full, then we know there aren't enough chunks, and we
6972 	 * can just return.
6973 	 */
6974 	ret = -1;
6975 	if (full)
6976 		goto out;
6977 
6978 	/*
6979 	 * index:
6980 	 *      0: raid10
6981 	 *      1: raid1
6982 	 *      2: dup
6983 	 *      3: raid0
6984 	 *      4: single
6985 	 */
6986 	index = get_block_group_index(block_group);
6987 	if (index == 0) {
6988 		dev_min = 4;
6989 		/* Divide by 2 */
6990 		min_free >>= 1;
6991 	} else if (index == 1) {
6992 		dev_min = 2;
6993 	} else if (index == 2) {
6994 		/* Multiply by 2 */
6995 		min_free <<= 1;
6996 	} else if (index == 3) {
6997 		dev_min = fs_devices->rw_devices;
6998 		do_div(min_free, dev_min);
6999 	}
7000 
7001 	mutex_lock(&root->fs_info->chunk_mutex);
7002 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7003 		u64 dev_offset;
7004 
7005 		/*
7006 		 * check to make sure we can actually find a chunk with enough
7007 		 * space to fit our block group in.
7008 		 */
7009 		if (device->total_bytes > device->bytes_used + min_free) {
7010 			ret = find_free_dev_extent(NULL, device, min_free,
7011 						   &dev_offset, NULL);
7012 			if (!ret)
7013 				dev_nr++;
7014 
7015 			if (dev_nr >= dev_min)
7016 				break;
7017 
7018 			ret = -1;
7019 		}
7020 	}
7021 	mutex_unlock(&root->fs_info->chunk_mutex);
7022 out:
7023 	btrfs_put_block_group(block_group);
7024 	return ret;
7025 }
7026 
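/*
 * position the path at the first BLOCK_GROUP_ITEM whose objectid is at or
 * beyond key->objectid.  Returns 0 when one is found, > 0 when there are
 * no more block group items, < 0 on error.
 */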
7027 static int find_first_block_group(struct btrfs_root *root,
7028 		struct btrfs_path *path, struct btrfs_key *key)
7029 {
7030 	int ret = 0;
7031 	struct btrfs_key found_key;
7032 	struct extent_buffer *leaf;
7033 	int slot;
7034 
7035 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7036 	if (ret < 0)
7037 		goto out;
7038 
7039 	while (1) {
7040 		slot = path->slots[0];
7041 		leaf = path->nodes[0];
7042 		if (slot >= btrfs_header_nritems(leaf)) {
7043 			ret = btrfs_next_leaf(root, path);
7044 			if (ret == 0)
7045 				continue;
7046 			if (ret < 0)
7047 				goto out;
7048 			break;
7049 		}
7050 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7051 
7052 		if (found_key.objectid >= key->objectid &&
7053 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7054 			ret = 0;
7055 			goto out;
7056 		}
7057 		path->slots[0]++;
7058 	}
7059 out:
7060 	return ret;
7061 }
7062 
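/*
 * drop the cached free space inode reference (iref) that each block group
 * may hold, so the inodes can actually be released.
 */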
7063 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7064 {
7065 	struct btrfs_block_group_cache *block_group;
7066 	u64 last = 0;
7067 
7068 	while (1) {
7069 		struct inode *inode;
7070 
7071 		block_group = btrfs_lookup_first_block_group(info, last);
7072 		while (block_group) {
7073 			spin_lock(&block_group->lock);
7074 			if (block_group->iref)
7075 				break;
7076 			spin_unlock(&block_group->lock);
7077 			block_group = next_block_group(info->tree_root,
7078 						       block_group);
7079 		}
7080 		if (!block_group) {
7081 			if (last == 0)
7082 				break;
7083 			last = 0;
7084 			continue;
7085 		}
7086 
7087 		inode = block_group->inode;
7088 		block_group->iref = 0;
7089 		block_group->inode = NULL;
7090 		spin_unlock(&block_group->lock);
7091 		iput(inode);
7092 		last = block_group->key.objectid + block_group->key.offset;
7093 		btrfs_put_block_group(block_group);
7094 	}
7095 }
7096 
7097 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7098 {
7099 	struct btrfs_block_group_cache *block_group;
7100 	struct btrfs_space_info *space_info;
7101 	struct btrfs_caching_control *caching_ctl;
7102 	struct rb_node *n;
7103 
7104 	down_write(&info->extent_commit_sem);
7105 	while (!list_empty(&info->caching_block_groups)) {
7106 		caching_ctl = list_entry(info->caching_block_groups.next,
7107 					 struct btrfs_caching_control, list);
7108 		list_del(&caching_ctl->list);
7109 		put_caching_control(caching_ctl);
7110 	}
7111 	up_write(&info->extent_commit_sem);
7112 
7113 	spin_lock(&info->block_group_cache_lock);
7114 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7115 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7116 				       cache_node);
7117 		rb_erase(&block_group->cache_node,
7118 			 &info->block_group_cache_tree);
7119 		spin_unlock(&info->block_group_cache_lock);
7120 
7121 		down_write(&block_group->space_info->groups_sem);
7122 		list_del(&block_group->list);
7123 		up_write(&block_group->space_info->groups_sem);
7124 
7125 		if (block_group->cached == BTRFS_CACHE_STARTED)
7126 			wait_block_group_cache_done(block_group);
7127 
7128 		/*
7129 		 * We haven't cached this block group, which means we could
7130 		 * possibly have excluded extents on this block group.
7131 		 */
7132 		if (block_group->cached == BTRFS_CACHE_NO)
7133 			free_excluded_extents(info->extent_root, block_group);
7134 
7135 		btrfs_remove_free_space_cache(block_group);
7136 		btrfs_put_block_group(block_group);
7137 
7138 		spin_lock(&info->block_group_cache_lock);
7139 	}
7140 	spin_unlock(&info->block_group_cache_lock);
7141 
7142 	/* now that all the block groups are freed, go through and
7143 	 * free all the space_info structs.  This is only called during
7144 	 * the final stages of unmount, and so we know nobody is
7145 	 * using them.  We call synchronize_rcu() once before we start,
7146 	 * just to be on the safe side.
7147 	 */
7148 	synchronize_rcu();
7149 
7150 	release_global_block_rsv(info);
7151 
7152 	while (!list_empty(&info->space_info)) {
7153 		space_info = list_entry(info->space_info.next,
7154 					struct btrfs_space_info,
7155 					list);
7156 		if (space_info->bytes_pinned > 0 ||
7157 		    space_info->bytes_reserved > 0 ||
7158 		    space_info->bytes_may_use > 0) {
7159 			WARN_ON(1);
7160 			dump_space_info(space_info, 0, 0);
7161 		}
7162 		list_del(&space_info->list);
7163 		kfree(space_info);
7164 	}
7165 	return 0;
7166 }
7167 
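/*
 * Add a block group to the per-profile list of its space_info.  The list
 * index is derived from the group's RAID/DUP flags by
 * get_block_group_index().
 */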
7168 static void __link_block_group(struct btrfs_space_info *space_info,
7169 			       struct btrfs_block_group_cache *cache)
7170 {
7171 	int index = get_block_group_index(cache);
7172 
7173 	down_write(&space_info->groups_sem);
7174 	list_add_tail(&cache->list, &space_info->block_groups[index]);
7175 	up_write(&space_info->groups_sem);
7176 }
7177 
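/*
 * Called at mount time: walk the extent tree and build an in-memory
 * btrfs_block_group_cache for every block group item found, hooking each
 * one up to its space_info.
 */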
7178 int btrfs_read_block_groups(struct btrfs_root *root)
7179 {
7180 	struct btrfs_path *path;
7181 	int ret;
7182 	struct btrfs_block_group_cache *cache;
7183 	struct btrfs_fs_info *info = root->fs_info;
7184 	struct btrfs_space_info *space_info;
7185 	struct btrfs_key key;
7186 	struct btrfs_key found_key;
7187 	struct extent_buffer *leaf;
7188 	int need_clear = 0;
7189 	u64 cache_gen;
7190 
7191 	root = info->extent_root;
7192 	key.objectid = 0;
7193 	key.offset = 0;
7194 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7195 	path = btrfs_alloc_path();
7196 	if (!path)
7197 		return -ENOMEM;
7198 	path->reada = 1;
7199 
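	/*
	 * Only trust the on-disk free space cache if its generation matches
	 * the superblock; otherwise, or if the user mounted with clear_cache,
	 * mark it to be rebuilt.
	 */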
7200 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7201 	if (btrfs_test_opt(root, SPACE_CACHE) &&
7202 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7203 		need_clear = 1;
7204 	if (btrfs_test_opt(root, CLEAR_CACHE))
7205 		need_clear = 1;
7206 
7207 	while (1) {
7208 		ret = find_first_block_group(root, path, &key);
7209 		if (ret > 0)
7210 			break;
7211 		if (ret != 0)
7212 			goto error;
7213 		leaf = path->nodes[0];
7214 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7215 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
7216 		if (!cache) {
7217 			ret = -ENOMEM;
7218 			goto error;
7219 		}
7220 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7221 						GFP_NOFS);
7222 		if (!cache->free_space_ctl) {
7223 			kfree(cache);
7224 			ret = -ENOMEM;
7225 			goto error;
7226 		}
7227 
7228 		atomic_set(&cache->count, 1);
7229 		spin_lock_init(&cache->lock);
7230 		cache->fs_info = info;
7231 		INIT_LIST_HEAD(&cache->list);
7232 		INIT_LIST_HEAD(&cache->cluster_list);
7233 
7234 		if (need_clear)
7235 			cache->disk_cache_state = BTRFS_DC_CLEAR;
7236 
7237 		read_extent_buffer(leaf, &cache->item,
7238 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
7239 				   sizeof(cache->item));
7240 		memcpy(&cache->key, &found_key, sizeof(found_key));
7241 
7242 		key.objectid = found_key.objectid + found_key.offset;
7243 		btrfs_release_path(path);
7244 		cache->flags = btrfs_block_group_flags(&cache->item);
7245 		cache->sectorsize = root->sectorsize;
7246 
7247 		btrfs_init_free_space_ctl(cache);
7248 
7249 		/*
7250 		 * We need to exclude the super stripes now so that the space
7251 		 * info has super bytes accounted for, otherwise we'll think
7252 		 * we have more space than we actually do.
7253 		 */
7254 		exclude_super_stripes(root, cache);
7255 
7256 		/*
7257 		 * Check for two cases: either we are full, and therefore
7258 		 * don't need to bother with the caching work since we won't
7259 		 * find any space, or we are empty, and we can just add all
7260 		 * the space in and be done with it.  This saves us a lot of
7261 		 * time, particularly in the full case.
7262 		 */
7263 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7264 			cache->last_byte_to_unpin = (u64)-1;
7265 			cache->cached = BTRFS_CACHE_FINISHED;
7266 			free_excluded_extents(root, cache);
7267 		} else if (btrfs_block_group_used(&cache->item) == 0) {
7268 			cache->last_byte_to_unpin = (u64)-1;
7269 			cache->cached = BTRFS_CACHE_FINISHED;
7270 			add_new_free_space(cache, root->fs_info,
7271 					   found_key.objectid,
7272 					   found_key.objectid +
7273 					   found_key.offset);
7274 			free_excluded_extents(root, cache);
7275 		}
7276 
7277 		ret = update_space_info(info, cache->flags, found_key.offset,
7278 					btrfs_block_group_used(&cache->item),
7279 					&space_info);
7280 		BUG_ON(ret);
7281 		cache->space_info = space_info;
7282 		spin_lock(&cache->space_info->lock);
7283 		cache->space_info->bytes_readonly += cache->bytes_super;
7284 		spin_unlock(&cache->space_info->lock);
7285 
7286 		__link_block_group(space_info, cache);
7287 
7288 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
7289 		BUG_ON(ret);
7290 
7291 		set_avail_alloc_bits(root->fs_info, cache->flags);
7292 		if (btrfs_chunk_readonly(root, cache->key.objectid))
7293 			set_block_group_ro(cache, 1);
7294 	}
7295 
7296 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7297 		if (!(get_alloc_profile(root, space_info->flags) &
7298 		      (BTRFS_BLOCK_GROUP_RAID10 |
7299 		       BTRFS_BLOCK_GROUP_RAID1 |
7300 		       BTRFS_BLOCK_GROUP_DUP)))
7301 			continue;
7302 		/*
7303 		 * Avoid allocating from un-mirrored block groups if there are
7304 		 * mirrored block groups.
7305 		 */
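		/* index 3 is RAID0, index 4 is single; see get_block_group_index() */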
7306 		list_for_each_entry(cache, &space_info->block_groups[3], list)
7307 			set_block_group_ro(cache, 1);
7308 		list_for_each_entry(cache, &space_info->block_groups[4], list)
7309 			set_block_group_ro(cache, 1);
7310 	}
7311 
7312 	init_global_block_rsv(info);
7313 	ret = 0;
7314 error:
7315 	btrfs_free_path(path);
7316 	return ret;
7317 }
7318 
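/*
 * Create the in-memory and on-disk state for a brand new block group
 * backing the chunk at chunk_offset.  The group starts out fully cached,
 * since its free space is simply [chunk_offset, chunk_offset + size)
 * minus the super stripes.
 */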
7319 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7320 			   struct btrfs_root *root, u64 bytes_used,
7321 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
7322 			   u64 size)
7323 {
7324 	int ret;
7325 	struct btrfs_root *extent_root;
7326 	struct btrfs_block_group_cache *cache;
7327 
7328 	extent_root = root->fs_info->extent_root;
7329 
7330 	root->fs_info->last_trans_log_full_commit = trans->transid;
7331 
7332 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
7333 	if (!cache)
7334 		return -ENOMEM;
7335 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7336 					GFP_NOFS);
7337 	if (!cache->free_space_ctl) {
7338 		kfree(cache);
7339 		return -ENOMEM;
7340 	}
7341 
7342 	cache->key.objectid = chunk_offset;
7343 	cache->key.offset = size;
7344 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7345 	cache->sectorsize = root->sectorsize;
7346 	cache->fs_info = root->fs_info;
7347 
7348 	atomic_set(&cache->count, 1);
7349 	spin_lock_init(&cache->lock);
7350 	INIT_LIST_HEAD(&cache->list);
7351 	INIT_LIST_HEAD(&cache->cluster_list);
7352 
7353 	btrfs_init_free_space_ctl(cache);
7354 
7355 	btrfs_set_block_group_used(&cache->item, bytes_used);
7356 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7357 	cache->flags = type;
7358 	btrfs_set_block_group_flags(&cache->item, type);
7359 
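	/*
	 * A new block group needs no caching pass: mark it finished and
	 * populate the free space directly, keeping the super stripes
	 * excluded while we do so.
	 */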
7360 	cache->last_byte_to_unpin = (u64)-1;
7361 	cache->cached = BTRFS_CACHE_FINISHED;
7362 	exclude_super_stripes(root, cache);
7363 
7364 	add_new_free_space(cache, root->fs_info, chunk_offset,
7365 			   chunk_offset + size);
7366 
7367 	free_excluded_extents(root, cache);
7368 
7369 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7370 				&cache->space_info);
7371 	BUG_ON(ret);
7372 
7373 	spin_lock(&cache->space_info->lock);
7374 	cache->space_info->bytes_readonly += cache->bytes_super;
7375 	spin_unlock(&cache->space_info->lock);
7376 
7377 	__link_block_group(cache->space_info, cache);
7378 
7379 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
7380 	BUG_ON(ret);
7381 
7382 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7383 				sizeof(cache->item));
7384 	BUG_ON(ret);
7385 
7386 	set_avail_alloc_bits(extent_root->fs_info, type);
7387 
7388 	return 0;
7389 }
7390 
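/*
 * Remove an empty, read-only block group: drop its free space cache inode
 * and item, pull it out of the allocation clusters, its space_info and the
 * block group cache tree, and finally delete its item from the extent tree.
 */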
7391 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7392 			     struct btrfs_root *root, u64 group_start)
7393 {
7394 	struct btrfs_path *path;
7395 	struct btrfs_block_group_cache *block_group;
7396 	struct btrfs_free_cluster *cluster;
7397 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7398 	struct btrfs_key key;
7399 	struct inode *inode;
7400 	int ret;
7401 	int factor;
7402 
7403 	root = root->fs_info->extent_root;
7404 
7405 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7406 	BUG_ON(!block_group);
7407 	BUG_ON(!block_group->ro);
7408 
7409 	/*
7410 	 * Free the reserved super bytes from this block group before
7411 	 * removing it.
7412 	 */
7413 	free_excluded_extents(root, block_group);
7414 
7415 	memcpy(&key, &block_group->key, sizeof(key));
7416 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7417 				  BTRFS_BLOCK_GROUP_RAID1 |
7418 				  BTRFS_BLOCK_GROUP_RAID10))
7419 		factor = 2;
7420 	else
7421 		factor = 1;
7422 
7423 	/* make sure this block group isn't part of an allocation cluster */
7424 	cluster = &root->fs_info->data_alloc_cluster;
7425 	spin_lock(&cluster->refill_lock);
7426 	btrfs_return_cluster_to_free_space(block_group, cluster);
7427 	spin_unlock(&cluster->refill_lock);
7428 
7429 	/*
7430 	 * make sure this block group isn't part of a metadata
7431 	 * allocation cluster
7432 	 */
7433 	cluster = &root->fs_info->meta_alloc_cluster;
7434 	spin_lock(&cluster->refill_lock);
7435 	btrfs_return_cluster_to_free_space(block_group, cluster);
7436 	spin_unlock(&cluster->refill_lock);
7437 
7438 	path = btrfs_alloc_path();
7439 	if (!path) {
7440 		ret = -ENOMEM;
7441 		goto out;
7442 	}
7443 
7444 	inode = lookup_free_space_inode(tree_root, block_group, path);
7445 	if (!IS_ERR(inode)) {
7446 		ret = btrfs_orphan_add(trans, inode);
7447 		BUG_ON(ret);
7448 		clear_nlink(inode);
7449 		/* One for the block group's ref */
7450 		spin_lock(&block_group->lock);
7451 		if (block_group->iref) {
7452 			block_group->iref = 0;
7453 			block_group->inode = NULL;
7454 			spin_unlock(&block_group->lock);
7455 			iput(inode);
7456 		} else {
7457 			spin_unlock(&block_group->lock);
7458 		}
7459 		/* One for our lookup ref */
7460 		btrfs_add_delayed_iput(inode);
7461 	}
7462 
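	/* delete the free space cache item for this block group, if any */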
7463 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7464 	key.offset = block_group->key.objectid;
7465 	key.type = 0;
7466 
7467 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7468 	if (ret < 0)
7469 		goto out;
7470 	if (ret > 0)
7471 		btrfs_release_path(path);
7472 	if (ret == 0) {
7473 		ret = btrfs_del_item(trans, tree_root, path);
7474 		if (ret)
7475 			goto out;
7476 		btrfs_release_path(path);
7477 	}
7478 
7479 	spin_lock(&root->fs_info->block_group_cache_lock);
7480 	rb_erase(&block_group->cache_node,
7481 		 &root->fs_info->block_group_cache_tree);
7482 	spin_unlock(&root->fs_info->block_group_cache_lock);
7483 
7484 	down_write(&block_group->space_info->groups_sem);
7485 	/*
7486 	 * we must use list_del_init so callers can check whether the block
7487 	 * group is still on the list after taking the semaphore
7488 	 */
7489 	list_del_init(&block_group->list);
7490 	up_write(&block_group->space_info->groups_sem);
7491 
7492 	if (block_group->cached == BTRFS_CACHE_STARTED)
7493 		wait_block_group_cache_done(block_group);
7494 
7495 	btrfs_remove_free_space_cache(block_group);
7496 
7497 	spin_lock(&block_group->space_info->lock);
7498 	block_group->space_info->total_bytes -= block_group->key.offset;
7499 	block_group->space_info->bytes_readonly -= block_group->key.offset;
7500 	block_group->space_info->disk_total -= block_group->key.offset * factor;
7501 	spin_unlock(&block_group->space_info->lock);
7502 
7503 	memcpy(&key, &block_group->key, sizeof(key));
7504 
7505 	btrfs_clear_space_info_full(root->fs_info);
7506 
7507 	btrfs_put_block_group(block_group);
7508 	btrfs_put_block_group(block_group);
7509 
7510 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7511 	if (ret > 0)
7512 		ret = -EIO;
7513 	if (ret < 0)
7514 		goto out;
7515 
7516 	ret = btrfs_del_item(trans, root, path);
7517 out:
7518 	btrfs_free_path(path);
7519 	return ret;
7520 }
7521 
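/*
 * Make sure space_info structs exist for the basic block group types:
 * SYSTEM plus either separate METADATA and DATA infos or a single mixed
 * one, depending on the MIXED_GROUPS incompat flag.
 */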
7522 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7523 {
7524 	struct btrfs_space_info *space_info;
7525 	struct btrfs_super_block *disk_super;
7526 	u64 features;
7527 	u64 flags;
7528 	int mixed = 0;
7529 	int ret;
7530 
7531 	disk_super = fs_info->super_copy;
7532 	if (!btrfs_super_root(disk_super))
7533 		return 1;
7534 
7535 	features = btrfs_super_incompat_flags(disk_super);
7536 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7537 		mixed = 1;
7538 
7539 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
7540 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7541 	if (ret)
7542 		goto out;
7543 
7544 	if (mixed) {
7545 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7546 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7547 	} else {
7548 		flags = BTRFS_BLOCK_GROUP_METADATA;
7549 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7550 		if (ret)
7551 			goto out;
7552 
7553 		flags = BTRFS_BLOCK_GROUP_DATA;
7554 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7555 	}
7556 out:
7557 	return ret;
7558 }
7559 
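/*
 * Error-path wrappers around unpin_extent_range() and
 * btrfs_discard_extent() for callers outside this file.
 */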
7560 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7561 {
7562 	return unpin_extent_range(root, start, end);
7563 }
7564 
7565 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7566 			       u64 num_bytes, u64 *actual_bytes)
7567 {
7568 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7569 }
7570 
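/*
 * Back end of the FITRIM ioctl: walk every block group overlapping
 * [range->start, range->start + range->len), discard its free space and
 * report the total number of bytes trimmed back through range->len.
 */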
7571 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7572 {
7573 	struct btrfs_fs_info *fs_info = root->fs_info;
7574 	struct btrfs_block_group_cache *cache = NULL;
7575 	u64 group_trimmed;
7576 	u64 start;
7577 	u64 end;
7578 	u64 trimmed = 0;
7579 	int ret = 0;
7580 
7581 	cache = btrfs_lookup_block_group(fs_info, range->start);
7582 
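	/*
	 * Trim each candidate block group over the part of it that overlaps
	 * the requested range, caching the group first if needed.
	 */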
7583 	while (cache) {
7584 		if (cache->key.objectid >= (range->start + range->len)) {
7585 			btrfs_put_block_group(cache);
7586 			break;
7587 		}
7588 
7589 		start = max(range->start, cache->key.objectid);
7590 		end = min(range->start + range->len,
7591 				cache->key.objectid + cache->key.offset);
7592 
7593 		if (end - start >= range->minlen) {
7594 			if (!block_group_cache_done(cache)) {
7595 				ret = cache_block_group(cache, NULL, root, 0);
7596 				if (!ret)
7597 					wait_block_group_cache_done(cache);
7598 			}
7599 			ret = btrfs_trim_block_group(cache,
7600 						     &group_trimmed,
7601 						     start,
7602 						     end,
7603 						     range->minlen);
7604 
7605 			trimmed += group_trimmed;
7606 			if (ret) {
7607 				btrfs_put_block_group(cache);
7608 				break;
7609 			}
7610 		}
7611 
7612 		cache = next_block_group(fs_info->tree_root, cache);
7613 	}
7614 
7615 	range->len = trimmed;
7616 	return ret;
7617 }
7618