xref: /linux/fs/btrfs/extent-tree.c (revision 680e6ffa15103ab610c0fc1241d2f98c801b13e2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 
32 #undef SCRAMBLE_DELAYED_REFS
33 
34 /*
35  * Control flags for do_chunk_alloc's force field.
36  *
37  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38  * if we really need one.
39  *
40  * CHUNK_ALLOC_LIMITED means to only try and allocate one
41  * if we have very few chunks already allocated.  This is
42  * used as part of the clustering code to help make sure
43  * we have a good pool of storage to cluster in, without
44  * filling the FS with empty chunks.
45  *
46  * CHUNK_ALLOC_FORCE means it must try to allocate one.
47  */
48 enum {
49 	CHUNK_ALLOC_NO_FORCE = 0,
50 	CHUNK_ALLOC_LIMITED = 1,
51 	CHUNK_ALLOC_FORCE = 2,
52 };
53 
54 /*
55  * Declare a helper function to detect underflow of various space info members
56  */
57 #define DECLARE_SPACE_INFO_UPDATE(name)					\
58 static inline void update_##name(struct btrfs_space_info *sinfo,	\
59 				 s64 bytes)				\
60 {									\
61 	if (bytes < 0 && sinfo->name < -bytes) {			\
62 		WARN_ON(1);						\
63 		sinfo->name = 0;					\
64 		return;							\
65 	}								\
66 	sinfo->name += bytes;						\
67 }
68 
69 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
70 DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
71 
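/*
 * Illustrative sketch (hypothetical helper, not part of the original code):
 * releasing a reservation just passes a negative delta to the helper
 * generated above; on underflow the helper warns and clamps the counter at
 * zero instead of wrapping.  Callers are assumed to hold sinfo->lock around
 * such updates.
 */
static void __maybe_unused example_release_may_use(struct btrfs_space_info *sinfo,
						   u64 num_bytes)
{
	update_bytes_may_use(sinfo, -(s64)num_bytes);
}
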
72 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
73 			       struct btrfs_delayed_ref_node *node, u64 parent,
74 			       u64 root_objectid, u64 owner_objectid,
75 			       u64 owner_offset, int refs_to_drop,
76 			       struct btrfs_delayed_extent_op *extra_op);
77 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
78 				    struct extent_buffer *leaf,
79 				    struct btrfs_extent_item *ei);
80 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
81 				      u64 parent, u64 root_objectid,
82 				      u64 flags, u64 owner, u64 offset,
83 				      struct btrfs_key *ins, int ref_mod);
84 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
85 				     struct btrfs_delayed_ref_node *node,
86 				     struct btrfs_delayed_extent_op *extent_op);
87 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
88 			  int force);
89 static int find_next_key(struct btrfs_path *path, int level,
90 			 struct btrfs_key *key);
91 static void dump_space_info(struct btrfs_fs_info *fs_info,
92 			    struct btrfs_space_info *info, u64 bytes,
93 			    int dump_block_groups);
94 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
95 			       u64 num_bytes);
96 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
97 				     struct btrfs_space_info *space_info,
98 				     u64 num_bytes);
99 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
100 				     struct btrfs_space_info *space_info,
101 				     u64 num_bytes);
102 
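/*
 * A minimal usage sketch of the CHUNK_ALLOC_* force levels defined above
 * (hypothetical helper, illustrative only): ask for a new metadata chunk
 * unconditionally.  do_chunk_alloc() is declared above and defined later in
 * this file.
 */
static int __maybe_unused example_force_metadata_chunk(struct btrfs_trans_handle *trans)
{
	/* CHUNK_ALLOC_FORCE: allocate even if we do not strictly need one */
	return do_chunk_alloc(trans, BTRFS_BLOCK_GROUP_METADATA,
			      CHUNK_ALLOC_FORCE);
}
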
103 static noinline int
104 block_group_cache_done(struct btrfs_block_group_cache *cache)
105 {
106 	smp_mb();
107 	return cache->cached == BTRFS_CACHE_FINISHED ||
108 		cache->cached == BTRFS_CACHE_ERROR;
109 }
110 
111 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
112 {
113 	return (cache->flags & bits) == bits;
114 }
115 
116 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
117 {
118 	atomic_inc(&cache->count);
119 }
120 
121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
122 {
123 	if (atomic_dec_and_test(&cache->count)) {
124 		WARN_ON(cache->pinned > 0);
125 		WARN_ON(cache->reserved > 0);
126 
127 		 * If not empty, someone is still holding the full_stripe_lock
128 		 * mutex, which can only be released by its caller, and that
129 		 * caller would hit a use-after-free when it tries to release
130 		 * the full stripe lock after the block group is freed here.
131 		 *
132 		 * There is no better way to resolve this, so just warn.
133 		 */
134 		 */
135 		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
136 		kfree(cache->free_space_ctl);
137 		kfree(cache);
138 	}
139 }
140 
141 /*
142  * this adds the block group to the fs_info rb tree for the block group
143  * cache
144  */
145 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
146 				struct btrfs_block_group_cache *block_group)
147 {
148 	struct rb_node **p;
149 	struct rb_node *parent = NULL;
150 	struct btrfs_block_group_cache *cache;
151 
152 	spin_lock(&info->block_group_cache_lock);
153 	p = &info->block_group_cache_tree.rb_node;
154 
155 	while (*p) {
156 		parent = *p;
157 		cache = rb_entry(parent, struct btrfs_block_group_cache,
158 				 cache_node);
159 		if (block_group->key.objectid < cache->key.objectid) {
160 			p = &(*p)->rb_left;
161 		} else if (block_group->key.objectid > cache->key.objectid) {
162 			p = &(*p)->rb_right;
163 		} else {
164 			spin_unlock(&info->block_group_cache_lock);
165 			return -EEXIST;
166 		}
167 	}
168 
169 	rb_link_node(&block_group->cache_node, parent, p);
170 	rb_insert_color(&block_group->cache_node,
171 			&info->block_group_cache_tree);
172 
173 	if (info->first_logical_byte > block_group->key.objectid)
174 		info->first_logical_byte = block_group->key.objectid;
175 
176 	spin_unlock(&info->block_group_cache_lock);
177 
178 	return 0;
179 }
180 
181 /*
182  * This will return the block group at or after bytenr if contains is 0, else
183  * it will return the block group that contains the bytenr
184  */
185 static struct btrfs_block_group_cache *
186 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
187 			      int contains)
188 {
189 	struct btrfs_block_group_cache *cache, *ret = NULL;
190 	struct rb_node *n;
191 	u64 end, start;
192 
193 	spin_lock(&info->block_group_cache_lock);
194 	n = info->block_group_cache_tree.rb_node;
195 
196 	while (n) {
197 		cache = rb_entry(n, struct btrfs_block_group_cache,
198 				 cache_node);
199 		end = cache->key.objectid + cache->key.offset - 1;
200 		start = cache->key.objectid;
201 
202 		if (bytenr < start) {
203 			if (!contains && (!ret || start < ret->key.objectid))
204 				ret = cache;
205 			n = n->rb_left;
206 		} else if (bytenr > start) {
207 			if (contains && bytenr <= end) {
208 				ret = cache;
209 				break;
210 			}
211 			n = n->rb_right;
212 		} else {
213 			ret = cache;
214 			break;
215 		}
216 	}
217 	if (ret) {
218 		btrfs_get_block_group(ret);
219 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
220 			info->first_logical_byte = ret->key.objectid;
221 	}
222 	spin_unlock(&info->block_group_cache_lock);
223 
224 	return ret;
225 }
226 
227 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
228 			       u64 start, u64 num_bytes)
229 {
230 	u64 end = start + num_bytes - 1;
231 	set_extent_bits(&fs_info->freed_extents[0],
232 			start, end, EXTENT_UPTODATE);
233 	set_extent_bits(&fs_info->freed_extents[1],
234 			start, end, EXTENT_UPTODATE);
235 	return 0;
236 }
237 
238 static void free_excluded_extents(struct btrfs_block_group_cache *cache)
239 {
240 	struct btrfs_fs_info *fs_info = cache->fs_info;
241 	u64 start, end;
242 
243 	start = cache->key.objectid;
244 	end = start + cache->key.offset - 1;
245 
246 	clear_extent_bits(&fs_info->freed_extents[0],
247 			  start, end, EXTENT_UPTODATE);
248 	clear_extent_bits(&fs_info->freed_extents[1],
249 			  start, end, EXTENT_UPTODATE);
250 }
251 
252 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
253 {
254 	struct btrfs_fs_info *fs_info = cache->fs_info;
255 	u64 bytenr;
256 	u64 *logical;
257 	int stripe_len;
258 	int i, nr, ret;
259 
260 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
261 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
262 		cache->bytes_super += stripe_len;
263 		ret = add_excluded_extent(fs_info, cache->key.objectid,
264 					  stripe_len);
265 		if (ret)
266 			return ret;
267 	}
268 
269 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
270 		bytenr = btrfs_sb_offset(i);
271 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
272 				       bytenr, &logical, &nr, &stripe_len);
273 		if (ret)
274 			return ret;
275 
276 		while (nr--) {
277 			u64 start, len;
278 
279 			if (logical[nr] > cache->key.objectid +
280 			    cache->key.offset)
281 				continue;
282 
283 			if (logical[nr] + stripe_len <= cache->key.objectid)
284 				continue;
285 
286 			start = logical[nr];
287 			if (start < cache->key.objectid) {
288 				start = cache->key.objectid;
289 				len = (logical[nr] + stripe_len) - start;
290 			} else {
291 				len = min_t(u64, stripe_len,
292 					    cache->key.objectid +
293 					    cache->key.offset - start);
294 			}
295 
296 			cache->bytes_super += len;
297 			ret = add_excluded_extent(fs_info, start, len);
298 			if (ret) {
299 				kfree(logical);
300 				return ret;
301 			}
302 		}
303 
304 		kfree(logical);
305 	}
306 	return 0;
307 }
308 
309 static struct btrfs_caching_control *
310 get_caching_control(struct btrfs_block_group_cache *cache)
311 {
312 	struct btrfs_caching_control *ctl;
313 
314 	spin_lock(&cache->lock);
315 	if (!cache->caching_ctl) {
316 		spin_unlock(&cache->lock);
317 		return NULL;
318 	}
319 
320 	ctl = cache->caching_ctl;
321 	refcount_inc(&ctl->count);
322 	spin_unlock(&cache->lock);
323 	return ctl;
324 }
325 
326 static void put_caching_control(struct btrfs_caching_control *ctl)
327 {
328 	if (refcount_dec_and_test(&ctl->count))
329 		kfree(ctl);
330 }
331 
332 #ifdef CONFIG_BTRFS_DEBUG
333 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
334 {
335 	struct btrfs_fs_info *fs_info = block_group->fs_info;
336 	u64 start = block_group->key.objectid;
337 	u64 len = block_group->key.offset;
338 	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
339 		fs_info->nodesize : fs_info->sectorsize;
340 	u64 step = chunk << 1;
341 
342 	while (len > chunk) {
343 		btrfs_remove_free_space(block_group, start, chunk);
344 		start += step;
345 		if (len < step)
346 			len = 0;
347 		else
348 			len -= step;
349 	}
350 }
351 #endif
352 
353 /*
354  * This is only called by cache_block_group.  Since we could have freed
355  * extents, we need to check the pinned_extents for any extents that can't
356  * be used yet, as their free space is only released when the transaction commits.
357  */
358 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
359 		       u64 start, u64 end)
360 {
361 	struct btrfs_fs_info *info = block_group->fs_info;
362 	u64 extent_start, extent_end, size, total_added = 0;
363 	int ret;
364 
365 	while (start < end) {
366 		ret = find_first_extent_bit(info->pinned_extents, start,
367 					    &extent_start, &extent_end,
368 					    EXTENT_DIRTY | EXTENT_UPTODATE,
369 					    NULL);
370 		if (ret)
371 			break;
372 
373 		if (extent_start <= start) {
374 			start = extent_end + 1;
375 		} else if (extent_start > start && extent_start < end) {
376 			size = extent_start - start;
377 			total_added += size;
378 			ret = btrfs_add_free_space(block_group, start,
379 						   size);
380 			BUG_ON(ret); /* -ENOMEM or logic error */
381 			start = extent_end + 1;
382 		} else {
383 			break;
384 		}
385 	}
386 
387 	if (start < end) {
388 		size = end - start;
389 		total_added += size;
390 		ret = btrfs_add_free_space(block_group, start, size);
391 		BUG_ON(ret); /* -ENOMEM or logic error */
392 	}
393 
394 	return total_added;
395 }
396 
397 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
398 {
399 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
400 	struct btrfs_fs_info *fs_info = block_group->fs_info;
401 	struct btrfs_root *extent_root = fs_info->extent_root;
402 	struct btrfs_path *path;
403 	struct extent_buffer *leaf;
404 	struct btrfs_key key;
405 	u64 total_found = 0;
406 	u64 last = 0;
407 	u32 nritems;
408 	int ret;
409 	bool wakeup = true;
410 
411 	path = btrfs_alloc_path();
412 	if (!path)
413 		return -ENOMEM;
414 
415 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
416 
417 #ifdef CONFIG_BTRFS_DEBUG
418 	/*
419 	 * If we're fragmenting we don't want to make anybody think we can
420 	 * allocate from this block group until we've had a chance to fragment
421 	 * the free space.
422 	 */
423 	if (btrfs_should_fragment_free_space(block_group))
424 		wakeup = false;
425 #endif
426 	/*
427 	 * We don't want to deadlock with somebody trying to allocate a new
428 	 * extent for the extent root while also trying to search the extent
429 	 * root to add free space.  So we skip locking and search the commit
430 	 * root, since it's read-only.
431 	 */
432 	path->skip_locking = 1;
433 	path->search_commit_root = 1;
434 	path->reada = READA_FORWARD;
435 
436 	key.objectid = last;
437 	key.offset = 0;
438 	key.type = BTRFS_EXTENT_ITEM_KEY;
439 
440 next:
441 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
442 	if (ret < 0)
443 		goto out;
444 
445 	leaf = path->nodes[0];
446 	nritems = btrfs_header_nritems(leaf);
447 
448 	while (1) {
449 		if (btrfs_fs_closing(fs_info) > 1) {
450 			last = (u64)-1;
451 			break;
452 		}
453 
454 		if (path->slots[0] < nritems) {
455 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
456 		} else {
457 			ret = find_next_key(path, 0, &key);
458 			if (ret)
459 				break;
460 
461 			if (need_resched() ||
462 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
463 				if (wakeup)
464 					caching_ctl->progress = last;
465 				btrfs_release_path(path);
466 				up_read(&fs_info->commit_root_sem);
467 				mutex_unlock(&caching_ctl->mutex);
468 				cond_resched();
469 				mutex_lock(&caching_ctl->mutex);
470 				down_read(&fs_info->commit_root_sem);
471 				goto next;
472 			}
473 
474 			ret = btrfs_next_leaf(extent_root, path);
475 			if (ret < 0)
476 				goto out;
477 			if (ret)
478 				break;
479 			leaf = path->nodes[0];
480 			nritems = btrfs_header_nritems(leaf);
481 			continue;
482 		}
483 
484 		if (key.objectid < last) {
485 			key.objectid = last;
486 			key.offset = 0;
487 			key.type = BTRFS_EXTENT_ITEM_KEY;
488 
489 			if (wakeup)
490 				caching_ctl->progress = last;
491 			btrfs_release_path(path);
492 			goto next;
493 		}
494 
495 		if (key.objectid < block_group->key.objectid) {
496 			path->slots[0]++;
497 			continue;
498 		}
499 
500 		if (key.objectid >= block_group->key.objectid +
501 		    block_group->key.offset)
502 			break;
503 
504 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
505 		    key.type == BTRFS_METADATA_ITEM_KEY) {
506 			total_found += add_new_free_space(block_group, last,
507 							  key.objectid);
508 			if (key.type == BTRFS_METADATA_ITEM_KEY)
509 				last = key.objectid +
510 					fs_info->nodesize;
511 			else
512 				last = key.objectid + key.offset;
513 
514 			if (total_found > CACHING_CTL_WAKE_UP) {
515 				total_found = 0;
516 				if (wakeup)
517 					wake_up(&caching_ctl->wait);
518 			}
519 		}
520 		path->slots[0]++;
521 	}
522 	ret = 0;
523 
524 	total_found += add_new_free_space(block_group, last,
525 					  block_group->key.objectid +
526 					  block_group->key.offset);
527 	caching_ctl->progress = (u64)-1;
528 
529 out:
530 	btrfs_free_path(path);
531 	return ret;
532 }
533 
534 static noinline void caching_thread(struct btrfs_work *work)
535 {
536 	struct btrfs_block_group_cache *block_group;
537 	struct btrfs_fs_info *fs_info;
538 	struct btrfs_caching_control *caching_ctl;
539 	int ret;
540 
541 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
542 	block_group = caching_ctl->block_group;
543 	fs_info = block_group->fs_info;
544 
545 	mutex_lock(&caching_ctl->mutex);
546 	down_read(&fs_info->commit_root_sem);
547 
548 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
549 		ret = load_free_space_tree(caching_ctl);
550 	else
551 		ret = load_extent_tree_free(caching_ctl);
552 
553 	spin_lock(&block_group->lock);
554 	block_group->caching_ctl = NULL;
555 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
556 	spin_unlock(&block_group->lock);
557 
558 #ifdef CONFIG_BTRFS_DEBUG
559 	if (btrfs_should_fragment_free_space(block_group)) {
560 		u64 bytes_used;
561 
562 		spin_lock(&block_group->space_info->lock);
563 		spin_lock(&block_group->lock);
564 		bytes_used = block_group->key.offset -
565 			btrfs_block_group_used(&block_group->item);
566 		block_group->space_info->bytes_used += bytes_used >> 1;
567 		spin_unlock(&block_group->lock);
568 		spin_unlock(&block_group->space_info->lock);
569 		fragment_free_space(block_group);
570 	}
571 #endif
572 
573 	caching_ctl->progress = (u64)-1;
574 
575 	up_read(&fs_info->commit_root_sem);
576 	free_excluded_extents(block_group);
577 	mutex_unlock(&caching_ctl->mutex);
578 
579 	wake_up(&caching_ctl->wait);
580 
581 	put_caching_control(caching_ctl);
582 	btrfs_put_block_group(block_group);
583 }
584 
585 static int cache_block_group(struct btrfs_block_group_cache *cache,
586 			     int load_cache_only)
587 {
588 	DEFINE_WAIT(wait);
589 	struct btrfs_fs_info *fs_info = cache->fs_info;
590 	struct btrfs_caching_control *caching_ctl;
591 	int ret = 0;
592 
593 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
594 	if (!caching_ctl)
595 		return -ENOMEM;
596 
597 	INIT_LIST_HEAD(&caching_ctl->list);
598 	mutex_init(&caching_ctl->mutex);
599 	init_waitqueue_head(&caching_ctl->wait);
600 	caching_ctl->block_group = cache;
601 	caching_ctl->progress = cache->key.objectid;
602 	refcount_set(&caching_ctl->count, 1);
603 	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
604 			caching_thread, NULL, NULL);
605 
606 	spin_lock(&cache->lock);
607 	/*
608 	 * This should be a rare occasion, but it can happen in the case where
609 	 * one thread starts to load the space cache info, and then some other
610 	 * thread starts a transaction commit which tries to do an allocation
611 	 * while the first thread is still loading the space cache info.  The
612 	 * previous loop should have kept us from choosing this block group,
613 	 * but if we've moved to the state where we will wait on caching block
614 	 * groups we need to first check if we're doing a fast load here, so we
615 	 * can wait for it to finish; otherwise we could end up allocating from
616 	 * a block group whose cache gets evicted for one reason or
617 	 * another.
618 	 */
619 	while (cache->cached == BTRFS_CACHE_FAST) {
620 		struct btrfs_caching_control *ctl;
621 
622 		ctl = cache->caching_ctl;
623 		refcount_inc(&ctl->count);
624 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
625 		spin_unlock(&cache->lock);
626 
627 		schedule();
628 
629 		finish_wait(&ctl->wait, &wait);
630 		put_caching_control(ctl);
631 		spin_lock(&cache->lock);
632 	}
633 
634 	if (cache->cached != BTRFS_CACHE_NO) {
635 		spin_unlock(&cache->lock);
636 		kfree(caching_ctl);
637 		return 0;
638 	}
639 	WARN_ON(cache->caching_ctl);
640 	cache->caching_ctl = caching_ctl;
641 	cache->cached = BTRFS_CACHE_FAST;
642 	spin_unlock(&cache->lock);
643 
644 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
645 		mutex_lock(&caching_ctl->mutex);
646 		ret = load_free_space_cache(fs_info, cache);
647 
648 		spin_lock(&cache->lock);
649 		if (ret == 1) {
650 			cache->caching_ctl = NULL;
651 			cache->cached = BTRFS_CACHE_FINISHED;
652 			cache->last_byte_to_unpin = (u64)-1;
653 			caching_ctl->progress = (u64)-1;
654 		} else {
655 			if (load_cache_only) {
656 				cache->caching_ctl = NULL;
657 				cache->cached = BTRFS_CACHE_NO;
658 			} else {
659 				cache->cached = BTRFS_CACHE_STARTED;
660 				cache->has_caching_ctl = 1;
661 			}
662 		}
663 		spin_unlock(&cache->lock);
664 #ifdef CONFIG_BTRFS_DEBUG
665 		if (ret == 1 &&
666 		    btrfs_should_fragment_free_space(cache)) {
667 			u64 bytes_used;
668 
669 			spin_lock(&cache->space_info->lock);
670 			spin_lock(&cache->lock);
671 			bytes_used = cache->key.offset -
672 				btrfs_block_group_used(&cache->item);
673 			cache->space_info->bytes_used += bytes_used >> 1;
674 			spin_unlock(&cache->lock);
675 			spin_unlock(&cache->space_info->lock);
676 			fragment_free_space(cache);
677 		}
678 #endif
679 		mutex_unlock(&caching_ctl->mutex);
680 
681 		wake_up(&caching_ctl->wait);
682 		if (ret == 1) {
683 			put_caching_control(caching_ctl);
684 			free_excluded_extents(cache);
685 			return 0;
686 		}
687 	} else {
688 		/*
689 		 * We're either using the free space tree or no caching at all.
690 		 * Set cached to the appropriate value and wake up any waiters.
691 		 */
692 		spin_lock(&cache->lock);
693 		if (load_cache_only) {
694 			cache->caching_ctl = NULL;
695 			cache->cached = BTRFS_CACHE_NO;
696 		} else {
697 			cache->cached = BTRFS_CACHE_STARTED;
698 			cache->has_caching_ctl = 1;
699 		}
700 		spin_unlock(&cache->lock);
701 		wake_up(&caching_ctl->wait);
702 	}
703 
704 	if (load_cache_only) {
705 		put_caching_control(caching_ctl);
706 		return 0;
707 	}
708 
709 	down_write(&fs_info->commit_root_sem);
710 	refcount_inc(&caching_ctl->count);
711 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
712 	up_write(&fs_info->commit_root_sem);
713 
714 	btrfs_get_block_group(cache);
715 
716 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
717 
718 	return ret;
719 }
720 
721 /*
722  * return the block group that starts at or after bytenr
723  */
724 static struct btrfs_block_group_cache *
725 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
726 {
727 	return block_group_cache_tree_search(info, bytenr, 0);
728 }
729 
730 /*
731  * return the block group that contains the given bytenr
732  */
733 struct btrfs_block_group_cache *btrfs_lookup_block_group(
734 						 struct btrfs_fs_info *info,
735 						 u64 bytenr)
736 {
737 	return block_group_cache_tree_search(info, bytenr, 1);
738 }
739 
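/*
 * Usage sketch (hypothetical helper): look up the block group containing
 * @bytenr and read how many of its bytes are covered by super block
 * stripes.  The tree search above takes a reference on the returned block
 * group, so it must be dropped with btrfs_put_block_group().
 */
static u64 __maybe_unused example_bytes_super_at(struct btrfs_fs_info *fs_info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_super;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!cache)
		return 0;
	bytes_super = cache->bytes_super;
	btrfs_put_block_group(cache);
	return bytes_super;
}
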
740 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
741 						  u64 flags)
742 {
743 	struct list_head *head = &info->space_info;
744 	struct btrfs_space_info *found;
745 
746 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
747 
748 	rcu_read_lock();
749 	list_for_each_entry_rcu(found, head, list) {
750 		if (found->flags & flags) {
751 			rcu_read_unlock();
752 			return found;
753 		}
754 	}
755 	rcu_read_unlock();
756 	return NULL;
757 }
758 
759 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
760 			     bool metadata, u64 root_objectid)
761 {
762 	struct btrfs_space_info *space_info;
763 	u64 flags;
764 
765 	if (metadata) {
766 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
767 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
768 		else
769 			flags = BTRFS_BLOCK_GROUP_METADATA;
770 	} else {
771 		flags = BTRFS_BLOCK_GROUP_DATA;
772 	}
773 
774 	space_info = __find_space_info(fs_info, flags);
775 	ASSERT(space_info);
776 	percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
777 		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
778 }
779 
780 /*
781  * after adding space to the filesystem, we need to clear the full flags
782  * on all the space infos.
783  */
784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
785 {
786 	struct list_head *head = &info->space_info;
787 	struct btrfs_space_info *found;
788 
789 	rcu_read_lock();
790 	list_for_each_entry_rcu(found, head, list)
791 		found->full = 0;
792 	rcu_read_unlock();
793 }
794 
795 /* simple helper to search for an existing data extent at a given offset */
796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
797 {
798 	int ret;
799 	struct btrfs_key key;
800 	struct btrfs_path *path;
801 
802 	path = btrfs_alloc_path();
803 	if (!path)
804 		return -ENOMEM;
805 
806 	key.objectid = start;
807 	key.offset = len;
808 	key.type = BTRFS_EXTENT_ITEM_KEY;
809 	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
810 	btrfs_free_path(path);
811 	return ret;
812 }
813 
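/*
 * Usage sketch (hypothetical helper): btrfs_lookup_data_extent() returns
 * the btrfs_search_slot() result unchanged, so 0 means an EXTENT_ITEM with
 * exactly this (start, len) key exists, > 0 means it does not, and < 0 is
 * an error.
 */
static bool __maybe_unused example_data_extent_exists(struct btrfs_fs_info *fs_info,
						      u64 start, u64 len)
{
	return btrfs_lookup_data_extent(fs_info, start, len) == 0;
}
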
814 /*
815  * Helper function to look up the reference count and flags of a tree block.
816  *
817  * The head node for a delayed ref is used to store the sum of all the
818  * reference count modifications queued up in the rbtree.  The head
819  * node may also store the extent flags to set.  This way you can check
820  * what the reference count and extent flags would be once all of
821  * the delayed refs have been processed.
822  */
823 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
824 			     struct btrfs_fs_info *fs_info, u64 bytenr,
825 			     u64 offset, int metadata, u64 *refs, u64 *flags)
826 {
827 	struct btrfs_delayed_ref_head *head;
828 	struct btrfs_delayed_ref_root *delayed_refs;
829 	struct btrfs_path *path;
830 	struct btrfs_extent_item *ei;
831 	struct extent_buffer *leaf;
832 	struct btrfs_key key;
833 	u32 item_size;
834 	u64 num_refs;
835 	u64 extent_flags;
836 	int ret;
837 
838 	/*
839 	 * If we don't have skinny metadata, don't bother doing anything
840 	 * different
841 	 */
842 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
843 		offset = fs_info->nodesize;
844 		metadata = 0;
845 	}
846 
847 	path = btrfs_alloc_path();
848 	if (!path)
849 		return -ENOMEM;
850 
851 	if (!trans) {
852 		path->skip_locking = 1;
853 		path->search_commit_root = 1;
854 	}
855 
856 search_again:
857 	key.objectid = bytenr;
858 	key.offset = offset;
859 	if (metadata)
860 		key.type = BTRFS_METADATA_ITEM_KEY;
861 	else
862 		key.type = BTRFS_EXTENT_ITEM_KEY;
863 
864 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
865 	if (ret < 0)
866 		goto out_free;
867 
868 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
869 		if (path->slots[0]) {
870 			path->slots[0]--;
871 			btrfs_item_key_to_cpu(path->nodes[0], &key,
872 					      path->slots[0]);
873 			if (key.objectid == bytenr &&
874 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
875 			    key.offset == fs_info->nodesize)
876 				ret = 0;
877 		}
878 	}
879 
880 	if (ret == 0) {
881 		leaf = path->nodes[0];
882 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
883 		if (item_size >= sizeof(*ei)) {
884 			ei = btrfs_item_ptr(leaf, path->slots[0],
885 					    struct btrfs_extent_item);
886 			num_refs = btrfs_extent_refs(leaf, ei);
887 			extent_flags = btrfs_extent_flags(leaf, ei);
888 		} else {
889 			ret = -EINVAL;
890 			btrfs_print_v0_err(fs_info);
891 			if (trans)
892 				btrfs_abort_transaction(trans, ret);
893 			else
894 				btrfs_handle_fs_error(fs_info, ret, NULL);
895 
896 			goto out_free;
897 		}
898 
899 		BUG_ON(num_refs == 0);
900 	} else {
901 		num_refs = 0;
902 		extent_flags = 0;
903 		ret = 0;
904 	}
905 
906 	if (!trans)
907 		goto out;
908 
909 	delayed_refs = &trans->transaction->delayed_refs;
910 	spin_lock(&delayed_refs->lock);
911 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
912 	if (head) {
913 		if (!mutex_trylock(&head->mutex)) {
914 			refcount_inc(&head->refs);
915 			spin_unlock(&delayed_refs->lock);
916 
917 			btrfs_release_path(path);
918 
919 			/*
920 			 * Mutex was contended, block until it's released and try
921 			 * again
922 			 */
923 			mutex_lock(&head->mutex);
924 			mutex_unlock(&head->mutex);
925 			btrfs_put_delayed_ref_head(head);
926 			goto search_again;
927 		}
928 		spin_lock(&head->lock);
929 		if (head->extent_op && head->extent_op->update_flags)
930 			extent_flags |= head->extent_op->flags_to_set;
931 		else
932 			BUG_ON(num_refs == 0);
933 
934 		num_refs += head->ref_mod;
935 		spin_unlock(&head->lock);
936 		mutex_unlock(&head->mutex);
937 	}
938 	spin_unlock(&delayed_refs->lock);
939 out:
940 	WARN_ON(num_refs == 0);
941 	if (refs)
942 		*refs = num_refs;
943 	if (flags)
944 		*flags = extent_flags;
945 out_free:
946 	btrfs_free_path(path);
947 	return ret;
948 }
949 
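/*
 * Usage sketch (hypothetical helper): query the effective reference count
 * of a tree block, i.e. the on-disk count plus any pending delayed ref
 * modifications.  With skinny metadata the offset argument is assumed to be
 * the block's level; without it the function falls back to the fat
 * EXTENT_ITEM lookup internally.
 */
static int __maybe_unused example_tree_block_is_shared(struct btrfs_trans_handle *trans,
						       struct btrfs_fs_info *fs_info,
						       u64 bytenr, int level)
{
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
				       &refs, &flags);
	if (ret)
		return ret;
	return refs > 1;
}
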
950 /*
951  * Back reference rules.  Back refs have three main goals:
952  *
953  * 1) differentiate between all holders of references to an extent so that
954  *    when a reference is dropped we can make sure it was a valid reference
955  *    before freeing the extent.
956  *
957  * 2) Provide enough information to quickly find the holders of an extent
958  *    if we notice a given block is corrupted or bad.
959  *
960  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
961  *    maintenance.  This is actually the same as #2, but with a slightly
962  *    different use case.
963  *
964  * There are two kinds of back refs. Implicit back refs are optimized
965  * for pointers in non-shared tree blocks. For a given pointer in a block,
966  * back refs of this kind provide information about the block's owner tree
967  * and the pointer's key. This information allows us to find the block by
968  * b-tree searching. Full back refs are for pointers in tree blocks not
969  * referenced by their owner trees; the location of the tree block is
970  * recorded in the back ref. Full back refs are actually generic and can
971  * be used in all cases where implicit back refs are used. Their major
972  * shortcoming is the overhead: every time a tree block gets COWed, we
973  * have to update the back ref entries for all pointers in it.
974  *
975  * For a newly allocated tree block, we use implicit back refs for
976  * pointers in it. This means most tree related operations only involve
977  * implicit back refs. For a tree block created in an old transaction, the
978  * only way to drop a reference to it is to COW it. So we can detect the
979  * event that a tree block loses its owner tree's reference and do the
980  * back ref conversion.
981  *
982  * When a tree block is COWed through a tree, there are four cases:
983  *
984  * The reference count of the block is one and the tree is the block's
985  * owner tree. Nothing to do in this case.
986  *
987  * The reference count of the block is one and the tree is not the
988  * block's owner tree. In this case, full back refs is used for pointers
989  * in the block. Remove these full back refs, add implicit back refs for
990  * every pointers in the new block.
991  *
992  * The reference count of the block is greater than one and the tree is
993  * the block's owner tree. In this case, implicit back refs is used for
994  * pointers in the block. Add full back refs for every pointers in the
995  * block, increase lower level extents' reference counts. The original
996  * implicit back refs are entailed to the new block.
997  *
998  * The reference count of the block is greater than one and the tree is
999  * not the block's owner tree. Add implicit back refs for every pointer in
1000  * the new block, increase lower level extents' reference count.
1001  *
1002  * Back Reference Key composing:
1003  *
1004  * The key objectid corresponds to the first byte in the extent.
1005  * The key type is used to differentiate between types of back refs.
1006  * There are different meanings of the key offset for different types
1007  * of back refs.
1008  *
1009  * File extents can be referenced by:
1010  *
1011  * - multiple snapshots, subvolumes, or different generations in one subvol
1012  * - different files inside a single subvolume
1013  * - different offsets inside a file (bookend extents in file.c)
1014  *
1015  * The extent ref structure for the implicit back refs has fields for:
1016  *
1017  * - Objectid of the subvolume root
1018  * - objectid of the file holding the reference
1019  * - original offset in the file
1020  * - how many bookend extents
1021  *
1022  * The key offset for the implicit back refs is hash of the first
1023  * three fields.
1024  *
1025  * The extent ref structure for the full back refs has a field for:
1026  *
1027  * - number of pointers in the tree leaf
1028  *
1029  * The key offset for the full back refs is the first byte of
1030  * the tree leaf.
1031  *
1032  * When a file extent is allocated, the implicit back refs are used
1033  * and the fields are filled in:
1034  *
1035  *     (root_key.objectid, inode objectid, offset in file, 1)
1036  *
1037  * When a file extent is removed file truncation, we find the
1038  * corresponding implicit back refs and check the following fields:
1039  *
1040  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1041  *
1042  * Btree extents can be referenced by:
1043  *
1044  * - Different subvolumes
1045  *
1046  * Both the implicit back refs and the full back refs for tree blocks
1047  * only consist of a key. The key offset for the implicit back refs is
1048  * the objectid of the block's owner tree. The key offset for the full
1049  * back refs is the first byte of the parent block.
1050  *
1051  * When implicit back refs are used, information about the lowest key and
1052  * level of the tree block is required. This information is stored in
1053  * the tree block info structure.
1054  */
1055 
1056 /*
1057  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1058  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1059  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1060  */
1061 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1062 				     struct btrfs_extent_inline_ref *iref,
1063 				     enum btrfs_inline_ref_type is_data)
1064 {
1065 	int type = btrfs_extent_inline_ref_type(eb, iref);
1066 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1067 
1068 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1069 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
1070 	    type == BTRFS_SHARED_DATA_REF_KEY ||
1071 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
1072 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
1073 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
1074 				return type;
1075 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1076 				ASSERT(eb->fs_info);
1077 				/*
1078 				 * Every shared ref has a parent tree
1079 				 * block, which must be aligned to
1080 				 * the nodesize.
1081 				 */
1082 				if (offset &&
1083 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1084 					return type;
1085 			}
1086 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
1087 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
1088 				return type;
1089 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
1090 				ASSERT(eb->fs_info);
1091 				/*
1092 				 * Every shared ref has a parent tree
1093 				 * block, which must be aligned to
1094 				 * the nodesize.
1095 				 */
1096 				if (offset &&
1097 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1098 					return type;
1099 			}
1100 		} else {
1101 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1102 			return type;
1103 		}
1104 	}
1105 
1106 	btrfs_print_leaf((struct extent_buffer *)eb);
1107 	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1108 		  eb->start, type);
1109 	WARN_ON(1);
1110 
1111 	return BTRFS_REF_TYPE_INVALID;
1112 }
1113 
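/*
 * Usage sketch (hypothetical helper): restrict an inline ref to the data
 * types.  BTRFS_REF_TYPE_DATA only accepts EXTENT_DATA_REF and
 * SHARED_DATA_REF; anything else comes back as BTRFS_REF_TYPE_INVALID and
 * is treated as corruption.
 */
static int __maybe_unused example_classify_data_ref(struct extent_buffer *leaf,
						    struct btrfs_extent_inline_ref *iref)
{
	int type;

	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type == BTRFS_REF_TYPE_INVALID)
		return -EUCLEAN;
	return type;
}
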
1114 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1115 {
1116 	u32 high_crc = ~(u32)0;
1117 	u32 low_crc = ~(u32)0;
1118 	__le64 lenum;
1119 
1120 	lenum = cpu_to_le64(root_objectid);
1121 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1122 	lenum = cpu_to_le64(owner);
1123 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1124 	lenum = cpu_to_le64(offset);
1125 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1126 
1127 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1128 }
1129 
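/*
 * Illustrative sketch (hypothetical helper): compose the key of an implicit
 * data back ref item as described in the "Back Reference Key composing"
 * section above.  The objectid is the first byte of the extent and the
 * offset is the hash of (root, inode objectid, file offset); a shared data
 * ref would instead use BTRFS_SHARED_DATA_REF_KEY with the parent block's
 * bytenr as the offset.
 */
static void __maybe_unused example_extent_data_ref_key(struct btrfs_key *key,
							u64 extent_bytenr,
							u64 root_objectid,
							u64 owner, u64 offset)
{
	key->objectid = extent_bytenr;
	key->type = BTRFS_EXTENT_DATA_REF_KEY;
	key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}
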
1130 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1131 				     struct btrfs_extent_data_ref *ref)
1132 {
1133 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1134 				    btrfs_extent_data_ref_objectid(leaf, ref),
1135 				    btrfs_extent_data_ref_offset(leaf, ref));
1136 }
1137 
1138 static int match_extent_data_ref(struct extent_buffer *leaf,
1139 				 struct btrfs_extent_data_ref *ref,
1140 				 u64 root_objectid, u64 owner, u64 offset)
1141 {
1142 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1143 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1144 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1145 		return 0;
1146 	return 1;
1147 }
1148 
1149 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1150 					   struct btrfs_path *path,
1151 					   u64 bytenr, u64 parent,
1152 					   u64 root_objectid,
1153 					   u64 owner, u64 offset)
1154 {
1155 	struct btrfs_root *root = trans->fs_info->extent_root;
1156 	struct btrfs_key key;
1157 	struct btrfs_extent_data_ref *ref;
1158 	struct extent_buffer *leaf;
1159 	u32 nritems;
1160 	int ret;
1161 	int recow;
1162 	int err = -ENOENT;
1163 
1164 	key.objectid = bytenr;
1165 	if (parent) {
1166 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1167 		key.offset = parent;
1168 	} else {
1169 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1170 		key.offset = hash_extent_data_ref(root_objectid,
1171 						  owner, offset);
1172 	}
1173 again:
1174 	recow = 0;
1175 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1176 	if (ret < 0) {
1177 		err = ret;
1178 		goto fail;
1179 	}
1180 
1181 	if (parent) {
1182 		if (!ret)
1183 			return 0;
1184 		goto fail;
1185 	}
1186 
1187 	leaf = path->nodes[0];
1188 	nritems = btrfs_header_nritems(leaf);
1189 	while (1) {
1190 		if (path->slots[0] >= nritems) {
1191 			ret = btrfs_next_leaf(root, path);
1192 			if (ret < 0)
1193 				err = ret;
1194 			if (ret)
1195 				goto fail;
1196 
1197 			leaf = path->nodes[0];
1198 			nritems = btrfs_header_nritems(leaf);
1199 			recow = 1;
1200 		}
1201 
1202 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1203 		if (key.objectid != bytenr ||
1204 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1205 			goto fail;
1206 
1207 		ref = btrfs_item_ptr(leaf, path->slots[0],
1208 				     struct btrfs_extent_data_ref);
1209 
1210 		if (match_extent_data_ref(leaf, ref, root_objectid,
1211 					  owner, offset)) {
1212 			if (recow) {
1213 				btrfs_release_path(path);
1214 				goto again;
1215 			}
1216 			err = 0;
1217 			break;
1218 		}
1219 		path->slots[0]++;
1220 	}
1221 fail:
1222 	return err;
1223 }
1224 
1225 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1226 					   struct btrfs_path *path,
1227 					   u64 bytenr, u64 parent,
1228 					   u64 root_objectid, u64 owner,
1229 					   u64 offset, int refs_to_add)
1230 {
1231 	struct btrfs_root *root = trans->fs_info->extent_root;
1232 	struct btrfs_key key;
1233 	struct extent_buffer *leaf;
1234 	u32 size;
1235 	u32 num_refs;
1236 	int ret;
1237 
1238 	key.objectid = bytenr;
1239 	if (parent) {
1240 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1241 		key.offset = parent;
1242 		size = sizeof(struct btrfs_shared_data_ref);
1243 	} else {
1244 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1245 		key.offset = hash_extent_data_ref(root_objectid,
1246 						  owner, offset);
1247 		size = sizeof(struct btrfs_extent_data_ref);
1248 	}
1249 
1250 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1251 	if (ret && ret != -EEXIST)
1252 		goto fail;
1253 
1254 	leaf = path->nodes[0];
1255 	if (parent) {
1256 		struct btrfs_shared_data_ref *ref;
1257 		ref = btrfs_item_ptr(leaf, path->slots[0],
1258 				     struct btrfs_shared_data_ref);
1259 		if (ret == 0) {
1260 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1261 		} else {
1262 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1263 			num_refs += refs_to_add;
1264 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1265 		}
1266 	} else {
1267 		struct btrfs_extent_data_ref *ref;
1268 		while (ret == -EEXIST) {
1269 			ref = btrfs_item_ptr(leaf, path->slots[0],
1270 					     struct btrfs_extent_data_ref);
1271 			if (match_extent_data_ref(leaf, ref, root_objectid,
1272 						  owner, offset))
1273 				break;
1274 			btrfs_release_path(path);
1275 			key.offset++;
1276 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1277 						      size);
1278 			if (ret && ret != -EEXIST)
1279 				goto fail;
1280 
1281 			leaf = path->nodes[0];
1282 		}
1283 		ref = btrfs_item_ptr(leaf, path->slots[0],
1284 				     struct btrfs_extent_data_ref);
1285 		if (ret == 0) {
1286 			btrfs_set_extent_data_ref_root(leaf, ref,
1287 						       root_objectid);
1288 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1289 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1290 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1291 		} else {
1292 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1293 			num_refs += refs_to_add;
1294 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1295 		}
1296 	}
1297 	btrfs_mark_buffer_dirty(leaf);
1298 	ret = 0;
1299 fail:
1300 	btrfs_release_path(path);
1301 	return ret;
1302 }
1303 
1304 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1305 					   struct btrfs_path *path,
1306 					   int refs_to_drop, int *last_ref)
1307 {
1308 	struct btrfs_key key;
1309 	struct btrfs_extent_data_ref *ref1 = NULL;
1310 	struct btrfs_shared_data_ref *ref2 = NULL;
1311 	struct extent_buffer *leaf;
1312 	u32 num_refs = 0;
1313 	int ret = 0;
1314 
1315 	leaf = path->nodes[0];
1316 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1317 
1318 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1319 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1320 				      struct btrfs_extent_data_ref);
1321 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1322 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1323 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1324 				      struct btrfs_shared_data_ref);
1325 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1326 	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1327 		btrfs_print_v0_err(trans->fs_info);
1328 		btrfs_abort_transaction(trans, -EINVAL);
1329 		return -EINVAL;
1330 	} else {
1331 		BUG();
1332 	}
1333 
1334 	BUG_ON(num_refs < refs_to_drop);
1335 	num_refs -= refs_to_drop;
1336 
1337 	if (num_refs == 0) {
1338 		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1339 		*last_ref = 1;
1340 	} else {
1341 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1342 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1343 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1344 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1345 		btrfs_mark_buffer_dirty(leaf);
1346 	}
1347 	return ret;
1348 }
1349 
1350 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1351 					  struct btrfs_extent_inline_ref *iref)
1352 {
1353 	struct btrfs_key key;
1354 	struct extent_buffer *leaf;
1355 	struct btrfs_extent_data_ref *ref1;
1356 	struct btrfs_shared_data_ref *ref2;
1357 	u32 num_refs = 0;
1358 	int type;
1359 
1360 	leaf = path->nodes[0];
1361 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1362 
1363 	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1364 	if (iref) {
1365 		/*
1366 		 * If type is invalid, we should have bailed out earlier than
1367 		 * this call.
1368 		 */
1369 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1370 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
1371 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1372 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1373 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1374 		} else {
1375 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1376 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1377 		}
1378 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1379 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1380 				      struct btrfs_extent_data_ref);
1381 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1382 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1383 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1384 				      struct btrfs_shared_data_ref);
1385 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1386 	} else {
1387 		WARN_ON(1);
1388 	}
1389 	return num_refs;
1390 }
1391 
1392 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1393 					  struct btrfs_path *path,
1394 					  u64 bytenr, u64 parent,
1395 					  u64 root_objectid)
1396 {
1397 	struct btrfs_root *root = trans->fs_info->extent_root;
1398 	struct btrfs_key key;
1399 	int ret;
1400 
1401 	key.objectid = bytenr;
1402 	if (parent) {
1403 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1404 		key.offset = parent;
1405 	} else {
1406 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1407 		key.offset = root_objectid;
1408 	}
1409 
1410 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1411 	if (ret > 0)
1412 		ret = -ENOENT;
1413 	return ret;
1414 }
1415 
1416 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1417 					  struct btrfs_path *path,
1418 					  u64 bytenr, u64 parent,
1419 					  u64 root_objectid)
1420 {
1421 	struct btrfs_key key;
1422 	int ret;
1423 
1424 	key.objectid = bytenr;
1425 	if (parent) {
1426 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1427 		key.offset = parent;
1428 	} else {
1429 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1430 		key.offset = root_objectid;
1431 	}
1432 
1433 	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1434 				      path, &key, 0);
1435 	btrfs_release_path(path);
1436 	return ret;
1437 }
1438 
1439 static inline int extent_ref_type(u64 parent, u64 owner)
1440 {
1441 	int type;
1442 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1443 		if (parent > 0)
1444 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1445 		else
1446 			type = BTRFS_TREE_BLOCK_REF_KEY;
1447 	} else {
1448 		if (parent > 0)
1449 			type = BTRFS_SHARED_DATA_REF_KEY;
1450 		else
1451 			type = BTRFS_EXTENT_DATA_REF_KEY;
1452 	}
1453 	return type;
1454 }
1455 
1456 static int find_next_key(struct btrfs_path *path, int level,
1457 			 struct btrfs_key *key)
1458 
1459 {
1460 	for (; level < BTRFS_MAX_LEVEL; level++) {
1461 		if (!path->nodes[level])
1462 			break;
1463 		if (path->slots[level] + 1 >=
1464 		    btrfs_header_nritems(path->nodes[level]))
1465 			continue;
1466 		if (level == 0)
1467 			btrfs_item_key_to_cpu(path->nodes[level], key,
1468 					      path->slots[level] + 1);
1469 		else
1470 			btrfs_node_key_to_cpu(path->nodes[level], key,
1471 					      path->slots[level] + 1);
1472 		return 0;
1473 	}
1474 	return 1;
1475 }
1476 
1477 /*
1478  * Look for an inline back ref. If the back ref is found, *ref_ret is set
1479  * to the address of the inline back ref, and 0 is returned.
1480  *
1481  * If the back ref isn't found, *ref_ret is set to the address where it
1482  * should be inserted, and -ENOENT is returned.
1483  *
1484  * If insert is true and there are too many inline back refs, the path
1485  * points to the extent item, and -EAGAIN is returned.
1486  *
1487  * NOTE: inline back refs are ordered in the same way that back ref
1488  *	 items in the tree are ordered.
1489  */
1490 static noinline_for_stack
1491 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1492 				 struct btrfs_path *path,
1493 				 struct btrfs_extent_inline_ref **ref_ret,
1494 				 u64 bytenr, u64 num_bytes,
1495 				 u64 parent, u64 root_objectid,
1496 				 u64 owner, u64 offset, int insert)
1497 {
1498 	struct btrfs_fs_info *fs_info = trans->fs_info;
1499 	struct btrfs_root *root = fs_info->extent_root;
1500 	struct btrfs_key key;
1501 	struct extent_buffer *leaf;
1502 	struct btrfs_extent_item *ei;
1503 	struct btrfs_extent_inline_ref *iref;
1504 	u64 flags;
1505 	u64 item_size;
1506 	unsigned long ptr;
1507 	unsigned long end;
1508 	int extra_size;
1509 	int type;
1510 	int want;
1511 	int ret;
1512 	int err = 0;
1513 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1514 	int needed;
1515 
1516 	key.objectid = bytenr;
1517 	key.type = BTRFS_EXTENT_ITEM_KEY;
1518 	key.offset = num_bytes;
1519 
1520 	want = extent_ref_type(parent, owner);
1521 	if (insert) {
1522 		extra_size = btrfs_extent_inline_ref_size(want);
1523 		path->keep_locks = 1;
1524 	} else
1525 		extra_size = -1;
1526 
1527 	/*
1528 	 * Owner is our level, so we can just add one to get the level for the
1529 	 * block we are interested in.
1530 	 */
1531 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1532 		key.type = BTRFS_METADATA_ITEM_KEY;
1533 		key.offset = owner;
1534 	}
1535 
1536 again:
1537 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1538 	if (ret < 0) {
1539 		err = ret;
1540 		goto out;
1541 	}
1542 
1543 	/*
1544 	 * We may be a newly converted file system which still has the old fat
1545 	 * extent entries for metadata, so try and see if we have one of those.
1546 	 */
1547 	if (ret > 0 && skinny_metadata) {
1548 		skinny_metadata = false;
1549 		if (path->slots[0]) {
1550 			path->slots[0]--;
1551 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1552 					      path->slots[0]);
1553 			if (key.objectid == bytenr &&
1554 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1555 			    key.offset == num_bytes)
1556 				ret = 0;
1557 		}
1558 		if (ret) {
1559 			key.objectid = bytenr;
1560 			key.type = BTRFS_EXTENT_ITEM_KEY;
1561 			key.offset = num_bytes;
1562 			btrfs_release_path(path);
1563 			goto again;
1564 		}
1565 	}
1566 
1567 	if (ret && !insert) {
1568 		err = -ENOENT;
1569 		goto out;
1570 	} else if (WARN_ON(ret)) {
1571 		err = -EIO;
1572 		goto out;
1573 	}
1574 
1575 	leaf = path->nodes[0];
1576 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1577 	if (unlikely(item_size < sizeof(*ei))) {
1578 		err = -EINVAL;
1579 		btrfs_print_v0_err(fs_info);
1580 		btrfs_abort_transaction(trans, err);
1581 		goto out;
1582 	}
1583 
1584 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1585 	flags = btrfs_extent_flags(leaf, ei);
1586 
1587 	ptr = (unsigned long)(ei + 1);
1588 	end = (unsigned long)ei + item_size;
1589 
1590 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1591 		ptr += sizeof(struct btrfs_tree_block_info);
1592 		BUG_ON(ptr > end);
1593 	}
1594 
1595 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1596 		needed = BTRFS_REF_TYPE_DATA;
1597 	else
1598 		needed = BTRFS_REF_TYPE_BLOCK;
1599 
1600 	err = -ENOENT;
1601 	while (1) {
1602 		if (ptr >= end) {
1603 			WARN_ON(ptr > end);
1604 			break;
1605 		}
1606 		iref = (struct btrfs_extent_inline_ref *)ptr;
1607 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1608 		if (type == BTRFS_REF_TYPE_INVALID) {
1609 			err = -EUCLEAN;
1610 			goto out;
1611 		}
1612 
1613 		if (want < type)
1614 			break;
1615 		if (want > type) {
1616 			ptr += btrfs_extent_inline_ref_size(type);
1617 			continue;
1618 		}
1619 
1620 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1621 			struct btrfs_extent_data_ref *dref;
1622 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1623 			if (match_extent_data_ref(leaf, dref, root_objectid,
1624 						  owner, offset)) {
1625 				err = 0;
1626 				break;
1627 			}
1628 			if (hash_extent_data_ref_item(leaf, dref) <
1629 			    hash_extent_data_ref(root_objectid, owner, offset))
1630 				break;
1631 		} else {
1632 			u64 ref_offset;
1633 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1634 			if (parent > 0) {
1635 				if (parent == ref_offset) {
1636 					err = 0;
1637 					break;
1638 				}
1639 				if (ref_offset < parent)
1640 					break;
1641 			} else {
1642 				if (root_objectid == ref_offset) {
1643 					err = 0;
1644 					break;
1645 				}
1646 				if (ref_offset < root_objectid)
1647 					break;
1648 			}
1649 		}
1650 		ptr += btrfs_extent_inline_ref_size(type);
1651 	}
1652 	if (err == -ENOENT && insert) {
1653 		if (item_size + extra_size >=
1654 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1655 			err = -EAGAIN;
1656 			goto out;
1657 		}
1658 		/*
1659 		 * To add a new inline back ref, we have to make sure
1660 		 * there is no corresponding back ref item.
1661 		 * For simplicity, we just do not add a new inline back
1662 		 * ref if there is any kind of item for this block.
1663 		 */
1664 		if (find_next_key(path, 0, &key) == 0 &&
1665 		    key.objectid == bytenr &&
1666 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1667 			err = -EAGAIN;
1668 			goto out;
1669 		}
1670 	}
1671 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1672 out:
1673 	if (insert) {
1674 		path->keep_locks = 0;
1675 		btrfs_unlock_up_safe(path, 1);
1676 	}
1677 	return err;
1678 }
1679 
1680 /*
1681  * helper to add new inline back ref
1682  */
1683 static noinline_for_stack
1684 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1685 				 struct btrfs_path *path,
1686 				 struct btrfs_extent_inline_ref *iref,
1687 				 u64 parent, u64 root_objectid,
1688 				 u64 owner, u64 offset, int refs_to_add,
1689 				 struct btrfs_delayed_extent_op *extent_op)
1690 {
1691 	struct extent_buffer *leaf;
1692 	struct btrfs_extent_item *ei;
1693 	unsigned long ptr;
1694 	unsigned long end;
1695 	unsigned long item_offset;
1696 	u64 refs;
1697 	int size;
1698 	int type;
1699 
1700 	leaf = path->nodes[0];
1701 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1702 	item_offset = (unsigned long)iref - (unsigned long)ei;
1703 
1704 	type = extent_ref_type(parent, owner);
1705 	size = btrfs_extent_inline_ref_size(type);
1706 
1707 	btrfs_extend_item(fs_info, path, size);
1708 
1709 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1710 	refs = btrfs_extent_refs(leaf, ei);
1711 	refs += refs_to_add;
1712 	btrfs_set_extent_refs(leaf, ei, refs);
1713 	if (extent_op)
1714 		__run_delayed_extent_op(extent_op, leaf, ei);
1715 
1716 	ptr = (unsigned long)ei + item_offset;
1717 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1718 	if (ptr < end - size)
1719 		memmove_extent_buffer(leaf, ptr + size, ptr,
1720 				      end - size - ptr);
1721 
1722 	iref = (struct btrfs_extent_inline_ref *)ptr;
1723 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1724 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1725 		struct btrfs_extent_data_ref *dref;
1726 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1727 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1728 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1729 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1730 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1731 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1732 		struct btrfs_shared_data_ref *sref;
1733 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1734 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1735 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1736 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1737 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1738 	} else {
1739 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1740 	}
1741 	btrfs_mark_buffer_dirty(leaf);
1742 }
1743 
1744 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1745 				 struct btrfs_path *path,
1746 				 struct btrfs_extent_inline_ref **ref_ret,
1747 				 u64 bytenr, u64 num_bytes, u64 parent,
1748 				 u64 root_objectid, u64 owner, u64 offset)
1749 {
1750 	int ret;
1751 
1752 	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1753 					   num_bytes, parent, root_objectid,
1754 					   owner, offset, 0);
1755 	if (ret != -ENOENT)
1756 		return ret;
1757 
1758 	btrfs_release_path(path);
1759 	*ref_ret = NULL;
1760 
1761 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1762 		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1763 					    root_objectid);
1764 	} else {
1765 		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1766 					     root_objectid, owner, offset);
1767 	}
1768 	return ret;
1769 }
1770 
1771 /*
1772  * helper to update/remove inline back ref
1773  */
1774 static noinline_for_stack
1775 void update_inline_extent_backref(struct btrfs_path *path,
1776 				  struct btrfs_extent_inline_ref *iref,
1777 				  int refs_to_mod,
1778 				  struct btrfs_delayed_extent_op *extent_op,
1779 				  int *last_ref)
1780 {
1781 	struct extent_buffer *leaf = path->nodes[0];
1782 	struct btrfs_fs_info *fs_info = leaf->fs_info;
1783 	struct btrfs_extent_item *ei;
1784 	struct btrfs_extent_data_ref *dref = NULL;
1785 	struct btrfs_shared_data_ref *sref = NULL;
1786 	unsigned long ptr;
1787 	unsigned long end;
1788 	u32 item_size;
1789 	int size;
1790 	int type;
1791 	u64 refs;
1792 
1793 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1794 	refs = btrfs_extent_refs(leaf, ei);
1795 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1796 	refs += refs_to_mod;
1797 	btrfs_set_extent_refs(leaf, ei, refs);
1798 	if (extent_op)
1799 		__run_delayed_extent_op(extent_op, leaf, ei);
1800 
1801 	/*
1802 	 * If type is invalid, we should have bailed out after
1803 	 * lookup_inline_extent_backref().
1804 	 */
1805 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1806 	ASSERT(type != BTRFS_REF_TYPE_INVALID);
1807 
1808 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1809 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1810 		refs = btrfs_extent_data_ref_count(leaf, dref);
1811 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1812 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1813 		refs = btrfs_shared_data_ref_count(leaf, sref);
1814 	} else {
1815 		refs = 1;
1816 		BUG_ON(refs_to_mod != -1);
1817 	}
1818 
1819 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1820 	refs += refs_to_mod;
1821 
1822 	if (refs > 0) {
1823 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1824 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1825 		else
1826 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1827 	} else {
1828 		*last_ref = 1;
1829 		size = btrfs_extent_inline_ref_size(type);
1830 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1831 		ptr = (unsigned long)iref;
1832 		end = (unsigned long)ei + item_size;
1833 		if (ptr + size < end)
1834 			memmove_extent_buffer(leaf, ptr, ptr + size,
1835 					      end - ptr - size);
1836 		item_size -= size;
1837 		btrfs_truncate_item(fs_info, path, item_size, 1);
1838 	}
1839 	btrfs_mark_buffer_dirty(leaf);
1840 }
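
/*
 * Illustrative sketch of the inline ref removal above (layout only, assuming
 * an extent item carrying three inline refs A, B and C, with B being dropped):
 *
 *	before:   [btrfs_extent_item][A][B][C]
 *	memmove:  [btrfs_extent_item][A][C][C]   (C copied over B)
 *	truncate: [btrfs_extent_item][A][C]      (item shrunk by the size of B)
 *
 * The memmove is skipped when the removed ref is already the last one in the
 * item, in which case truncating the item is enough.
 */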
1841 
1842 static noinline_for_stack
1843 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1844 				 struct btrfs_path *path,
1845 				 u64 bytenr, u64 num_bytes, u64 parent,
1846 				 u64 root_objectid, u64 owner,
1847 				 u64 offset, int refs_to_add,
1848 				 struct btrfs_delayed_extent_op *extent_op)
1849 {
1850 	struct btrfs_extent_inline_ref *iref;
1851 	int ret;
1852 
1853 	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1854 					   num_bytes, parent, root_objectid,
1855 					   owner, offset, 1);
1856 	if (ret == 0) {
1857 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1858 		update_inline_extent_backref(path, iref, refs_to_add,
1859 					     extent_op, NULL);
1860 	} else if (ret == -ENOENT) {
1861 		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1862 					    root_objectid, owner, offset,
1863 					    refs_to_add, extent_op);
1864 		ret = 0;
1865 	}
1866 	return ret;
1867 }
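
/*
 * Illustrative sketch of how the return value above is consumed; this mirrors
 * __btrfs_inc_extent_ref() further down, and the names are taken from that
 * caller:
 *
 *	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
 *					   parent, root_objectid, owner,
 *					   offset, refs_to_add, extent_op);
 *	if ((ret < 0 && ret != -EAGAIN) || !ret)
 *		goto out;
 *	// -EAGAIN: no room for an inline ref, add a keyed backref instead
 */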
1868 
1869 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1870 				 struct btrfs_path *path,
1871 				 u64 bytenr, u64 parent, u64 root_objectid,
1872 				 u64 owner, u64 offset, int refs_to_add)
1873 {
1874 	int ret;
1875 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1876 		BUG_ON(refs_to_add != 1);
1877 		ret = insert_tree_block_ref(trans, path, bytenr, parent,
1878 					    root_objectid);
1879 	} else {
1880 		ret = insert_extent_data_ref(trans, path, bytenr, parent,
1881 					     root_objectid, owner, offset,
1882 					     refs_to_add);
1883 	}
1884 	return ret;
1885 }
1886 
1887 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1888 				 struct btrfs_path *path,
1889 				 struct btrfs_extent_inline_ref *iref,
1890 				 int refs_to_drop, int is_data, int *last_ref)
1891 {
1892 	int ret = 0;
1893 
1894 	BUG_ON(!is_data && refs_to_drop != 1);
1895 	if (iref) {
1896 		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1897 					     last_ref);
1898 	} else if (is_data) {
1899 		ret = remove_extent_data_ref(trans, path, refs_to_drop,
1900 					     last_ref);
1901 	} else {
1902 		*last_ref = 1;
1903 		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1904 	}
1905 	return ret;
1906 }
1907 
1908 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1909 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1910 			       u64 *discarded_bytes)
1911 {
1912 	int j, ret = 0;
1913 	u64 bytes_left, end;
1914 	u64 aligned_start = ALIGN(start, 1 << 9);
1915 
1916 	if (WARN_ON(start != aligned_start)) {
1917 		len -= aligned_start - start;
1918 		len = round_down(len, 1 << 9);
1919 		start = aligned_start;
1920 	}
1921 
1922 	*discarded_bytes = 0;
1923 
1924 	if (!len)
1925 		return 0;
1926 
1927 	end = start + len;
1928 	bytes_left = len;
1929 
1930 	/* Skip any superblocks on this device. */
1931 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1932 		u64 sb_start = btrfs_sb_offset(j);
1933 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1934 		u64 size = sb_start - start;
1935 
1936 		if (!in_range(sb_start, start, bytes_left) &&
1937 		    !in_range(sb_end, start, bytes_left) &&
1938 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1939 			continue;
1940 
1941 		/*
1942 		 * Superblock spans beginning of range.  Adjust start and
1943 		 * try again.
1944 		 */
1945 		if (sb_start <= start) {
1946 			start += sb_end - start;
1947 			if (start > end) {
1948 				bytes_left = 0;
1949 				break;
1950 			}
1951 			bytes_left = end - start;
1952 			continue;
1953 		}
1954 
1955 		if (size) {
1956 			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1957 						   GFP_NOFS, 0);
1958 			if (!ret)
1959 				*discarded_bytes += size;
1960 			else if (ret != -EOPNOTSUPP)
1961 				return ret;
1962 		}
1963 
1964 		start = sb_end;
1965 		if (start > end) {
1966 			bytes_left = 0;
1967 			break;
1968 		}
1969 		bytes_left = end - start;
1970 	}
1971 
1972 	if (bytes_left) {
1973 		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1974 					   GFP_NOFS, 0);
1975 		if (!ret)
1976 			*discarded_bytes += bytes_left;
1977 	}
1978 	return ret;
1979 }
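
/*
 * Worked example for the superblock skipping above (numbers assume the usual
 * 64KiB primary superblock offset and a 4KiB BTRFS_SUPER_INFO_SIZE):
 * discarding [0, 1MiB) on a device is split into two requests, [0, 64KiB) and
 * [68KiB, 1MiB), so the range holding the primary superblock copy is never
 * trimmed.  Offsets and lengths are converted to 512-byte sectors (the ">> 9"
 * shifts) before being handed to blkdev_issue_discard().
 */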
1980 
1981 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1982 			 u64 num_bytes, u64 *actual_bytes)
1983 {
1984 	int ret;
1985 	u64 discarded_bytes = 0;
1986 	struct btrfs_bio *bbio = NULL;
1987 
1988 
1989 	/*
1990 	 * Avoid races with device replace and make sure our bbio has devices
1991 	 * associated with its stripes that don't go away while we are discarding.
1992 	 */
1993 	btrfs_bio_counter_inc_blocked(fs_info);
1994 	/* Tell the block device(s) that the sectors can be discarded */
1995 	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1996 			      &bbio, 0);
1997 	/* Error condition is -ENOMEM */
1998 	if (!ret) {
1999 		struct btrfs_bio_stripe *stripe = bbio->stripes;
2000 		int i;
2001 
2002 
2003 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2004 			u64 bytes;
2005 			struct request_queue *req_q;
2006 
2007 			if (!stripe->dev->bdev) {
2008 				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2009 				continue;
2010 			}
2011 			req_q = bdev_get_queue(stripe->dev->bdev);
2012 			if (!blk_queue_discard(req_q))
2013 				continue;
2014 
2015 			ret = btrfs_issue_discard(stripe->dev->bdev,
2016 						  stripe->physical,
2017 						  stripe->length,
2018 						  &bytes);
2019 			if (!ret)
2020 				discarded_bytes += bytes;
2021 			else if (ret != -EOPNOTSUPP)
2022 				break; /* Logic errors or -ENOMEM; -EIO should not happen here */
2023 
2024 			/*
2025 			 * If we get back EOPNOTSUPP for some reason, just
2026 			 * ignore the return value so we don't confuse
2027 			 * callers of discard_extent.
2028 			 */
2029 			ret = 0;
2030 		}
2031 		btrfs_put_bbio(bbio);
2032 	}
2033 	btrfs_bio_counter_dec(fs_info);
2034 
2035 	if (actual_bytes)
2036 		*actual_bytes = discarded_bytes;
2037 
2038 
2039 	if (ret == -EOPNOTSUPP)
2040 		ret = 0;
2041 	return ret;
2042 }
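
/*
 * Usage sketch (variable names are illustrative): callers that don't care
 * about the trimmed byte count pass NULL for @actual_bytes, while an
 * fstrim-style caller accumulates it:
 *
 *	u64 bytes = 0;
 *
 *	ret = btrfs_discard_extent(fs_info, start, len, &bytes);
 *	if (!ret)
 *		trimmed += bytes;
 */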
2043 
2044 /* Can return -ENOMEM */
2045 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2046 			 struct btrfs_root *root,
2047 			 u64 bytenr, u64 num_bytes, u64 parent,
2048 			 u64 root_objectid, u64 owner, u64 offset)
2049 {
2050 	struct btrfs_fs_info *fs_info = root->fs_info;
2051 	int old_ref_mod, new_ref_mod;
2052 	int ret;
2053 
2054 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2055 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
2056 
2057 	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2058 			   owner, offset, BTRFS_ADD_DELAYED_REF);
2059 
2060 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2061 		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2062 						 num_bytes, parent,
2063 						 root_objectid, (int)owner,
2064 						 BTRFS_ADD_DELAYED_REF, NULL,
2065 						 &old_ref_mod, &new_ref_mod);
2066 	} else {
2067 		ret = btrfs_add_delayed_data_ref(trans, bytenr,
2068 						 num_bytes, parent,
2069 						 root_objectid, owner, offset,
2070 						 0, BTRFS_ADD_DELAYED_REF,
2071 						 &old_ref_mod, &new_ref_mod);
2072 	}
2073 
2074 	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2075 		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2076 
2077 		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2078 	}
2079 
2080 	return ret;
2081 }
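
/*
 * Illustrative example of the two call shapes (the values are hypothetical):
 * for a data extent owned by inode 257 in the top-level subvolume, the owner
 * is the inode number and the offset is the file offset,
 *
 *	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 *				   BTRFS_FS_TREE_OBJECTID, 257, file_offset);
 *
 * while for a tree block the owner is the block's level and the offset is 0.
 * A non-zero @parent is only used for shared (full backref) extents.
 */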
2082 
2083 /*
2084  * __btrfs_inc_extent_ref - insert backreference for a given extent
2085  *
2086  * @trans:	    Handle of transaction
2087  *
2088  * @node:	    The delayed ref node used to get the bytenr/length for
2089  *		    extent whose references are incremented.
2090  *
2091  * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2092  *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2093  *		    bytenr of the parent block. Since new extents are always
2094  *		    created with indirect references, this will only be the case
2095  *		    when relocating a shared extent. In that case, root_objectid
2096  *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
2097  *		    be 0
2098  *
2099  * @root_objectid:  The id of the root where this modification has originated,
2100  *		    this can be either one of the well-known metadata trees or
2101  *		    the subvolume id which references this extent.
2102  *
2103  * @owner:	    For data extents it is the inode number of the owning file.
2104  *		    For metadata extents this parameter holds the level in the
2105  *		    tree of the extent.
2106  *
2107  * @offset:	    For metadata extents the offset is ignored and is currently
2108  *		    always passed as 0. For data extents it is the fileoffset
2109  *		    this extent belongs to.
2110  *
2111  * @refs_to_add     Number of references to add
2112  *
2113  * @extent_op       Pointer to a structure, holding information necessary when
2114  *                  updating a tree block's flags
2115  *
2116  */
2117 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2118 				  struct btrfs_delayed_ref_node *node,
2119 				  u64 parent, u64 root_objectid,
2120 				  u64 owner, u64 offset, int refs_to_add,
2121 				  struct btrfs_delayed_extent_op *extent_op)
2122 {
2123 	struct btrfs_path *path;
2124 	struct extent_buffer *leaf;
2125 	struct btrfs_extent_item *item;
2126 	struct btrfs_key key;
2127 	u64 bytenr = node->bytenr;
2128 	u64 num_bytes = node->num_bytes;
2129 	u64 refs;
2130 	int ret;
2131 
2132 	path = btrfs_alloc_path();
2133 	if (!path)
2134 		return -ENOMEM;
2135 
2136 	path->reada = READA_FORWARD;
2137 	path->leave_spinning = 1;
2138 	/* this will setup the path even if it fails to insert the back ref */
2139 	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2140 					   parent, root_objectid, owner,
2141 					   offset, refs_to_add, extent_op);
2142 	if ((ret < 0 && ret != -EAGAIN) || !ret)
2143 		goto out;
2144 
2145 	/*
2146 	 * OK, we got -EAGAIN, which means we didn't have space to insert an
2147 	 * inline extent ref, so just update the reference count and add a
2148 	 * normal backref.
2149 	 */
2150 	leaf = path->nodes[0];
2151 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2152 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2153 	refs = btrfs_extent_refs(leaf, item);
2154 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2155 	if (extent_op)
2156 		__run_delayed_extent_op(extent_op, leaf, item);
2157 
2158 	btrfs_mark_buffer_dirty(leaf);
2159 	btrfs_release_path(path);
2160 
2161 	path->reada = READA_FORWARD;
2162 	path->leave_spinning = 1;
2163 	/* now insert the actual backref */
2164 	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2165 				    owner, offset, refs_to_add);
2166 	if (ret)
2167 		btrfs_abort_transaction(trans, ret);
2168 out:
2169 	btrfs_free_path(path);
2170 	return ret;
2171 }
2172 
2173 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2174 				struct btrfs_delayed_ref_node *node,
2175 				struct btrfs_delayed_extent_op *extent_op,
2176 				int insert_reserved)
2177 {
2178 	int ret = 0;
2179 	struct btrfs_delayed_data_ref *ref;
2180 	struct btrfs_key ins;
2181 	u64 parent = 0;
2182 	u64 ref_root = 0;
2183 	u64 flags = 0;
2184 
2185 	ins.objectid = node->bytenr;
2186 	ins.offset = node->num_bytes;
2187 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2188 
2189 	ref = btrfs_delayed_node_to_data_ref(node);
2190 	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2191 
2192 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2193 		parent = ref->parent;
2194 	ref_root = ref->root;
2195 
2196 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2197 		if (extent_op)
2198 			flags |= extent_op->flags_to_set;
2199 		ret = alloc_reserved_file_extent(trans, parent, ref_root,
2200 						 flags, ref->objectid,
2201 						 ref->offset, &ins,
2202 						 node->ref_mod);
2203 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2204 		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2205 					     ref->objectid, ref->offset,
2206 					     node->ref_mod, extent_op);
2207 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2208 		ret = __btrfs_free_extent(trans, node, parent,
2209 					  ref_root, ref->objectid,
2210 					  ref->offset, node->ref_mod,
2211 					  extent_op);
2212 	} else {
2213 		BUG();
2214 	}
2215 	return ret;
2216 }
2217 
2218 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2219 				    struct extent_buffer *leaf,
2220 				    struct btrfs_extent_item *ei)
2221 {
2222 	u64 flags = btrfs_extent_flags(leaf, ei);
2223 	if (extent_op->update_flags) {
2224 		flags |= extent_op->flags_to_set;
2225 		btrfs_set_extent_flags(leaf, ei, flags);
2226 	}
2227 
2228 	if (extent_op->update_key) {
2229 		struct btrfs_tree_block_info *bi;
2230 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2231 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2232 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2233 	}
2234 }
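
/*
 * Illustrative sketch of the kind of extent_op this helper consumes, e.g.
 * when a tree block needs its FULL_BACKREF flag set (compare
 * btrfs_set_disk_extent_flags() further down; the stack initializer and the
 * "level" name here are illustrative only, real callers allocate the op with
 * btrfs_alloc_delayed_extent_op()):
 *
 *	struct btrfs_delayed_extent_op op = {
 *		.flags_to_set	= BTRFS_BLOCK_FLAG_FULL_BACKREF,
 *		.update_flags	= true,
 *		.update_key	= false,
 *		.is_data	= false,
 *		.level		= level,
 *	};
 */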
2235 
2236 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2237 				 struct btrfs_delayed_ref_head *head,
2238 				 struct btrfs_delayed_extent_op *extent_op)
2239 {
2240 	struct btrfs_fs_info *fs_info = trans->fs_info;
2241 	struct btrfs_key key;
2242 	struct btrfs_path *path;
2243 	struct btrfs_extent_item *ei;
2244 	struct extent_buffer *leaf;
2245 	u32 item_size;
2246 	int ret;
2247 	int err = 0;
2248 	int metadata = !extent_op->is_data;
2249 
2250 	if (trans->aborted)
2251 		return 0;
2252 
2253 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2254 		metadata = 0;
2255 
2256 	path = btrfs_alloc_path();
2257 	if (!path)
2258 		return -ENOMEM;
2259 
2260 	key.objectid = head->bytenr;
2261 
2262 	if (metadata) {
2263 		key.type = BTRFS_METADATA_ITEM_KEY;
2264 		key.offset = extent_op->level;
2265 	} else {
2266 		key.type = BTRFS_EXTENT_ITEM_KEY;
2267 		key.offset = head->num_bytes;
2268 	}
2269 
2270 again:
2271 	path->reada = READA_FORWARD;
2272 	path->leave_spinning = 1;
2273 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2274 	if (ret < 0) {
2275 		err = ret;
2276 		goto out;
2277 	}
2278 	if (ret > 0) {
2279 		if (metadata) {
2280 			if (path->slots[0] > 0) {
2281 				path->slots[0]--;
2282 				btrfs_item_key_to_cpu(path->nodes[0], &key,
2283 						      path->slots[0]);
2284 				if (key.objectid == head->bytenr &&
2285 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
2286 				    key.offset == head->num_bytes)
2287 					ret = 0;
2288 			}
2289 			if (ret > 0) {
2290 				btrfs_release_path(path);
2291 				metadata = 0;
2292 
2293 				key.objectid = head->bytenr;
2294 				key.offset = head->num_bytes;
2295 				key.type = BTRFS_EXTENT_ITEM_KEY;
2296 				goto again;
2297 			}
2298 		} else {
2299 			err = -EIO;
2300 			goto out;
2301 		}
2302 	}
2303 
2304 	leaf = path->nodes[0];
2305 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2306 
2307 	if (unlikely(item_size < sizeof(*ei))) {
2308 		err = -EINVAL;
2309 		btrfs_print_v0_err(fs_info);
2310 		btrfs_abort_transaction(trans, err);
2311 		goto out;
2312 	}
2313 
2314 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2315 	__run_delayed_extent_op(extent_op, leaf, ei);
2316 
2317 	btrfs_mark_buffer_dirty(leaf);
2318 out:
2319 	btrfs_free_path(path);
2320 	return err;
2321 }
2322 
2323 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2324 				struct btrfs_delayed_ref_node *node,
2325 				struct btrfs_delayed_extent_op *extent_op,
2326 				int insert_reserved)
2327 {
2328 	int ret = 0;
2329 	struct btrfs_delayed_tree_ref *ref;
2330 	u64 parent = 0;
2331 	u64 ref_root = 0;
2332 
2333 	ref = btrfs_delayed_node_to_tree_ref(node);
2334 	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2335 
2336 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2337 		parent = ref->parent;
2338 	ref_root = ref->root;
2339 
2340 	if (node->ref_mod != 1) {
2341 		btrfs_err(trans->fs_info,
2342 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2343 			  node->bytenr, node->ref_mod, node->action, ref_root,
2344 			  parent);
2345 		return -EIO;
2346 	}
2347 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2348 		BUG_ON(!extent_op || !extent_op->update_flags);
2349 		ret = alloc_reserved_tree_block(trans, node, extent_op);
2350 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2351 		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2352 					     ref->level, 0, 1, extent_op);
2353 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2354 		ret = __btrfs_free_extent(trans, node, parent, ref_root,
2355 					  ref->level, 0, 1, extent_op);
2356 	} else {
2357 		BUG();
2358 	}
2359 	return ret;
2360 }
2361 
2362 /* helper function to actually process a single delayed ref entry */
2363 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2364 			       struct btrfs_delayed_ref_node *node,
2365 			       struct btrfs_delayed_extent_op *extent_op,
2366 			       int insert_reserved)
2367 {
2368 	int ret = 0;
2369 
2370 	if (trans->aborted) {
2371 		if (insert_reserved)
2372 			btrfs_pin_extent(trans->fs_info, node->bytenr,
2373 					 node->num_bytes, 1);
2374 		return 0;
2375 	}
2376 
2377 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2378 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2379 		ret = run_delayed_tree_ref(trans, node, extent_op,
2380 					   insert_reserved);
2381 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2382 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2383 		ret = run_delayed_data_ref(trans, node, extent_op,
2384 					   insert_reserved);
2385 	else
2386 		BUG();
2387 	if (ret && insert_reserved)
2388 		btrfs_pin_extent(trans->fs_info, node->bytenr,
2389 				 node->num_bytes, 1);
2390 	return ret;
2391 }
2392 
2393 static inline struct btrfs_delayed_ref_node *
2394 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2395 {
2396 	struct btrfs_delayed_ref_node *ref;
2397 
2398 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2399 		return NULL;
2400 
2401 	/*
2402 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2403 	 * This is to prevent a ref count from going down to zero, which deletes
2404 	 * the extent item from the extent tree, when there still are references
2405 	 * to add, which would fail because they would not find the extent item.
2406 	 */
2407 	if (!list_empty(&head->ref_add_list))
2408 		return list_first_entry(&head->ref_add_list,
2409 				struct btrfs_delayed_ref_node, add_list);
2410 
2411 	ref = rb_entry(rb_first_cached(&head->ref_tree),
2412 		       struct btrfs_delayed_ref_node, ref_node);
2413 	ASSERT(list_empty(&ref->add_list));
2414 	return ref;
2415 }
2416 
2417 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2418 				      struct btrfs_delayed_ref_head *head)
2419 {
2420 	spin_lock(&delayed_refs->lock);
2421 	head->processing = 0;
2422 	delayed_refs->num_heads_ready++;
2423 	spin_unlock(&delayed_refs->lock);
2424 	btrfs_delayed_ref_unlock(head);
2425 }
2426 
2427 static struct btrfs_delayed_extent_op *cleanup_extent_op(
2428 				struct btrfs_delayed_ref_head *head)
2429 {
2430 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2431 
2432 	if (!extent_op)
2433 		return NULL;
2434 
2435 	if (head->must_insert_reserved) {
2436 		head->extent_op = NULL;
2437 		btrfs_free_delayed_extent_op(extent_op);
2438 		return NULL;
2439 	}
2440 	return extent_op;
2441 }
2442 
2443 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2444 				     struct btrfs_delayed_ref_head *head)
2445 {
2446 	struct btrfs_delayed_extent_op *extent_op;
2447 	int ret;
2448 
2449 	extent_op = cleanup_extent_op(head);
2450 	if (!extent_op)
2451 		return 0;
2452 	head->extent_op = NULL;
2453 	spin_unlock(&head->lock);
2454 	ret = run_delayed_extent_op(trans, head, extent_op);
2455 	btrfs_free_delayed_extent_op(extent_op);
2456 	return ret ? ret : 1;
2457 }
2458 
2459 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2460 				  struct btrfs_delayed_ref_root *delayed_refs,
2461 				  struct btrfs_delayed_ref_head *head)
2462 {
2463 	int nr_items = 1;	/* Dropping this ref head update. */
2464 
2465 	if (head->total_ref_mod < 0) {
2466 		struct btrfs_space_info *space_info;
2467 		u64 flags;
2468 
2469 		if (head->is_data)
2470 			flags = BTRFS_BLOCK_GROUP_DATA;
2471 		else if (head->is_system)
2472 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
2473 		else
2474 			flags = BTRFS_BLOCK_GROUP_METADATA;
2475 		space_info = __find_space_info(fs_info, flags);
2476 		ASSERT(space_info);
2477 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
2478 				   -head->num_bytes,
2479 				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
2480 
2481 		/*
2482 		 * We had csum deletions accounted for in our delayed refs rsv,
2483 		 * we need to drop the csum leaves for this update from our
2484 		 * delayed_refs_rsv.
2485 		 */
2486 		if (head->is_data) {
2487 			spin_lock(&delayed_refs->lock);
2488 			delayed_refs->pending_csums -= head->num_bytes;
2489 			spin_unlock(&delayed_refs->lock);
2490 			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2491 				head->num_bytes);
2492 		}
2493 	}
2494 
2495 	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2496 }
2497 
2498 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2499 			    struct btrfs_delayed_ref_head *head)
2500 {
2501 
2502 	struct btrfs_fs_info *fs_info = trans->fs_info;
2503 	struct btrfs_delayed_ref_root *delayed_refs;
2504 	int ret;
2505 
2506 	delayed_refs = &trans->transaction->delayed_refs;
2507 
2508 	ret = run_and_cleanup_extent_op(trans, head);
2509 	if (ret < 0) {
2510 		unselect_delayed_ref_head(delayed_refs, head);
2511 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2512 		return ret;
2513 	} else if (ret) {
2514 		return ret;
2515 	}
2516 
2517 	/*
2518 	 * Need to drop our head ref lock and re-acquire the delayed ref lock
2519 	 * and then re-check to make sure nobody got added.
2520 	 */
2521 	spin_unlock(&head->lock);
2522 	spin_lock(&delayed_refs->lock);
2523 	spin_lock(&head->lock);
2524 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2525 		spin_unlock(&head->lock);
2526 		spin_unlock(&delayed_refs->lock);
2527 		return 1;
2528 	}
2529 	btrfs_delete_ref_head(delayed_refs, head);
2530 	spin_unlock(&head->lock);
2531 	spin_unlock(&delayed_refs->lock);
2532 
2533 	if (head->must_insert_reserved) {
2534 		btrfs_pin_extent(fs_info, head->bytenr,
2535 				 head->num_bytes, 1);
2536 		if (head->is_data) {
2537 			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2538 					      head->num_bytes);
2539 		}
2540 	}
2541 
2542 	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2543 
2544 	trace_run_delayed_ref_head(fs_info, head, 0);
2545 	btrfs_delayed_ref_unlock(head);
2546 	btrfs_put_delayed_ref_head(head);
2547 	return 0;
2548 }
2549 
2550 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2551 					struct btrfs_trans_handle *trans)
2552 {
2553 	struct btrfs_delayed_ref_root *delayed_refs =
2554 		&trans->transaction->delayed_refs;
2555 	struct btrfs_delayed_ref_head *head = NULL;
2556 	int ret;
2557 
2558 	spin_lock(&delayed_refs->lock);
2559 	head = btrfs_select_ref_head(delayed_refs);
2560 	if (!head) {
2561 		spin_unlock(&delayed_refs->lock);
2562 		return head;
2563 	}
2564 
2565 	/*
2566 	 * Grab the lock that says we are going to process all the refs for
2567 	 * this head
2568 	 */
2569 	ret = btrfs_delayed_ref_lock(delayed_refs, head);
2570 	spin_unlock(&delayed_refs->lock);
2571 
2572 	/*
2573 	 * We may have dropped the spin lock to get the head mutex lock, and
2574 	 * that might have given someone else time to free the head.  If that's
2575 	 * true, it has been removed from our list and we can move on.
2576 	 */
2577 	if (ret == -EAGAIN)
2578 		head = ERR_PTR(-EAGAIN);
2579 
2580 	return head;
2581 }
2582 
2583 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2584 				    struct btrfs_delayed_ref_head *locked_ref,
2585 				    unsigned long *run_refs)
2586 {
2587 	struct btrfs_fs_info *fs_info = trans->fs_info;
2588 	struct btrfs_delayed_ref_root *delayed_refs;
2589 	struct btrfs_delayed_extent_op *extent_op;
2590 	struct btrfs_delayed_ref_node *ref;
2591 	int must_insert_reserved = 0;
2592 	int ret;
2593 
2594 	delayed_refs = &trans->transaction->delayed_refs;
2595 
2596 	lockdep_assert_held(&locked_ref->mutex);
2597 	lockdep_assert_held(&locked_ref->lock);
2598 
2599 	while ((ref = select_delayed_ref(locked_ref))) {
2600 		if (ref->seq &&
2601 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
2602 			spin_unlock(&locked_ref->lock);
2603 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2604 			return -EAGAIN;
2605 		}
2606 
2607 		(*run_refs)++;
2608 		ref->in_tree = 0;
2609 		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2610 		RB_CLEAR_NODE(&ref->ref_node);
2611 		if (!list_empty(&ref->add_list))
2612 			list_del(&ref->add_list);
2613 		/*
2614 		 * When we play the delayed ref, also correct the ref_mod on
2615 		 * head
2616 		 */
2617 		switch (ref->action) {
2618 		case BTRFS_ADD_DELAYED_REF:
2619 		case BTRFS_ADD_DELAYED_EXTENT:
2620 			locked_ref->ref_mod -= ref->ref_mod;
2621 			break;
2622 		case BTRFS_DROP_DELAYED_REF:
2623 			locked_ref->ref_mod += ref->ref_mod;
2624 			break;
2625 		default:
2626 			WARN_ON(1);
2627 		}
2628 		atomic_dec(&delayed_refs->num_entries);
2629 
2630 		/*
2631 		 * Record the must_insert_reserved flag before we drop the
2632 		 * spin lock.
2633 		 */
2634 		must_insert_reserved = locked_ref->must_insert_reserved;
2635 		locked_ref->must_insert_reserved = 0;
2636 
2637 		extent_op = locked_ref->extent_op;
2638 		locked_ref->extent_op = NULL;
2639 		spin_unlock(&locked_ref->lock);
2640 
2641 		ret = run_one_delayed_ref(trans, ref, extent_op,
2642 					  must_insert_reserved);
2643 
2644 		btrfs_free_delayed_extent_op(extent_op);
2645 		if (ret) {
2646 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2647 			btrfs_put_delayed_ref(ref);
2648 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2649 				    ret);
2650 			return ret;
2651 		}
2652 
2653 		btrfs_put_delayed_ref(ref);
2654 		cond_resched();
2655 
2656 		spin_lock(&locked_ref->lock);
2657 		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2658 	}
2659 
2660 	return 0;
2661 }
2662 
2663 /*
2664  * Returns 0 on success or if called with an already aborted transaction.
2665  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2666  */
2667 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2668 					     unsigned long nr)
2669 {
2670 	struct btrfs_fs_info *fs_info = trans->fs_info;
2671 	struct btrfs_delayed_ref_root *delayed_refs;
2672 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2673 	ktime_t start = ktime_get();
2674 	int ret;
2675 	unsigned long count = 0;
2676 	unsigned long actual_count = 0;
2677 
2678 	delayed_refs = &trans->transaction->delayed_refs;
2679 	do {
2680 		if (!locked_ref) {
2681 			locked_ref = btrfs_obtain_ref_head(trans);
2682 			if (IS_ERR_OR_NULL(locked_ref)) {
2683 				if (PTR_ERR(locked_ref) == -EAGAIN) {
2684 					continue;
2685 				} else {
2686 					break;
2687 				}
2688 			}
2689 			count++;
2690 		}
2691 		/*
2692 		 * We need to try and merge add/drops of the same ref since we
2693 		 * can run into issues with relocate dropping the implicit ref
2694 		 * and then it being added back again before the drop can
2695 		 * finish.  If we merged anything we need to re-loop so we can
2696 		 * get a good ref.
2697 		 * Or we can get node references of the same type that weren't
2698 		 * merged when created due to bumps in the tree mod seq, and
2699 		 * we need to merge them to prevent adding an inline extent
2700 		 * backref before dropping it (triggering a BUG_ON at
2701 		 * insert_inline_extent_backref()).
2702 		 */
2703 		spin_lock(&locked_ref->lock);
2704 		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2705 
2706 		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2707 						      &actual_count);
2708 		if (ret < 0 && ret != -EAGAIN) {
2709 			/*
2710 			 * Error, btrfs_run_delayed_refs_for_head already
2711 			 * unlocked everything so just bail out
2712 			 */
2713 			return ret;
2714 		} else if (!ret) {
2715 			/*
2716 			 * Success, perform the usual cleanup of a processed
2717 			 * head
2718 			 */
2719 			ret = cleanup_ref_head(trans, locked_ref);
2720 			if (ret > 0) {
2721 				/* We dropped our lock, we need to loop. */
2722 				ret = 0;
2723 				continue;
2724 			} else if (ret) {
2725 				return ret;
2726 			}
2727 		}
2728 
2729 		/*
2730 		 * Either success case or btrfs_run_delayed_refs_for_head
2731 		 * returned -EAGAIN, meaning we need to select another head
2732 		 */
2733 
2734 		locked_ref = NULL;
2735 		cond_resched();
2736 	} while ((nr != -1 && count < nr) || locked_ref);
2737 
2738 	/*
2739 	 * We don't want to include ref heads since we can have empty ref heads
2740 	 * and those will drastically skew our runtime down since we just do
2741 	 * accounting, no actual extent tree updates.
2742 	 */
2743 	if (actual_count > 0) {
2744 		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2745 		u64 avg;
2746 
2747 		/*
2748 		 * We weigh the current average higher than our current runtime
2749 		 * to avoid large swings in the average.
2750 		 */
2751 		spin_lock(&delayed_refs->lock);
2752 		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2753 		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
2754 		spin_unlock(&delayed_refs->lock);
2755 	}
2756 	return 0;
2757 }
2758 
2759 #ifdef SCRAMBLE_DELAYED_REFS
2760 /*
2761  * Normally delayed refs get processed in ascending bytenr order. This
2762  * correlates in most cases to the order added. To expose dependencies on this
2763  * order, we start to process the tree in the middle instead of the beginning
2764  */
2765 static u64 find_middle(struct rb_root *root)
2766 {
2767 	struct rb_node *n = root->rb_node;
2768 	struct btrfs_delayed_ref_node *entry;
2769 	int alt = 1;
2770 	u64 middle;
2771 	u64 first = 0, last = 0;
2772 
2773 	n = rb_first(root);
2774 	if (n) {
2775 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2776 		first = entry->bytenr;
2777 	}
2778 	n = rb_last(root);
2779 	if (n) {
2780 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2781 		last = entry->bytenr;
2782 	}
2783 	n = root->rb_node;
2784 
2785 	while (n) {
2786 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2787 		WARN_ON(!entry->in_tree);
2788 
2789 		middle = entry->bytenr;
2790 
2791 		if (alt)
2792 			n = n->rb_left;
2793 		else
2794 			n = n->rb_right;
2795 
2796 		alt = 1 - alt;
2797 	}
2798 	return middle;
2799 }
2800 #endif
2801 
2802 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2803 {
2804 	u64 num_bytes;
2805 
2806 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2807 			     sizeof(struct btrfs_extent_inline_ref));
2808 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2809 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2810 
2811 	/*
2812 	 * We don't ever fill up leaves all the way, so treat the result as a
2813 	 * rough estimate of the number of leaves these heads will need.
2814 	 */
2815 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2816 }
2817 
2818 /*
2819  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2820  * would require to store the csums for that many bytes.
2821  */
2822 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2823 {
2824 	u64 csum_size;
2825 	u64 num_csums_per_leaf;
2826 	u64 num_csums;
2827 
2828 	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2829 	num_csums_per_leaf = div64_u64(csum_size,
2830 			(u64)btrfs_super_csum_size(fs_info->super_copy));
2831 	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2832 	num_csums += num_csums_per_leaf - 1;
2833 	num_csums = div64_u64(num_csums, num_csums_per_leaf);
2834 	return num_csums;
2835 }
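
/*
 * Worked example (approximate, assuming 4KiB sectors, 4-byte crc32c csums and
 * a 16KiB nodesize): csum_size is a bit under 16KiB, so one leaf holds
 * roughly 4000 csums.  Checksumming 1GiB of data needs 1GiB / 4KiB = 262144
 * csums, i.e. roughly 65 leaves.
 */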
2836 
2837 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2838 {
2839 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2840 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2841 	bool ret = false;
2842 	u64 reserved;
2843 
2844 	spin_lock(&global_rsv->lock);
2845 	reserved = global_rsv->reserved;
2846 	spin_unlock(&global_rsv->lock);
2847 
2848 	/*
2849 	 * Since the global reserve is just kind of magic we don't really want
2850 	 * to rely on it to save our bacon, so if our size is more than the
2851 	 * delayed_refs_rsv and the global rsv then it's time to think about
2852 	 * bailing.
2853 	 */
2854 	spin_lock(&delayed_refs_rsv->lock);
2855 	reserved += delayed_refs_rsv->reserved;
2856 	if (delayed_refs_rsv->size >= reserved)
2857 		ret = true;
2858 	spin_unlock(&delayed_refs_rsv->lock);
2859 	return ret;
2860 }
2861 
2862 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2863 {
2864 	u64 num_entries =
2865 		atomic_read(&trans->transaction->delayed_refs.num_entries);
2866 	u64 avg_runtime;
2867 	u64 val;
2868 
2869 	smp_mb();
2870 	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2871 	val = num_entries * avg_runtime;
2872 	if (val >= NSEC_PER_SEC)
2873 		return 1;
2874 	if (val >= NSEC_PER_SEC / 2)
2875 		return 2;
2876 
2877 	return btrfs_check_space_for_delayed_refs(trans->fs_info);
2878 }
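
/*
 * Note on the thresholds above, with a purely hypothetical average: at a cost
 * of 2us per delayed ref entry, the estimated backlog crosses NSEC_PER_SEC / 2
 * at 250,000 queued entries (return value 2) and NSEC_PER_SEC at 500,000
 * entries (return value 1).
 */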
2879 
2880 struct async_delayed_refs {
2881 	struct btrfs_root *root;
2882 	u64 transid;
2883 	int count;
2884 	int error;
2885 	int sync;
2886 	struct completion wait;
2887 	struct btrfs_work work;
2888 };
2889 
2890 static inline struct async_delayed_refs *
2891 to_async_delayed_refs(struct btrfs_work *work)
2892 {
2893 	return container_of(work, struct async_delayed_refs, work);
2894 }
2895 
2896 static void delayed_ref_async_start(struct btrfs_work *work)
2897 {
2898 	struct async_delayed_refs *async = to_async_delayed_refs(work);
2899 	struct btrfs_trans_handle *trans;
2900 	struct btrfs_fs_info *fs_info = async->root->fs_info;
2901 	int ret;
2902 
2903 	/* if the commit is already started, we don't need to wait here */
2904 	if (btrfs_transaction_blocked(fs_info))
2905 		goto done;
2906 
2907 	trans = btrfs_join_transaction(async->root);
2908 	if (IS_ERR(trans)) {
2909 		async->error = PTR_ERR(trans);
2910 		goto done;
2911 	}
2912 
2913 	/*
2914 	 * trans->sync means that when we call end_transaction, we won't
2915 	 * wait on delayed refs
2916 	 */
2917 	trans->sync = true;
2918 
2919 	/* Don't bother flushing if we got into a different transaction */
2920 	if (trans->transid > async->transid)
2921 		goto end;
2922 
2923 	ret = btrfs_run_delayed_refs(trans, async->count);
2924 	if (ret)
2925 		async->error = ret;
2926 end:
2927 	ret = btrfs_end_transaction(trans);
2928 	if (ret && !async->error)
2929 		async->error = ret;
2930 done:
2931 	if (async->sync)
2932 		complete(&async->wait);
2933 	else
2934 		kfree(async);
2935 }
2936 
2937 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2938 				 unsigned long count, u64 transid, int wait)
2939 {
2940 	struct async_delayed_refs *async;
2941 	int ret;
2942 
2943 	async = kmalloc(sizeof(*async), GFP_NOFS);
2944 	if (!async)
2945 		return -ENOMEM;
2946 
2947 	async->root = fs_info->tree_root;
2948 	async->count = count;
2949 	async->error = 0;
2950 	async->transid = transid;
2951 	if (wait)
2952 		async->sync = 1;
2953 	else
2954 		async->sync = 0;
2955 	init_completion(&async->wait);
2956 
2957 	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2958 			delayed_ref_async_start, NULL, NULL);
2959 
2960 	btrfs_queue_work(fs_info->extent_workers, &async->work);
2961 
2962 	if (wait) {
2963 		wait_for_completion(&async->wait);
2964 		ret = async->error;
2965 		kfree(async);
2966 		return ret;
2967 	}
2968 	return 0;
2969 }
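
/*
 * Usage sketch: @wait selects between fire-and-forget and synchronous
 * behaviour (arguments other than @wait are illustrative):
 *
 *	// asynchronous; errors are only visible to the worker
 *	btrfs_async_run_delayed_refs(fs_info, count, transid, 0);
 *
 *	// synchronous; returns the error seen by the worker
 *	ret = btrfs_async_run_delayed_refs(fs_info, count, transid, 1);
 */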
2970 
2971 /*
2972  * this starts processing the delayed reference count updates and
2973  * extent insertions we have queued up so far.  count can be
2974  * 0, which means to process everything in the tree at the start
2975  * of the run (but not newly added entries), or it can be some target
2976  * number you'd like to process.
2977  *
2978  * Returns 0 on success or if called with an aborted transaction
2979  * Returns <0 on error and aborts the transaction
2980  */
2981 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2982 			   unsigned long count)
2983 {
2984 	struct btrfs_fs_info *fs_info = trans->fs_info;
2985 	struct rb_node *node;
2986 	struct btrfs_delayed_ref_root *delayed_refs;
2987 	struct btrfs_delayed_ref_head *head;
2988 	int ret;
2989 	int run_all = count == (unsigned long)-1;
2990 
2991 	/* We'll clean this up in btrfs_cleanup_transaction */
2992 	if (trans->aborted)
2993 		return 0;
2994 
2995 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2996 		return 0;
2997 
2998 	delayed_refs = &trans->transaction->delayed_refs;
2999 	if (count == 0)
3000 		count = atomic_read(&delayed_refs->num_entries) * 2;
3001 
3002 again:
3003 #ifdef SCRAMBLE_DELAYED_REFS
3004 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3005 #endif
3006 	ret = __btrfs_run_delayed_refs(trans, count);
3007 	if (ret < 0) {
3008 		btrfs_abort_transaction(trans, ret);
3009 		return ret;
3010 	}
3011 
3012 	if (run_all) {
3013 		btrfs_create_pending_block_groups(trans);
3014 
3015 		spin_lock(&delayed_refs->lock);
3016 		node = rb_first_cached(&delayed_refs->href_root);
3017 		if (!node) {
3018 			spin_unlock(&delayed_refs->lock);
3019 			goto out;
3020 		}
3021 		head = rb_entry(node, struct btrfs_delayed_ref_head,
3022 				href_node);
3023 		refcount_inc(&head->refs);
3024 		spin_unlock(&delayed_refs->lock);
3025 
3026 		/* Mutex was contended, block until it's released and retry. */
3027 		mutex_lock(&head->mutex);
3028 		mutex_unlock(&head->mutex);
3029 
3030 		btrfs_put_delayed_ref_head(head);
3031 		cond_resched();
3032 		goto again;
3033 	}
3034 out:
3035 	return 0;
3036 }
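
/*
 * Usage examples: the transaction commit path flushes everything, while
 * throttling callers only process a target number of heads ("nr" here is
 * illustrative):
 *
 *	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);	// run_all
 *	ret = btrfs_run_delayed_refs(trans, nr);		// partial run
 *
 * Passing 0 processes twice the number of entries currently queued, as set up
 * at the top of this function.
 */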
3037 
3038 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3039 				struct btrfs_fs_info *fs_info,
3040 				u64 bytenr, u64 num_bytes, u64 flags,
3041 				int level, int is_data)
3042 {
3043 	struct btrfs_delayed_extent_op *extent_op;
3044 	int ret;
3045 
3046 	extent_op = btrfs_alloc_delayed_extent_op();
3047 	if (!extent_op)
3048 		return -ENOMEM;
3049 
3050 	extent_op->flags_to_set = flags;
3051 	extent_op->update_flags = true;
3052 	extent_op->update_key = false;
3053 	extent_op->is_data = is_data ? true : false;
3054 	extent_op->level = level;
3055 
3056 	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3057 					  num_bytes, extent_op);
3058 	if (ret)
3059 		btrfs_free_delayed_extent_op(extent_op);
3060 	return ret;
3061 }
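
/*
 * Illustrative example, roughly mirroring how the COW path converts a tree
 * block to full backrefs (@buf is an illustrative extent buffer):
 *
 *	ret = btrfs_set_disk_extent_flags(trans, fs_info, buf->start, buf->len,
 *					  BTRFS_BLOCK_FLAG_FULL_BACKREF,
 *					  btrfs_header_level(buf), 0);
 */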
3062 
3063 static noinline int check_delayed_ref(struct btrfs_root *root,
3064 				      struct btrfs_path *path,
3065 				      u64 objectid, u64 offset, u64 bytenr)
3066 {
3067 	struct btrfs_delayed_ref_head *head;
3068 	struct btrfs_delayed_ref_node *ref;
3069 	struct btrfs_delayed_data_ref *data_ref;
3070 	struct btrfs_delayed_ref_root *delayed_refs;
3071 	struct btrfs_transaction *cur_trans;
3072 	struct rb_node *node;
3073 	int ret = 0;
3074 
3075 	spin_lock(&root->fs_info->trans_lock);
3076 	cur_trans = root->fs_info->running_transaction;
3077 	if (cur_trans)
3078 		refcount_inc(&cur_trans->use_count);
3079 	spin_unlock(&root->fs_info->trans_lock);
3080 	if (!cur_trans)
3081 		return 0;
3082 
3083 	delayed_refs = &cur_trans->delayed_refs;
3084 	spin_lock(&delayed_refs->lock);
3085 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3086 	if (!head) {
3087 		spin_unlock(&delayed_refs->lock);
3088 		btrfs_put_transaction(cur_trans);
3089 		return 0;
3090 	}
3091 
3092 	if (!mutex_trylock(&head->mutex)) {
3093 		refcount_inc(&head->refs);
3094 		spin_unlock(&delayed_refs->lock);
3095 
3096 		btrfs_release_path(path);
3097 
3098 		/*
3099 		 * Mutex was contended, block until it's released and let
3100 		 * caller try again
3101 		 */
3102 		mutex_lock(&head->mutex);
3103 		mutex_unlock(&head->mutex);
3104 		btrfs_put_delayed_ref_head(head);
3105 		btrfs_put_transaction(cur_trans);
3106 		return -EAGAIN;
3107 	}
3108 	spin_unlock(&delayed_refs->lock);
3109 
3110 	spin_lock(&head->lock);
3111 	/*
3112 	 * XXX: We should replace this with a proper search function in the
3113 	 * future.
3114 	 */
3115 	for (node = rb_first_cached(&head->ref_tree); node;
3116 	     node = rb_next(node)) {
3117 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3118 		/* If it's a shared ref we know a cross reference exists */
3119 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3120 			ret = 1;
3121 			break;
3122 		}
3123 
3124 		data_ref = btrfs_delayed_node_to_data_ref(ref);
3125 
3126 		/*
3127 		 * If our ref doesn't match the one we're currently looking at
3128 		 * then we have a cross reference.
3129 		 */
3130 		if (data_ref->root != root->root_key.objectid ||
3131 		    data_ref->objectid != objectid ||
3132 		    data_ref->offset != offset) {
3133 			ret = 1;
3134 			break;
3135 		}
3136 	}
3137 	spin_unlock(&head->lock);
3138 	mutex_unlock(&head->mutex);
3139 	btrfs_put_transaction(cur_trans);
3140 	return ret;
3141 }
3142 
3143 static noinline int check_committed_ref(struct btrfs_root *root,
3144 					struct btrfs_path *path,
3145 					u64 objectid, u64 offset, u64 bytenr)
3146 {
3147 	struct btrfs_fs_info *fs_info = root->fs_info;
3148 	struct btrfs_root *extent_root = fs_info->extent_root;
3149 	struct extent_buffer *leaf;
3150 	struct btrfs_extent_data_ref *ref;
3151 	struct btrfs_extent_inline_ref *iref;
3152 	struct btrfs_extent_item *ei;
3153 	struct btrfs_key key;
3154 	u32 item_size;
3155 	int type;
3156 	int ret;
3157 
3158 	key.objectid = bytenr;
3159 	key.offset = (u64)-1;
3160 	key.type = BTRFS_EXTENT_ITEM_KEY;
3161 
3162 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3163 	if (ret < 0)
3164 		goto out;
3165 	BUG_ON(ret == 0); /* Corruption */
3166 
3167 	ret = -ENOENT;
3168 	if (path->slots[0] == 0)
3169 		goto out;
3170 
3171 	path->slots[0]--;
3172 	leaf = path->nodes[0];
3173 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3174 
3175 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3176 		goto out;
3177 
3178 	ret = 1;
3179 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3180 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3181 
3182 	if (item_size != sizeof(*ei) +
3183 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3184 		goto out;
3185 
3186 	if (btrfs_extent_generation(leaf, ei) <=
3187 	    btrfs_root_last_snapshot(&root->root_item))
3188 		goto out;
3189 
3190 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3191 
3192 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3193 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
3194 		goto out;
3195 
3196 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3197 	if (btrfs_extent_refs(leaf, ei) !=
3198 	    btrfs_extent_data_ref_count(leaf, ref) ||
3199 	    btrfs_extent_data_ref_root(leaf, ref) !=
3200 	    root->root_key.objectid ||
3201 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3202 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
3203 		goto out;
3204 
3205 	ret = 0;
3206 out:
3207 	return ret;
3208 }
3209 
3210 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3211 			  u64 bytenr)
3212 {
3213 	struct btrfs_path *path;
3214 	int ret;
3215 
3216 	path = btrfs_alloc_path();
3217 	if (!path)
3218 		return -ENOMEM;
3219 
3220 	do {
3221 		ret = check_committed_ref(root, path, objectid,
3222 					  offset, bytenr);
3223 		if (ret && ret != -ENOENT)
3224 			goto out;
3225 
3226 		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3227 	} while (ret == -EAGAIN);
3228 
3229 out:
3230 	btrfs_free_path(path);
3231 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3232 		WARN_ON(ret > 0);
3233 	return ret;
3234 }
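
/*
 * Usage sketch, shaped after the nocow write checks (names are illustrative):
 * a return of 0 means no other reference to the extent could be found, while
 * anything greater than 0 means a cross reference (committed or delayed)
 * exists and the data must be COWed:
 *
 *	ret = btrfs_cross_ref_exist(root, ino, file_offset, bytenr);
 *	if (ret < 0)
 *		return ret;
 *	if (ret > 0)
 *		must_cow = true;
 */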
3235 
3236 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3237 			   struct btrfs_root *root,
3238 			   struct extent_buffer *buf,
3239 			   int full_backref, int inc)
3240 {
3241 	struct btrfs_fs_info *fs_info = root->fs_info;
3242 	u64 bytenr;
3243 	u64 num_bytes;
3244 	u64 parent;
3245 	u64 ref_root;
3246 	u32 nritems;
3247 	struct btrfs_key key;
3248 	struct btrfs_file_extent_item *fi;
3249 	int i;
3250 	int level;
3251 	int ret = 0;
3252 	int (*process_func)(struct btrfs_trans_handle *,
3253 			    struct btrfs_root *,
3254 			    u64, u64, u64, u64, u64, u64);
3255 
3256 
3257 	if (btrfs_is_testing(fs_info))
3258 		return 0;
3259 
3260 	ref_root = btrfs_header_owner(buf);
3261 	nritems = btrfs_header_nritems(buf);
3262 	level = btrfs_header_level(buf);
3263 
3264 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3265 		return 0;
3266 
3267 	if (inc)
3268 		process_func = btrfs_inc_extent_ref;
3269 	else
3270 		process_func = btrfs_free_extent;
3271 
3272 	if (full_backref)
3273 		parent = buf->start;
3274 	else
3275 		parent = 0;
3276 
3277 	for (i = 0; i < nritems; i++) {
3278 		if (level == 0) {
3279 			btrfs_item_key_to_cpu(buf, &key, i);
3280 			if (key.type != BTRFS_EXTENT_DATA_KEY)
3281 				continue;
3282 			fi = btrfs_item_ptr(buf, i,
3283 					    struct btrfs_file_extent_item);
3284 			if (btrfs_file_extent_type(buf, fi) ==
3285 			    BTRFS_FILE_EXTENT_INLINE)
3286 				continue;
3287 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3288 			if (bytenr == 0)
3289 				continue;
3290 
3291 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3292 			key.offset -= btrfs_file_extent_offset(buf, fi);
3293 			ret = process_func(trans, root, bytenr, num_bytes,
3294 					   parent, ref_root, key.objectid,
3295 					   key.offset);
3296 			if (ret)
3297 				goto fail;
3298 		} else {
3299 			bytenr = btrfs_node_blockptr(buf, i);
3300 			num_bytes = fs_info->nodesize;
3301 			ret = process_func(trans, root, bytenr, num_bytes,
3302 					   parent, ref_root, level - 1, 0);
3303 			if (ret)
3304 				goto fail;
3305 		}
3306 	}
3307 	return 0;
3308 fail:
3309 	return ret;
3310 }
3311 
3312 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3313 		  struct extent_buffer *buf, int full_backref)
3314 {
3315 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3316 }
3317 
3318 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3319 		  struct extent_buffer *buf, int full_backref)
3320 {
3321 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3322 }
3323 
3324 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3325 				 struct btrfs_fs_info *fs_info,
3326 				 struct btrfs_path *path,
3327 				 struct btrfs_block_group_cache *cache)
3328 {
3329 	int ret;
3330 	struct btrfs_root *extent_root = fs_info->extent_root;
3331 	unsigned long bi;
3332 	struct extent_buffer *leaf;
3333 
3334 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3335 	if (ret) {
3336 		if (ret > 0)
3337 			ret = -ENOENT;
3338 		goto fail;
3339 	}
3340 
3341 	leaf = path->nodes[0];
3342 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3343 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3344 	btrfs_mark_buffer_dirty(leaf);
3345 fail:
3346 	btrfs_release_path(path);
3347 	return ret;
3348 
3349 }
3350 
3351 static struct btrfs_block_group_cache *
3352 next_block_group(struct btrfs_fs_info *fs_info,
3353 		 struct btrfs_block_group_cache *cache)
3354 {
3355 	struct rb_node *node;
3356 
3357 	spin_lock(&fs_info->block_group_cache_lock);
3358 
3359 	/* If our block group was removed, we need a full search. */
3360 	if (RB_EMPTY_NODE(&cache->cache_node)) {
3361 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3362 
3363 		spin_unlock(&fs_info->block_group_cache_lock);
3364 		btrfs_put_block_group(cache);
3365 		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
		return cache;
3366 	}
3367 	node = rb_next(&cache->cache_node);
3368 	btrfs_put_block_group(cache);
3369 	if (node) {
3370 		cache = rb_entry(node, struct btrfs_block_group_cache,
3371 				 cache_node);
3372 		btrfs_get_block_group(cache);
3373 	} else
3374 		cache = NULL;
3375 	spin_unlock(&fs_info->block_group_cache_lock);
3376 	return cache;
3377 }
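
/*
 * Iteration sketch: callers walk the block groups like this, with the
 * reference handoff (put of the current group, get of the next) handled
 * inside next_block_group():
 *
 *	cache = btrfs_lookup_first_block_group(fs_info, 0);
 *	while (cache) {
 *		// ... use cache ...
 *		cache = next_block_group(fs_info, cache);
 *	}
 */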
3378 
3379 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3380 			    struct btrfs_trans_handle *trans,
3381 			    struct btrfs_path *path)
3382 {
3383 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3384 	struct btrfs_root *root = fs_info->tree_root;
3385 	struct inode *inode = NULL;
3386 	struct extent_changeset *data_reserved = NULL;
3387 	u64 alloc_hint = 0;
3388 	int dcs = BTRFS_DC_ERROR;
3389 	u64 num_pages = 0;
3390 	int retries = 0;
3391 	int ret = 0;
3392 
3393 	/*
3394 	 * If this block group is smaller than 100MiB, don't bother caching the
3395 	 * block group.
3396 	 */
3397 	if (block_group->key.offset < (100 * SZ_1M)) {
3398 		spin_lock(&block_group->lock);
3399 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3400 		spin_unlock(&block_group->lock);
3401 		return 0;
3402 	}
3403 
3404 	if (trans->aborted)
3405 		return 0;
3406 again:
3407 	inode = lookup_free_space_inode(fs_info, block_group, path);
3408 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3409 		ret = PTR_ERR(inode);
3410 		btrfs_release_path(path);
3411 		goto out;
3412 	}
3413 
3414 	if (IS_ERR(inode)) {
3415 		BUG_ON(retries);
3416 		retries++;
3417 
3418 		if (block_group->ro)
3419 			goto out_free;
3420 
3421 		ret = create_free_space_inode(fs_info, trans, block_group,
3422 					      path);
3423 		if (ret)
3424 			goto out_free;
3425 		goto again;
3426 	}
3427 
3428 	/*
3429 	 * We want to set the generation to 0, that way if anything goes wrong
3430 	 * from here on out we know not to trust this cache when we load up next
3431 	 * time.
3432 	 */
3433 	BTRFS_I(inode)->generation = 0;
3434 	ret = btrfs_update_inode(trans, root, inode);
3435 	if (ret) {
3436 		/*
3437 		 * So theoretically we could recover from this, simply set the
3438 		 * super cache generation to 0 so we know to invalidate the
3439 		 * cache, but then we'd have to keep track of the block groups
3440 		 * that fail this way so we know we _have_ to reset this cache
3441 		 * before the next commit or risk reading stale cache.  So to
3442 		 * limit our exposure to horrible edge cases lets just abort the
3443 		 * transaction, this only happens in really bad situations
3444 		 * anyway.
3445 		 */
3446 		btrfs_abort_transaction(trans, ret);
3447 		goto out_put;
3448 	}
3449 	WARN_ON(ret);
3450 
3451 	/* We've already setup this transaction, go ahead and exit */
3452 	if (block_group->cache_generation == trans->transid &&
3453 	    i_size_read(inode)) {
3454 		dcs = BTRFS_DC_SETUP;
3455 		goto out_put;
3456 	}
3457 
3458 	if (i_size_read(inode) > 0) {
3459 		ret = btrfs_check_trunc_cache_free_space(fs_info,
3460 					&fs_info->global_block_rsv);
3461 		if (ret)
3462 			goto out_put;
3463 
3464 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3465 		if (ret)
3466 			goto out_put;
3467 	}
3468 
3469 	spin_lock(&block_group->lock);
3470 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3471 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3472 		/*
3473 		 * don't bother trying to write stuff out _if_
3474 		 * a) we're not cached,
3475 		 * b) we're mounted with the nospace_cache option,
3476 		 * c) we're using the v2 space_cache (FREE_SPACE_TREE).
3477 		 */
3478 		dcs = BTRFS_DC_WRITTEN;
3479 		spin_unlock(&block_group->lock);
3480 		goto out_put;
3481 	}
3482 	spin_unlock(&block_group->lock);
3483 
3484 	/*
3485 	 * We hit an ENOSPC when setting up the cache in this transaction, just
3486 	 * skip doing the setup, we've already cleared the cache so we're safe.
3487 	 */
3488 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3489 		ret = -ENOSPC;
3490 		goto out_put;
3491 	}
3492 
3493 	/*
3494 	 * Try to preallocate enough space based on how big the block group is.
3495 	 * Keep in mind this has to include any pinned space which could end up
3496 	 * taking up quite a bit since it's not folded into the other space
3497 	 * cache.
3498 	 */
3499 	num_pages = div_u64(block_group->key.offset, SZ_256M);
3500 	if (!num_pages)
3501 		num_pages = 1;
3502 
3503 	num_pages *= 16;
3504 	num_pages *= PAGE_SIZE;
3505 
3506 	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3507 	if (ret)
3508 		goto out_put;
3509 
3510 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3511 					      num_pages, num_pages,
3512 					      &alloc_hint);
3513 	/*
3514 	 * Our cache requires contiguous chunks so that we don't modify a bunch
3515 	 * of metadata or split extents when writing the cache out, which means
3516 	 * we can enospc if we are heavily fragmented in addition to just normal
3517 	 * out of space conditions.  So if we hit this just skip setting up any
3518 	 * other block groups for this transaction, maybe we'll unpin enough
3519 	 * space the next time around.
3520 	 */
3521 	if (!ret)
3522 		dcs = BTRFS_DC_SETUP;
3523 	else if (ret == -ENOSPC)
3524 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3525 
3526 out_put:
3527 	iput(inode);
3528 out_free:
3529 	btrfs_release_path(path);
3530 out:
3531 	spin_lock(&block_group->lock);
3532 	if (!ret && dcs == BTRFS_DC_SETUP)
3533 		block_group->cache_generation = trans->transid;
3534 	block_group->disk_cache_state = dcs;
3535 	spin_unlock(&block_group->lock);
3536 
3537 	extent_changeset_free(data_reserved);
3538 	return ret;
3539 }
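
/*
 * Worked example for the preallocation sizing above, assuming 4KiB pages:
 * a 1GiB block group gives num_pages = 1GiB / 256MiB = 4, then 4 * 16 = 64
 * pages, i.e. 256KiB of space cache preallocated.  A block group between
 * 100MiB and 256MiB still gets the minimum of 16 pages (64KiB).
 */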
3540 
3541 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3542 			    struct btrfs_fs_info *fs_info)
3543 {
3544 	struct btrfs_block_group_cache *cache, *tmp;
3545 	struct btrfs_transaction *cur_trans = trans->transaction;
3546 	struct btrfs_path *path;
3547 
3548 	if (list_empty(&cur_trans->dirty_bgs) ||
3549 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
3550 		return 0;
3551 
3552 	path = btrfs_alloc_path();
3553 	if (!path)
3554 		return -ENOMEM;
3555 
3556 	/* Could add new block groups, use _safe just in case */
3557 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3558 				 dirty_list) {
3559 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3560 			cache_save_setup(cache, trans, path);
3561 	}
3562 
3563 	btrfs_free_path(path);
3564 	return 0;
3565 }
3566 
3567 /*
3568  * transaction commit does final block group cache writeback during a
3569  * critical section where nothing is allowed to change the FS.  This is
3570  * required in order for the cache to actually match the block group,
3571  * but can introduce a lot of latency into the commit.
3572  *
3573  * So, btrfs_start_dirty_block_groups is here to kick off block group
3574  * cache IO.  There's a chance we'll have to redo some of it if the
3575  * block group changes again during the commit, but it greatly reduces
3576  * the commit latency by getting rid of the easy block groups while
3577  * we're still allowing others to join the commit.
3578  */
3579 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3580 {
3581 	struct btrfs_fs_info *fs_info = trans->fs_info;
3582 	struct btrfs_block_group_cache *cache;
3583 	struct btrfs_transaction *cur_trans = trans->transaction;
3584 	int ret = 0;
3585 	int should_put;
3586 	struct btrfs_path *path = NULL;
3587 	LIST_HEAD(dirty);
3588 	struct list_head *io = &cur_trans->io_bgs;
3589 	int num_started = 0;
3590 	int loops = 0;
3591 
3592 	spin_lock(&cur_trans->dirty_bgs_lock);
3593 	if (list_empty(&cur_trans->dirty_bgs)) {
3594 		spin_unlock(&cur_trans->dirty_bgs_lock);
3595 		return 0;
3596 	}
3597 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3598 	spin_unlock(&cur_trans->dirty_bgs_lock);
3599 
3600 again:
3601 	/*
3602 	 * make sure all the block groups on our dirty list actually
3603 	 * exist
3604 	 */
3605 	btrfs_create_pending_block_groups(trans);
3606 
3607 	if (!path) {
3608 		path = btrfs_alloc_path();
3609 		if (!path)
3610 			return -ENOMEM;
3611 	}
3612 
3613 	/*
3614 	 * cache_write_mutex is here only to save us from balance or automatic
3615 	 * removal of empty block groups deleting this block group while we are
3616 	 * writing out the cache
3617 	 */
3618 	mutex_lock(&trans->transaction->cache_write_mutex);
3619 	while (!list_empty(&dirty)) {
3620 		bool drop_reserve = true;
3621 
3622 		cache = list_first_entry(&dirty,
3623 					 struct btrfs_block_group_cache,
3624 					 dirty_list);
3625 		/*
3626 		 * this can happen if something re-dirties a block
3627 		 * group that is already under IO.  Just wait for it to
3628 		 * finish and then do it all again
3629 		 */
3630 		if (!list_empty(&cache->io_list)) {
3631 			list_del_init(&cache->io_list);
3632 			btrfs_wait_cache_io(trans, cache, path);
3633 			btrfs_put_block_group(cache);
3634 		}
3635 
3637 		/*
3638 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3639 		 * if it should update the cache_state.  Don't delete it from
3640 		 * the dirty list until after we wait.
3641 		 *
3642 		 * Since we're not running in the commit critical section
3643 		 * we need the dirty_bgs_lock to protect from update_block_group
3644 		 */
3645 		spin_lock(&cur_trans->dirty_bgs_lock);
3646 		list_del_init(&cache->dirty_list);
3647 		spin_unlock(&cur_trans->dirty_bgs_lock);
3648 
3649 		should_put = 1;
3650 
3651 		cache_save_setup(cache, trans, path);
3652 
3653 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3654 			cache->io_ctl.inode = NULL;
3655 			ret = btrfs_write_out_cache(fs_info, trans,
3656 						    cache, path);
3657 			if (ret == 0 && cache->io_ctl.inode) {
3658 				num_started++;
3659 				should_put = 0;
3660 
3661 				/*
3662 				 * The cache_write_mutex is protecting the
3663 				 * io_list; also refer to the definition of
3664 				 * btrfs_transaction::io_bgs for more details.
3665 				 */
3666 				list_add_tail(&cache->io_list, io);
3667 			} else {
3668 				/*
3669 				 * if we failed to write the cache, the
3670 				 * generation will be bad and life goes on
3671 				 */
3672 				ret = 0;
3673 			}
3674 		}
3675 		if (!ret) {
3676 			ret = write_one_cache_group(trans, fs_info,
3677 						    path, cache);
3678 			/*
3679 			 * Our block group might still be attached to the list
3680 			 * of new block groups in the transaction handle of some
3681 			 * other task (struct btrfs_trans_handle->new_bgs). This
3682 			 * means its block group item isn't yet in the extent
3683 			 * tree. If this happens ignore the error, as we will
3684 			 * try again later in the critical section of the
3685 			 * transaction commit.
3686 			 */
3687 			if (ret == -ENOENT) {
3688 				ret = 0;
3689 				spin_lock(&cur_trans->dirty_bgs_lock);
3690 				if (list_empty(&cache->dirty_list)) {
3691 					list_add_tail(&cache->dirty_list,
3692 						      &cur_trans->dirty_bgs);
3693 					btrfs_get_block_group(cache);
3694 					drop_reserve = false;
3695 				}
3696 				spin_unlock(&cur_trans->dirty_bgs_lock);
3697 			} else if (ret) {
3698 				btrfs_abort_transaction(trans, ret);
3699 			}
3700 		}
3701 
3702 		/* if it's not on the io list, we need to put the block group */
3703 		if (should_put)
3704 			btrfs_put_block_group(cache);
3705 		if (drop_reserve)
3706 			btrfs_delayed_refs_rsv_release(fs_info, 1);
3707 
3708 		if (ret)
3709 			break;
3710 
3711 		/*
3712 		 * Avoid blocking other tasks for too long. It might even save
3713 		 * us from writing caches for block groups that are going to be
3714 		 * removed.
3715 		 */
3716 		mutex_unlock(&trans->transaction->cache_write_mutex);
3717 		mutex_lock(&trans->transaction->cache_write_mutex);
3718 	}
3719 	mutex_unlock(&trans->transaction->cache_write_mutex);
3720 
3721 	/*
3722 	 * go through delayed refs for all the stuff we've just kicked off
3723 	 * and then loop back (just once)
3724 	 */
3725 	ret = btrfs_run_delayed_refs(trans, 0);
3726 	if (!ret && loops == 0) {
3727 		loops++;
3728 		spin_lock(&cur_trans->dirty_bgs_lock);
3729 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3730 		/*
3731 		 * dirty_bgs_lock protects us from concurrent block group
3732 		 * deletes too (not just cache_write_mutex).
3733 		 */
3734 		if (!list_empty(&dirty)) {
3735 			spin_unlock(&cur_trans->dirty_bgs_lock);
3736 			goto again;
3737 		}
3738 		spin_unlock(&cur_trans->dirty_bgs_lock);
3739 	} else if (ret < 0) {
3740 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3741 	}
3742 
3743 	btrfs_free_path(path);
3744 	return ret;
3745 }
3746 
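/*
 * Write out all remaining dirty block groups during the critical section of
 * the transaction commit: update their block group items and, if the space
 * cache was set up, start and wait for the cache writeback.
 */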
3747 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3748 				   struct btrfs_fs_info *fs_info)
3749 {
3750 	struct btrfs_block_group_cache *cache;
3751 	struct btrfs_transaction *cur_trans = trans->transaction;
3752 	int ret = 0;
3753 	int should_put;
3754 	struct btrfs_path *path;
3755 	struct list_head *io = &cur_trans->io_bgs;
3756 	int num_started = 0;
3757 
3758 	path = btrfs_alloc_path();
3759 	if (!path)
3760 		return -ENOMEM;
3761 
3762 	/*
3763 	 * Even though we are in the critical section of the transaction commit,
3764 	 * we can still have concurrent tasks adding elements to this
3765 	 * transaction's list of dirty block groups. These tasks correspond to
3766 	 * endio free space workers started when writeback finishes for a
3767 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3768 	 * allocate new block groups as a result of COWing nodes of the root
3769 	 * tree when updating the free space inode. The writeback for the space
3770 	 * caches is triggered by an earlier call to
3771 	 * btrfs_start_dirty_block_groups() and iterations of the following
3772 	 * loop.
3773 	 * Also we want to do the cache_save_setup first and then run the
3774 	 * delayed refs to make sure we have the best chance at doing this all
3775 	 * in one shot.
3776 	 */
3777 	spin_lock(&cur_trans->dirty_bgs_lock);
3778 	while (!list_empty(&cur_trans->dirty_bgs)) {
3779 		cache = list_first_entry(&cur_trans->dirty_bgs,
3780 					 struct btrfs_block_group_cache,
3781 					 dirty_list);
3782 
3783 		/*
3784 		 * this can happen if cache_save_setup re-dirties a block
3785 		 * group that is already under IO.  Just wait for it to
3786 		 * finish and then do it all again
3787 		 */
3788 		if (!list_empty(&cache->io_list)) {
3789 			spin_unlock(&cur_trans->dirty_bgs_lock);
3790 			list_del_init(&cache->io_list);
3791 			btrfs_wait_cache_io(trans, cache, path);
3792 			btrfs_put_block_group(cache);
3793 			spin_lock(&cur_trans->dirty_bgs_lock);
3794 		}
3795 
3796 		/*
3797 		 * don't remove from the dirty list until after we've waited
3798 		 * on any pending IO
3799 		 */
3800 		list_del_init(&cache->dirty_list);
3801 		spin_unlock(&cur_trans->dirty_bgs_lock);
3802 		should_put = 1;
3803 
3804 		cache_save_setup(cache, trans, path);
3805 
3806 		if (!ret)
3807 			ret = btrfs_run_delayed_refs(trans,
3808 						     (unsigned long) -1);
3809 
3810 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3811 			cache->io_ctl.inode = NULL;
3812 			ret = btrfs_write_out_cache(fs_info, trans,
3813 						    cache, path);
3814 			if (ret == 0 && cache->io_ctl.inode) {
3815 				num_started++;
3816 				should_put = 0;
3817 				list_add_tail(&cache->io_list, io);
3818 			} else {
3819 				/*
3820 				 * if we failed to write the cache, the
3821 				 * generation will be bad and life goes on
3822 				 */
3823 				ret = 0;
3824 			}
3825 		}
3826 		if (!ret) {
3827 			ret = write_one_cache_group(trans, fs_info,
3828 						    path, cache);
3829 			/*
3830 			 * One of the free space endio workers might have
3831 			 * created a new block group while updating a free space
3832 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3833 			 * and hasn't released its transaction handle yet, in
3834 			 * which case the new block group is still attached to
3835 			 * its transaction handle and its creation has not
3836 			 * finished yet (no block group item in the extent tree
3837 			 * yet, etc). If this is the case, wait for all free
3838 			 * space endio workers to finish and retry. This is a
3839 			 * very rare case so no need for a more efficient and
3840 			 * complex approach.
3841 			 */
3842 			if (ret == -ENOENT) {
3843 				wait_event(cur_trans->writer_wait,
3844 				   atomic_read(&cur_trans->num_writers) == 1);
3845 				ret = write_one_cache_group(trans, fs_info,
3846 							    path, cache);
3847 			}
3848 			if (ret)
3849 				btrfs_abort_transaction(trans, ret);
3850 		}
3851 
3852 		/* if it's not on the io list, we need to put the block group */
3853 		if (should_put)
3854 			btrfs_put_block_group(cache);
3855 		btrfs_delayed_refs_rsv_release(fs_info, 1);
3856 		spin_lock(&cur_trans->dirty_bgs_lock);
3857 	}
3858 	spin_unlock(&cur_trans->dirty_bgs_lock);
3859 
3860 	/*
3861 	 * Refer to the definition of the io_bgs member for details on why it's
3862 	 * safe to use it without any locking.
3863 	 */
3864 	while (!list_empty(io)) {
3865 		cache = list_first_entry(io, struct btrfs_block_group_cache,
3866 					 io_list);
3867 		list_del_init(&cache->io_list);
3868 		btrfs_wait_cache_io(trans, cache, path);
3869 		btrfs_put_block_group(cache);
3870 	}
3871 
3872 	btrfs_free_path(path);
3873 	return ret;
3874 }
3875 
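/*
 * Return 1 if @bytenr falls in a read-only block group or in no block group
 * at all, 0 otherwise.
 */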
3876 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3877 {
3878 	struct btrfs_block_group_cache *block_group;
3879 	int readonly = 0;
3880 
3881 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
3882 	if (!block_group || block_group->ro)
3883 		readonly = 1;
3884 	if (block_group)
3885 		btrfs_put_block_group(block_group);
3886 	return readonly;
3887 }
3888 
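/*
 * Take a nocow writer reference on the block group containing @bytenr.
 * Returns false if the block group doesn't exist or is read-only.  On success
 * the block group reference taken by the lookup is kept and is dropped later
 * by btrfs_dec_nocow_writers().
 */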
3889 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3890 {
3891 	struct btrfs_block_group_cache *bg;
3892 	bool ret = true;
3893 
3894 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3895 	if (!bg)
3896 		return false;
3897 
3898 	spin_lock(&bg->lock);
3899 	if (bg->ro)
3900 		ret = false;
3901 	else
3902 		atomic_inc(&bg->nocow_writers);
3903 	spin_unlock(&bg->lock);
3904 
3905 	/* no put on block group, done by btrfs_dec_nocow_writers */
3906 	if (!ret)
3907 		btrfs_put_block_group(bg);
3908 
3909 	return ret;
3911 }
3912 
3913 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3914 {
3915 	struct btrfs_block_group_cache *bg;
3916 
3917 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3918 	ASSERT(bg);
3919 	if (atomic_dec_and_test(&bg->nocow_writers))
3920 		wake_up_var(&bg->nocow_writers);
3921 	/*
3922 	 * Once for our lookup and once for the lookup done by a previous call
3923 	 * to btrfs_inc_nocow_writers()
3924 	 */
3925 	btrfs_put_block_group(bg);
3926 	btrfs_put_block_group(bg);
3927 }
3928 
3929 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3930 {
3931 	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3932 }
3933 
3934 static const char *alloc_name(u64 flags)
3935 {
3936 	switch (flags) {
3937 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3938 		return "mixed";
3939 	case BTRFS_BLOCK_GROUP_METADATA:
3940 		return "metadata";
3941 	case BTRFS_BLOCK_GROUP_DATA:
3942 		return "data";
3943 	case BTRFS_BLOCK_GROUP_SYSTEM:
3944 		return "system";
3945 	default:
3946 		WARN_ON(1);
3947 		return "invalid-combination";
3948 	}
3949 }
3950 
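/*
 * Allocate and register a new space_info for the block group type given in
 * @flags, including its per-cpu total_bytes_pinned counter and sysfs kobject.
 */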
3951 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3952 {
3954 	struct btrfs_space_info *space_info;
3955 	int i;
3956 	int ret;
3957 
3958 	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3959 	if (!space_info)
3960 		return -ENOMEM;
3961 
3962 	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3963 				 GFP_KERNEL);
3964 	if (ret) {
3965 		kfree(space_info);
3966 		return ret;
3967 	}
3968 
3969 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3970 		INIT_LIST_HEAD(&space_info->block_groups[i]);
3971 	init_rwsem(&space_info->groups_sem);
3972 	spin_lock_init(&space_info->lock);
3973 	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3974 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3975 	init_waitqueue_head(&space_info->wait);
3976 	INIT_LIST_HEAD(&space_info->ro_bgs);
3977 	INIT_LIST_HEAD(&space_info->tickets);
3978 	INIT_LIST_HEAD(&space_info->priority_tickets);
3979 
3980 	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3981 				    info->space_info_kobj, "%s",
3982 				    alloc_name(space_info->flags));
3983 	if (ret) {
3984 		percpu_counter_destroy(&space_info->total_bytes_pinned);
3985 		kfree(space_info);
3986 		return ret;
3987 	}
3988 
3989 	list_add_rcu(&space_info->list, &info->space_info);
3990 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3991 		info->data_sinfo = space_info;
3992 
3993 	return ret;
3994 }
3995 
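/*
 * Account a block group's bytes in the existing space_info of its type.  The
 * disk_* totals are scaled by the profile factor (e.g. 2 for DUP/RAID1).
 */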
3996 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3997 			     u64 total_bytes, u64 bytes_used,
3998 			     u64 bytes_readonly,
3999 			     struct btrfs_space_info **space_info)
4000 {
4001 	struct btrfs_space_info *found;
4002 	int factor;
4003 
4004 	factor = btrfs_bg_type_to_factor(flags);
4005 
4006 	found = __find_space_info(info, flags);
4007 	ASSERT(found);
4008 	spin_lock(&found->lock);
4009 	found->total_bytes += total_bytes;
4010 	found->disk_total += total_bytes * factor;
4011 	found->bytes_used += bytes_used;
4012 	found->disk_used += bytes_used * factor;
4013 	found->bytes_readonly += bytes_readonly;
4014 	if (total_bytes > 0)
4015 		found->full = 0;
4016 	space_info_add_new_bytes(info, found, total_bytes -
4017 				 bytes_used - bytes_readonly);
4018 	spin_unlock(&found->lock);
4019 	*space_info = found;
4020 }
4021 
4022 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4023 {
4024 	u64 extra_flags = chunk_to_extended(flags) &
4025 				BTRFS_EXTENDED_PROFILE_MASK;
4026 
4027 	write_seqlock(&fs_info->profiles_lock);
4028 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4029 		fs_info->avail_data_alloc_bits |= extra_flags;
4030 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
4031 		fs_info->avail_metadata_alloc_bits |= extra_flags;
4032 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4033 		fs_info->avail_system_alloc_bits |= extra_flags;
4034 	write_sequnlock(&fs_info->profiles_lock);
4035 }
4036 
4037 /*
4038  * returns target flags in extended format or 0 if restripe for this
4039  * chunk_type is not in progress
4040  *
4041  * should be called with balance_lock held
4042  */
4043 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4044 {
4045 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4046 	u64 target = 0;
4047 
4048 	if (!bctl)
4049 		return 0;
4050 
4051 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
4052 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4053 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4054 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4055 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4056 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4057 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4058 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4059 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4060 	}
4061 
4062 	return target;
4063 }
4064 
4065 /*
4066  * @flags: available profiles in extended format (see ctree.h)
4067  *
4068  * Returns reduced profile in chunk format.  If profile changing is in
4069  * progress (either running or paused) picks the target profile (if it's
4070  * already available), otherwise falls back to plain reducing.
4071  */
4072 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4073 {
4074 	u64 num_devices = fs_info->fs_devices->rw_devices;
4075 	u64 target;
4076 	u64 raid_type;
4077 	u64 allowed = 0;
4078 
4079 	/*
4080 	 * See if restripe for this chunk_type is in progress; if so,
4081 	 * try to reduce to the target profile.
4082 	 */
4083 	spin_lock(&fs_info->balance_lock);
4084 	target = get_restripe_target(fs_info, flags);
4085 	if (target) {
4086 		/* pick target profile only if it's already available */
4087 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4088 			spin_unlock(&fs_info->balance_lock);
4089 			return extended_to_chunk(target);
4090 		}
4091 	}
4092 	spin_unlock(&fs_info->balance_lock);
4093 
4094 	/* First, mask out the RAID levels which aren't possible */
4095 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4096 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4097 			allowed |= btrfs_raid_array[raid_type].bg_flag;
4098 	}
4099 	allowed &= flags;
4100 
4101 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4102 		allowed = BTRFS_BLOCK_GROUP_RAID6;
4103 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4104 		allowed = BTRFS_BLOCK_GROUP_RAID5;
4105 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4106 		allowed = BTRFS_BLOCK_GROUP_RAID10;
4107 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4108 		allowed = BTRFS_BLOCK_GROUP_RAID1;
4109 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4110 		allowed = BTRFS_BLOCK_GROUP_RAID0;
4111 
4112 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4113 
4114 	return extended_to_chunk(flags | allowed);
4115 }
4116 
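/*
 * Resolve the allocation profile for @orig_flags: fold in the currently
 * available profile bits (sampled under the profiles seqlock) and reduce the
 * result to a single allocatable profile.
 */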
4117 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4118 {
4119 	unsigned seq;
4120 	u64 flags;
4121 
4122 	do {
4123 		flags = orig_flags;
4124 		seq = read_seqbegin(&fs_info->profiles_lock);
4125 
4126 		if (flags & BTRFS_BLOCK_GROUP_DATA)
4127 			flags |= fs_info->avail_data_alloc_bits;
4128 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4129 			flags |= fs_info->avail_system_alloc_bits;
4130 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4131 			flags |= fs_info->avail_metadata_alloc_bits;
4132 	} while (read_seqretry(&fs_info->profiles_lock, seq));
4133 
4134 	return btrfs_reduce_alloc_profile(fs_info, flags);
4135 }
4136 
4137 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4138 {
4139 	struct btrfs_fs_info *fs_info = root->fs_info;
4140 	u64 flags;
4141 	u64 ret;
4142 
4143 	if (data)
4144 		flags = BTRFS_BLOCK_GROUP_DATA;
4145 	else if (root == fs_info->chunk_root)
4146 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
4147 	else
4148 		flags = BTRFS_BLOCK_GROUP_METADATA;
4149 
4150 	ret = get_alloc_profile(fs_info, flags);
4151 	return ret;
4152 }
4153 
4154 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4155 {
4156 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4157 }
4158 
4159 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4160 {
4161 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4162 }
4163 
4164 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4165 {
4166 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4167 }
4168 
4169 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4170 				 bool may_use_included)
4171 {
4172 	ASSERT(s_info);
4173 	return s_info->bytes_used + s_info->bytes_reserved +
4174 		s_info->bytes_pinned + s_info->bytes_readonly +
4175 		(may_use_included ? s_info->bytes_may_use : 0);
4176 }
4177 
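/*
 * Reserve @bytes of data space for @inode, allocating a new data chunk or
 * committing the transaction to reclaim pinned space when the existing data
 * space_info can't satisfy the request.
 */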
4178 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4179 {
4180 	struct btrfs_root *root = inode->root;
4181 	struct btrfs_fs_info *fs_info = root->fs_info;
4182 	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4183 	u64 used;
4184 	int ret = 0;
4185 	int need_commit = 2;
4186 	int have_pinned_space;
4187 
4188 	/* make sure bytes are sectorsize aligned */
4189 	bytes = ALIGN(bytes, fs_info->sectorsize);
4190 
4191 	if (btrfs_is_free_space_inode(inode)) {
4192 		need_commit = 0;
4193 		ASSERT(current->journal_info);
4194 	}
4195 
4196 again:
4197 	/* make sure we have enough space to handle the data first */
4198 	spin_lock(&data_sinfo->lock);
4199 	used = btrfs_space_info_used(data_sinfo, true);
4200 
4201 	if (used + bytes > data_sinfo->total_bytes) {
4202 		struct btrfs_trans_handle *trans;
4203 
4204 		/*
4205 		 * if we don't have enough free bytes in this space then we need
4206 		 * to alloc a new chunk.
4207 		 */
4208 		if (!data_sinfo->full) {
4209 			u64 alloc_target;
4210 
4211 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4212 			spin_unlock(&data_sinfo->lock);
4213 
4214 			alloc_target = btrfs_data_alloc_profile(fs_info);
4215 			/*
4216 			 * It is ugly that we don't call nolock join
4217 			 * transaction for the free space inode case here.
4218 			 * But it is safe because we only do the data space
4219 			 * reservation for the free space cache in the
4220 			 * transaction context; the common join transaction
4221 			 * just increases the counter of the current
4222 			 * transaction and doesn't try to acquire the
4223 			 * trans_lock of the fs.
4224 			 */
4225 			trans = btrfs_join_transaction(root);
4226 			if (IS_ERR(trans))
4227 				return PTR_ERR(trans);
4228 
4229 			ret = do_chunk_alloc(trans, alloc_target,
4230 					     CHUNK_ALLOC_NO_FORCE);
4231 			btrfs_end_transaction(trans);
4232 			if (ret < 0) {
4233 				if (ret != -ENOSPC)
4234 					return ret;
4235 				else {
4236 					have_pinned_space = 1;
4237 					goto commit_trans;
4238 				}
4239 			}
4240 
4241 			goto again;
4242 		}
4243 
4244 		/*
4245 		 * If we don't have enough pinned space to deal with this
4246 		 * allocation, and no chunk was removed in the current transaction,
4247 		 * don't bother committing the transaction.
4248 		 */
4249 		have_pinned_space = __percpu_counter_compare(
4250 			&data_sinfo->total_bytes_pinned,
4251 			used + bytes - data_sinfo->total_bytes,
4252 			BTRFS_TOTAL_BYTES_PINNED_BATCH);
4253 		spin_unlock(&data_sinfo->lock);
4254 
4255 		/* commit the current transaction and try again */
4256 commit_trans:
4257 		if (need_commit) {
4258 			need_commit--;
4259 
4260 			if (need_commit > 0) {
4261 				btrfs_start_delalloc_roots(fs_info, -1);
4262 				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4263 							 (u64)-1);
4264 			}
4265 
4266 			trans = btrfs_join_transaction(root);
4267 			if (IS_ERR(trans))
4268 				return PTR_ERR(trans);
4269 			if (have_pinned_space >= 0 ||
4270 			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4271 				     &trans->transaction->flags) ||
4272 			    need_commit > 0) {
4273 				ret = btrfs_commit_transaction(trans);
4274 				if (ret)
4275 					return ret;
4276 				/*
4277 				 * The cleaner kthread might still be doing iput
4278 				 * operations. Wait for it to finish so that
4279 				 * more space is released.  We don't need to
4280 				 * explicitly run the delayed iputs here because
4281 				 * the commit_transaction would have woken up
4282 				 * the cleaner.
4283 				 */
4284 				ret = btrfs_wait_on_delayed_iputs(fs_info);
4285 				if (ret)
4286 					return ret;
4287 				goto again;
4288 			} else {
4289 				btrfs_end_transaction(trans);
4290 			}
4291 		}
4292 
4293 		trace_btrfs_space_reservation(fs_info,
4294 					      "space_info:enospc",
4295 					      data_sinfo->flags, bytes, 1);
4296 		return -ENOSPC;
4297 	}
4298 	update_bytes_may_use(data_sinfo, bytes);
4299 	trace_btrfs_space_reservation(fs_info, "space_info",
4300 				      data_sinfo->flags, bytes, 1);
4301 	spin_unlock(&data_sinfo->lock);
4302 
4303 	return 0;
4304 }
4305 
4306 int btrfs_check_data_free_space(struct inode *inode,
4307 			struct extent_changeset **reserved, u64 start, u64 len)
4308 {
4309 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4310 	int ret;
4311 
4312 	/* align the range */
4313 	len = round_up(start + len, fs_info->sectorsize) -
4314 	      round_down(start, fs_info->sectorsize);
4315 	start = round_down(start, fs_info->sectorsize);
4316 
4317 	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4318 	if (ret < 0)
4319 		return ret;
4320 
4321 	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4322 	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4323 	if (ret < 0)
4324 		btrfs_free_reserved_data_space_noquota(inode, start, len);
4325 	else
4326 		ret = 0;
4327 	return ret;
4328 }
4329 
4330 /*
4331  * Called if we need to clear a data reservation for this inode,
4332  * normally in an error case.
4333  *
4334  * This one will *NOT* use the accurate qgroup reserved space API, just for
4335  * the case in which we can't sleep and are sure it won't affect the qgroup
4336  * reserved space, like clear_bit_hook().
4337  */
4338 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4339 					    u64 len)
4340 {
4341 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4342 	struct btrfs_space_info *data_sinfo;
4343 
4344 	/* Make sure the range is aligned to sectorsize */
4345 	len = round_up(start + len, fs_info->sectorsize) -
4346 	      round_down(start, fs_info->sectorsize);
4347 	start = round_down(start, fs_info->sectorsize);
4348 
4349 	data_sinfo = fs_info->data_sinfo;
4350 	spin_lock(&data_sinfo->lock);
4351 	update_bytes_may_use(data_sinfo, -len);
4352 	trace_btrfs_space_reservation(fs_info, "space_info",
4353 				      data_sinfo->flags, len, 0);
4354 	spin_unlock(&data_sinfo->lock);
4355 }
4356 
4357 /*
4358  * Called if we need to clear a data reservation for this inode,
4359  * normally in an error case.
4360  *
4361  * This one will handle the per-inode data rsv map for accurate reserved
4362  * space framework.
4363  */
4364 void btrfs_free_reserved_data_space(struct inode *inode,
4365 			struct extent_changeset *reserved, u64 start, u64 len)
4366 {
4367 	struct btrfs_root *root = BTRFS_I(inode)->root;
4368 
4369 	/* Make sure the range is aligned to sectorsize */
4370 	len = round_up(start + len, root->fs_info->sectorsize) -
4371 	      round_down(start, root->fs_info->sectorsize);
4372 	start = round_down(start, root->fs_info->sectorsize);
4373 
4374 	btrfs_free_reserved_data_space_noquota(inode, start, len);
4375 	btrfs_qgroup_free_data(inode, reserved, start, len);
4376 }
4377 
4378 static void force_metadata_allocation(struct btrfs_fs_info *info)
4379 {
4380 	struct list_head *head = &info->space_info;
4381 	struct btrfs_space_info *found;
4382 
4383 	rcu_read_lock();
4384 	list_for_each_entry_rcu(found, head, list) {
4385 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4386 			found->force_alloc = CHUNK_ALLOC_FORCE;
4387 	}
4388 	rcu_read_unlock();
4389 }
4390 
4391 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4392 {
4393 	return (global->size << 1);
4394 }
4395 
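/*
 * Decide whether a new chunk should be allocated for @sinfo.  CHUNK_ALLOC_FORCE
 * always allocates, CHUNK_ALLOC_LIMITED allocates while free space is below
 * roughly 1% of the filesystem size (at least 64M), otherwise only allocate
 * once about 80% of the space is used.
 */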
4396 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4397 			      struct btrfs_space_info *sinfo, int force)
4398 {
4399 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
4400 	u64 thresh;
4401 
4402 	if (force == CHUNK_ALLOC_FORCE)
4403 		return 1;
4404 
4405 	/*
4406 	 * in limited mode, we want to have some free space up to
4407 	 * about 1% of the FS size.
4408 	 */
4409 	if (force == CHUNK_ALLOC_LIMITED) {
4410 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
4411 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4412 
4413 		if (sinfo->total_bytes - bytes_used < thresh)
4414 			return 1;
4415 	}
4416 
4417 	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4418 		return 0;
4419 	return 1;
4420 }
4421 
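/*
 * Number of devices whose device items may need updating when a chunk of the
 * given @type is allocated or removed.
 */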
4422 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4423 {
4424 	u64 num_dev;
4425 
4426 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4427 		    BTRFS_BLOCK_GROUP_RAID0 |
4428 		    BTRFS_BLOCK_GROUP_RAID5 |
4429 		    BTRFS_BLOCK_GROUP_RAID6))
4430 		num_dev = fs_info->fs_devices->rw_devices;
4431 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
4432 		num_dev = 2;
4433 	else
4434 		num_dev = 1;	/* DUP or single */
4435 
4436 	return num_dev;
4437 }
4438 
4439 /*
4440  * Reserve space in the system space info necessary for allocating or
4441  * removing a chunk: the device items that may need updating plus the chunk
4442  * item that is added or removed.
4443  */
4444 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4445 {
4446 	struct btrfs_fs_info *fs_info = trans->fs_info;
4447 	struct btrfs_space_info *info;
4448 	u64 left;
4449 	u64 thresh;
4450 	int ret = 0;
4451 	u64 num_devs;
4452 
4453 	/*
4454 	 * Needed because we can end up allocating a system chunk and need an
4455 	 * atomic and race-free space reservation in the chunk block reserve.
4456 	 */
4457 	lockdep_assert_held(&fs_info->chunk_mutex);
4458 
4459 	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4460 	spin_lock(&info->lock);
4461 	left = info->total_bytes - btrfs_space_info_used(info, true);
4462 	spin_unlock(&info->lock);
4463 
4464 	num_devs = get_profile_num_devs(fs_info, type);
4465 
4466 	/* num_devs device items to update and 1 chunk item to add or remove */
4467 	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4468 		btrfs_calc_trans_metadata_size(fs_info, 1);
4469 
4470 	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4471 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4472 			   left, thresh, type);
4473 		dump_space_info(fs_info, info, 0, 0);
4474 	}
4475 
4476 	if (left < thresh) {
4477 		u64 flags = btrfs_system_alloc_profile(fs_info);
4478 
4479 		/*
4480 		 * Ignore failure to create system chunk. We might end up not
4481 		 * needing it, as we might not need to COW all nodes/leafs from
4482 		 * the paths we visit in the chunk tree (they were already COWed
4483 		 * or created in the current transaction for example).
4484 		 */
4485 		ret = btrfs_alloc_chunk(trans, flags);
4486 	}
4487 
4488 	if (!ret) {
4489 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
4490 					  &fs_info->chunk_block_rsv,
4491 					  thresh, BTRFS_RESERVE_NO_FLUSH);
4492 		if (!ret)
4493 			trans->chunk_bytes_reserved += thresh;
4494 	}
4495 }
4496 
4497 /*
4498  * If force is CHUNK_ALLOC_FORCE:
4499  *    - return 1 if it successfully allocates a chunk,
4500  *    - return errors including -ENOSPC otherwise.
4501  * If force is NOT CHUNK_ALLOC_FORCE:
4502  *    - return 0 if it doesn't need to allocate a new chunk,
4503  *    - return 1 if it successfully allocates a chunk,
4504  *    - return errors including -ENOSPC otherwise.
4505  */
4506 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4507 			  int force)
4508 {
4509 	struct btrfs_fs_info *fs_info = trans->fs_info;
4510 	struct btrfs_space_info *space_info;
4511 	bool wait_for_alloc = false;
4512 	bool should_alloc = false;
4513 	int ret = 0;
4514 
4515 	/* Don't re-enter if we're already allocating a chunk */
4516 	if (trans->allocating_chunk)
4517 		return -ENOSPC;
4518 
4519 	space_info = __find_space_info(fs_info, flags);
4520 	ASSERT(space_info);
4521 
4522 	do {
4523 		spin_lock(&space_info->lock);
4524 		if (force < space_info->force_alloc)
4525 			force = space_info->force_alloc;
4526 		should_alloc = should_alloc_chunk(fs_info, space_info, force);
4527 		if (space_info->full) {
4528 			/* No more free physical space */
4529 			if (should_alloc)
4530 				ret = -ENOSPC;
4531 			else
4532 				ret = 0;
4533 			spin_unlock(&space_info->lock);
4534 			return ret;
4535 		} else if (!should_alloc) {
4536 			spin_unlock(&space_info->lock);
4537 			return 0;
4538 		} else if (space_info->chunk_alloc) {
4539 			/*
4540 			 * Someone is already allocating, so we need to block
4541 			 * until this someone is finished and then loop to
4542 			 * recheck if we should continue with our allocation
4543 			 * attempt.
4544 			 */
4545 			wait_for_alloc = true;
4546 			spin_unlock(&space_info->lock);
4547 			mutex_lock(&fs_info->chunk_mutex);
4548 			mutex_unlock(&fs_info->chunk_mutex);
4549 		} else {
4550 			/* Proceed with allocation */
4551 			space_info->chunk_alloc = 1;
4552 			wait_for_alloc = false;
4553 			spin_unlock(&space_info->lock);
4554 		}
4555 
4556 		cond_resched();
4557 	} while (wait_for_alloc);
4558 
4559 	mutex_lock(&fs_info->chunk_mutex);
4560 	trans->allocating_chunk = true;
4561 
4562 	/*
4563 	 * If we have mixed data/metadata chunks we want to make sure we keep
4564 	 * allocating mixed chunks instead of individual chunks.
4565 	 */
4566 	if (btrfs_mixed_space_info(space_info))
4567 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4568 
4569 	/*
4570 	 * if we're doing a data chunk, go ahead and make sure that
4571 	 * we keep a reasonable number of metadata chunks allocated in the
4572 	 * FS as well.
4573 	 */
4574 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4575 		fs_info->data_chunk_allocations++;
4576 		if (!(fs_info->data_chunk_allocations %
4577 		      fs_info->metadata_ratio))
4578 			force_metadata_allocation(fs_info);
4579 	}
4580 
4581 	/*
4582 	 * Check if we have enough space in SYSTEM chunk because we may need
4583 	 * to update devices.
4584 	 */
4585 	check_system_chunk(trans, flags);
4586 
4587 	ret = btrfs_alloc_chunk(trans, flags);
4588 	trans->allocating_chunk = false;
4589 
4590 	spin_lock(&space_info->lock);
4591 	if (ret < 0) {
4592 		if (ret == -ENOSPC)
4593 			space_info->full = 1;
4594 		else
4595 			goto out;
4596 	} else {
4597 		ret = 1;
4598 		space_info->max_extent_size = 0;
4599 	}
4600 
4601 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4602 out:
4603 	space_info->chunk_alloc = 0;
4604 	spin_unlock(&space_info->lock);
4605 	mutex_unlock(&fs_info->chunk_mutex);
4606 	/*
4607 	 * When we allocate a new chunk we reserve space in the chunk block
4608 	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4609 	 * add new nodes/leafs to it if we end up needing to do it when
4610 	 * inserting the chunk item and updating device items as part of the
4611 	 * second phase of chunk allocation, performed by
4612 	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4613 	 * large number of new block groups to create in our transaction
4614 	 * handle's new_bgs list to avoid exhausting the chunk block reserve
4615 	 * in extreme cases - like having a single transaction create many new
4616 	 * block groups when starting to write out the free space caches of all
4617 	 * the block groups that were made dirty during the lifetime of the
4618 	 * transaction.
4619 	 */
4620 	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4621 		btrfs_create_pending_block_groups(trans);
4622 
4623 	return ret;
4624 }
4625 
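/*
 * Decide whether a metadata or system reservation of @bytes may overcommit
 * @space_info.  Unallocated device space is scaled down by the profile factor
 * and only a fraction of it is counted: 1/8 when a full flush is allowed,
 * 1/2 otherwise.
 */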
4626 static int can_overcommit(struct btrfs_fs_info *fs_info,
4627 			  struct btrfs_space_info *space_info, u64 bytes,
4628 			  enum btrfs_reserve_flush_enum flush,
4629 			  bool system_chunk)
4630 {
4631 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4632 	u64 profile;
4633 	u64 space_size;
4634 	u64 avail;
4635 	u64 used;
4636 	int factor;
4637 
4638 	/* Don't overcommit when in mixed mode. */
4639 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4640 		return 0;
4641 
4642 	if (system_chunk)
4643 		profile = btrfs_system_alloc_profile(fs_info);
4644 	else
4645 		profile = btrfs_metadata_alloc_profile(fs_info);
4646 
4647 	used = btrfs_space_info_used(space_info, false);
4648 
4649 	/*
4650 	 * We only want to allow overcommitting if we have lots of actual space
4651 	 * free, but if we don't have enough space to handle the global reserve
4652 	 * space then we could end up having a real enospc problem when trying
4653 	 * to allocate a chunk or some other such important allocation.
4654 	 */
4655 	spin_lock(&global_rsv->lock);
4656 	space_size = calc_global_rsv_need_space(global_rsv);
4657 	spin_unlock(&global_rsv->lock);
4658 	if (used + space_size >= space_info->total_bytes)
4659 		return 0;
4660 
4661 	used += space_info->bytes_may_use;
4662 
4663 	avail = atomic64_read(&fs_info->free_chunk_space);
4664 
4665 	/*
4666 	 * If we have dup, raid1 or raid10 then only half of the free
4667 	 * space is actually usable.  For raid56, the space info used
4668 	 * doesn't include the parity drive, so we don't have to
4669 	 * change the math
4670 	 */
4671 	factor = btrfs_bg_type_to_factor(profile);
4672 	avail = div_u64(avail, factor);
4673 
4674 	/*
4675 	 * If we aren't flushing all things, let us overcommit up to
4676 	 * 1/2 of the space.  If we can flush, don't let us overcommit
4677 	 * too much; let it overcommit up to 1/8 of the space.
4678 	 */
4679 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
4680 		avail >>= 3;
4681 	else
4682 		avail >>= 1;
4683 
4684 	if (used + bytes < space_info->total_bytes + avail)
4685 		return 1;
4686 	return 0;
4687 }
4688 
4689 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4690 					 unsigned long nr_pages, int nr_items)
4691 {
4692 	struct super_block *sb = fs_info->sb;
4693 
4694 	if (down_read_trylock(&sb->s_umount)) {
4695 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4696 		up_read(&sb->s_umount);
4697 	} else {
4698 		/*
4699 		 * We needn't worry about the filesystem going from r/w to r/o
4700 		 * even though we don't acquire the ->s_umount mutex, because
4701 		 * the filesystem should guarantee that the delalloc inode list
4702 		 * is empty after the filesystem becomes read-only (all dirty
4703 		 * pages are written to disk).
4704 		 */
4705 		btrfs_start_delalloc_roots(fs_info, nr_items);
4706 		if (!current->journal_info)
4707 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4708 	}
4709 }
4710 
4711 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4712 					u64 to_reclaim)
4713 {
4714 	u64 bytes;
4715 	u64 nr;
4716 
4717 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4718 	nr = div64_u64(to_reclaim, bytes);
4719 	if (!nr)
4720 		nr = 1;
4721 	return nr;
4722 }
4723 
4724 #define EXTENT_SIZE_PER_ITEM	SZ_256K
4725 
4726 /*
4727  * shrink metadata reservation for delalloc
4728  */
4729 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4730 			    u64 orig, bool wait_ordered)
4731 {
4732 	struct btrfs_space_info *space_info;
4733 	struct btrfs_trans_handle *trans;
4734 	u64 delalloc_bytes;
4735 	u64 async_pages;
4736 	u64 items;
4737 	long time_left;
4738 	unsigned long nr_pages;
4739 	int loops;
4740 
4741 	/* Calc the number of pages we need to flush for the space reservation */
4742 	items = calc_reclaim_items_nr(fs_info, to_reclaim);
4743 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4744 
4745 	trans = (struct btrfs_trans_handle *)current->journal_info;
4746 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4747 
4748 	delalloc_bytes = percpu_counter_sum_positive(
4749 						&fs_info->delalloc_bytes);
4750 	if (delalloc_bytes == 0) {
4751 		if (trans)
4752 			return;
4753 		if (wait_ordered)
4754 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4755 		return;
4756 	}
4757 
4758 	loops = 0;
4759 	while (delalloc_bytes && loops < 3) {
4760 		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4761 
4762 		/*
4763 		 * Triggers inode writeback for up to nr_pages. This will invoke
4764 		 * the ->writepages callback and trigger delalloc filling
4765 		 * (btrfs_run_delalloc_range()).
4766 		 */
4767 		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4768 
4769 		/*
4770 		 * We need to wait for the compressed pages to start before
4771 		 * we continue.
4772 		 */
4773 		async_pages = atomic_read(&fs_info->async_delalloc_pages);
4774 		if (!async_pages)
4775 			goto skip_async;
4776 
4777 		/*
4778 		 * Calculate how many compressed pages we want to be written
4779 		 * before we continue.  I.e. if there are more async pages than
4780 		 * we require, wait_event will wait until nr_pages are written.
4781 		 */
4782 		if (async_pages <= nr_pages)
4783 			async_pages = 0;
4784 		else
4785 			async_pages -= nr_pages;
4786 
4787 		wait_event(fs_info->async_submit_wait,
4788 			   atomic_read(&fs_info->async_delalloc_pages) <=
4789 			   (int)async_pages);
4790 skip_async:
4791 		spin_lock(&space_info->lock);
4792 		if (list_empty(&space_info->tickets) &&
4793 		    list_empty(&space_info->priority_tickets)) {
4794 			spin_unlock(&space_info->lock);
4795 			break;
4796 		}
4797 		spin_unlock(&space_info->lock);
4798 
4799 		loops++;
4800 		if (wait_ordered && !trans) {
4801 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4802 		} else {
4803 			time_left = schedule_timeout_killable(1);
4804 			if (time_left)
4805 				break;
4806 		}
4807 		delalloc_bytes = percpu_counter_sum_positive(
4808 						&fs_info->delalloc_bytes);
4809 	}
4810 }
4811 
4812 struct reserve_ticket {
4813 	u64 orig_bytes;
4814 	u64 bytes;
4815 	int error;
4816 	struct list_head list;
4817 	wait_queue_head_t wait;
4818 };
4819 
4820 /**
4821  * may_commit_transaction - possibly commit the transaction if it's ok to
4822  * @fs_info - the fs we are working on
4823  * @space_info - the space_info we want to reserve space for
4825  *
4826  * This will check to make sure that committing the transaction will actually
4827  * get us somewhere and then commit the transaction if it does.  Otherwise it
4828  * will return -ENOSPC.
4829  */
4830 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4831 				  struct btrfs_space_info *space_info)
4832 {
4833 	struct reserve_ticket *ticket = NULL;
4834 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4835 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4836 	struct btrfs_trans_handle *trans;
4837 	u64 bytes_needed;
4838 	u64 reclaim_bytes = 0;
4839 
4840 	trans = (struct btrfs_trans_handle *)current->journal_info;
4841 	if (trans)
4842 		return -EAGAIN;
4843 
4844 	spin_lock(&space_info->lock);
4845 	if (!list_empty(&space_info->priority_tickets))
4846 		ticket = list_first_entry(&space_info->priority_tickets,
4847 					  struct reserve_ticket, list);
4848 	else if (!list_empty(&space_info->tickets))
4849 		ticket = list_first_entry(&space_info->tickets,
4850 					  struct reserve_ticket, list);
4851 	bytes_needed = (ticket) ? ticket->bytes : 0;
4852 	spin_unlock(&space_info->lock);
4853 
4854 	if (!bytes_needed)
4855 		return 0;
4856 
4857 	trans = btrfs_join_transaction(fs_info->extent_root);
4858 	if (IS_ERR(trans))
4859 		return PTR_ERR(trans);
4860 
4861 	/*
4862 	 * See if there is enough pinned space to make this reservation, or if
4863 	 * we have block groups that are going to be freed, allowing us to
4864 	 * possibly do a chunk allocation the next loop through.
4865 	 */
4866 	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4867 	    __percpu_counter_compare(&space_info->total_bytes_pinned,
4868 				     bytes_needed,
4869 				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4870 		goto commit;
4871 
4872 	/*
4873 	 * See if there is some space in the delayed insertion reservation for
4874 	 * this reservation.
4875 	 */
4876 	if (space_info != delayed_rsv->space_info)
4877 		goto enospc;
4878 
4879 	spin_lock(&delayed_rsv->lock);
4880 	reclaim_bytes += delayed_rsv->reserved;
4881 	spin_unlock(&delayed_rsv->lock);
4882 
4883 	spin_lock(&delayed_refs_rsv->lock);
4884 	reclaim_bytes += delayed_refs_rsv->reserved;
4885 	spin_unlock(&delayed_refs_rsv->lock);
4886 	if (reclaim_bytes >= bytes_needed)
4887 		goto commit;
4888 	bytes_needed -= reclaim_bytes;
4889 
4890 	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4891 				   bytes_needed,
4892 				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4893 		goto enospc;
4894 
4895 commit:
4896 	return btrfs_commit_transaction(trans);
4897 enospc:
4898 	btrfs_end_transaction(trans);
4899 	return -ENOSPC;
4900 }
4901 
4902 /*
4903  * Try to flush some data based on policy set by @state. This is only advisory
4904  * and may fail for various reasons. The caller is supposed to examine the
4905  * state of @space_info to detect the outcome.
4906  */
4907 static void flush_space(struct btrfs_fs_info *fs_info,
4908 		       struct btrfs_space_info *space_info, u64 num_bytes,
4909 		       int state)
4910 {
4911 	struct btrfs_root *root = fs_info->extent_root;
4912 	struct btrfs_trans_handle *trans;
4913 	int nr;
4914 	int ret = 0;
4915 
4916 	switch (state) {
4917 	case FLUSH_DELAYED_ITEMS_NR:
4918 	case FLUSH_DELAYED_ITEMS:
4919 		if (state == FLUSH_DELAYED_ITEMS_NR)
4920 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4921 		else
4922 			nr = -1;
4923 
4924 		trans = btrfs_join_transaction(root);
4925 		if (IS_ERR(trans)) {
4926 			ret = PTR_ERR(trans);
4927 			break;
4928 		}
4929 		ret = btrfs_run_delayed_items_nr(trans, nr);
4930 		btrfs_end_transaction(trans);
4931 		break;
4932 	case FLUSH_DELALLOC:
4933 	case FLUSH_DELALLOC_WAIT:
4934 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4935 				state == FLUSH_DELALLOC_WAIT);
4936 		break;
4937 	case FLUSH_DELAYED_REFS_NR:
4938 	case FLUSH_DELAYED_REFS:
4939 		trans = btrfs_join_transaction(root);
4940 		if (IS_ERR(trans)) {
4941 			ret = PTR_ERR(trans);
4942 			break;
4943 		}
4944 		if (state == FLUSH_DELAYED_REFS_NR)
4945 			nr = calc_reclaim_items_nr(fs_info, num_bytes);
4946 		else
4947 			nr = 0;
4948 		btrfs_run_delayed_refs(trans, nr);
4949 		btrfs_end_transaction(trans);
4950 		break;
4951 	case ALLOC_CHUNK:
4952 	case ALLOC_CHUNK_FORCE:
4953 		trans = btrfs_join_transaction(root);
4954 		if (IS_ERR(trans)) {
4955 			ret = PTR_ERR(trans);
4956 			break;
4957 		}
4958 		ret = do_chunk_alloc(trans,
4959 				     btrfs_metadata_alloc_profile(fs_info),
4960 				     (state == ALLOC_CHUNK) ?
4961 				      CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
4962 		btrfs_end_transaction(trans);
4963 		if (ret > 0 || ret == -ENOSPC)
4964 			ret = 0;
4965 		break;
4966 	case COMMIT_TRANS:
4967 		/*
4968 		 * If we have pending delayed iputs then we could free up a
4969 		 * bunch of pinned space, so make sure we run the iputs before
4970 		 * we do our pinned bytes check below.
4971 		 */
4972 		btrfs_run_delayed_iputs(fs_info);
4973 		btrfs_wait_on_delayed_iputs(fs_info);
4974 
4975 		ret = may_commit_transaction(fs_info, space_info);
4976 		break;
4977 	default:
4978 		ret = -ENOSPC;
4979 		break;
4980 	}
4981 
4982 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4983 				ret);
4984 	return;
4985 }
4986 
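/*
 * Estimate how many bytes need to be reclaimed: the sum of all pending ticket
 * bytes or, if there are no tickets, enough to bring usage back under roughly
 * 90-95% of the space_info (capped by what is actually reclaimable).
 */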
4987 static inline u64
4988 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4989 				 struct btrfs_space_info *space_info,
4990 				 bool system_chunk)
4991 {
4992 	struct reserve_ticket *ticket;
4993 	u64 used;
4994 	u64 expected;
4995 	u64 to_reclaim = 0;
4996 
4997 	list_for_each_entry(ticket, &space_info->tickets, list)
4998 		to_reclaim += ticket->bytes;
4999 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
5000 		to_reclaim += ticket->bytes;
5001 	if (to_reclaim)
5002 		return to_reclaim;
5003 
5004 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
5005 	if (can_overcommit(fs_info, space_info, to_reclaim,
5006 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5007 		return 0;
5008 
5009 	used = btrfs_space_info_used(space_info, true);
5010 
5011 	if (can_overcommit(fs_info, space_info, SZ_1M,
5012 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5013 		expected = div_factor_fine(space_info->total_bytes, 95);
5014 	else
5015 		expected = div_factor_fine(space_info->total_bytes, 90);
5016 
5017 	if (used > expected)
5018 		to_reclaim = used - expected;
5019 	else
5020 		to_reclaim = 0;
5021 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5022 				     space_info->bytes_reserved);
5023 	return to_reclaim;
5024 }
5025 
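/*
 * Kick async reclaim only when usage crosses 98% of the space_info, there is
 * something to reclaim, and the filesystem isn't being closed or remounted.
 * If used + reserved already exceeds that threshold we are plain full and
 * async reclaim would only slow us down.
 */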
5026 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5027 					struct btrfs_space_info *space_info,
5028 					u64 used, bool system_chunk)
5029 {
5030 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5031 
5032 	/* If we're just plain full then async reclaim just slows us down. */
5033 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5034 		return 0;
5035 
5036 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5037 					      system_chunk))
5038 		return 0;
5039 
5040 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5041 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5042 }
5043 
5044 static bool wake_all_tickets(struct list_head *head)
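/*
 * Fail every queued reservation ticket with -ENOSPC.  Returns true if any
 * ticket had already been partially satisfied, i.e. flushing was still making
 * some progress.
 */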
5045 {
5046 	struct reserve_ticket *ticket;
5047 
5048 	while (!list_empty(head)) {
5049 		ticket = list_first_entry(head, struct reserve_ticket, list);
5050 		list_del_init(&ticket->list);
5051 		ticket->error = -ENOSPC;
5052 		wake_up(&ticket->wait);
5053 		if (ticket->bytes != ticket->orig_bytes)
5054 			return true;
5055 	}
5056 	return false;
5057 }
5058 
5059 /*
5060  * This is for normal flushers, we can wait all goddamned day if we want to.  We
5061  * This is for normal flushers; we can wait all goddamned day if we want to.  We
5062  * We count progress as clearing off tickets each time we have to loop.
5063  */
5064 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5065 {
5066 	struct btrfs_fs_info *fs_info;
5067 	struct btrfs_space_info *space_info;
5068 	u64 to_reclaim;
5069 	int flush_state;
5070 	int commit_cycles = 0;
5071 	u64 last_tickets_id;
5072 
5073 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5074 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5075 
5076 	spin_lock(&space_info->lock);
5077 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5078 						      false);
5079 	if (!to_reclaim) {
5080 		space_info->flush = 0;
5081 		spin_unlock(&space_info->lock);
5082 		return;
5083 	}
5084 	last_tickets_id = space_info->tickets_id;
5085 	spin_unlock(&space_info->lock);
5086 
5087 	flush_state = FLUSH_DELAYED_ITEMS_NR;
5088 	do {
5089 		flush_space(fs_info, space_info, to_reclaim, flush_state);
5090 		spin_lock(&space_info->lock);
5091 		if (list_empty(&space_info->tickets)) {
5092 			space_info->flush = 0;
5093 			spin_unlock(&space_info->lock);
5094 			return;
5095 		}
5096 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5097 							      space_info,
5098 							      false);
5099 		if (last_tickets_id == space_info->tickets_id) {
5100 			flush_state++;
5101 		} else {
5102 			last_tickets_id = space_info->tickets_id;
5103 			flush_state = FLUSH_DELAYED_ITEMS_NR;
5104 			if (commit_cycles)
5105 				commit_cycles--;
5106 		}
5107 
5108 		/*
5109 		 * We don't want to force a chunk allocation until we've tried
5110 		 * pretty hard to reclaim space.  Think of the case where we
5111 		 * freed up a bunch of space and so have a lot of pinned space
5112 		 * to reclaim.  We would rather use that than possibly create a
5113 		 * to reclaim.  We would rather use that than possibly create an
5114 		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
5115 		 * commit the transaction.  If nothing has changed the next go
5116 		 * around then we can force a chunk allocation.
5117 		 */
5118 		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
5119 			flush_state++;
5120 
5121 		if (flush_state > COMMIT_TRANS) {
5122 			commit_cycles++;
5123 			if (commit_cycles > 2) {
5124 				if (wake_all_tickets(&space_info->tickets)) {
5125 					flush_state = FLUSH_DELAYED_ITEMS_NR;
5126 					commit_cycles--;
5127 				} else {
5128 					space_info->flush = 0;
5129 				}
5130 			} else {
5131 				flush_state = FLUSH_DELAYED_ITEMS_NR;
5132 			}
5133 		}
5134 		spin_unlock(&space_info->lock);
5135 	} while (flush_state <= COMMIT_TRANS);
5136 }
5137 
5138 void btrfs_init_async_reclaim_work(struct work_struct *work)
5139 {
5140 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5141 }
5142 
5143 static const enum btrfs_flush_state priority_flush_states[] = {
5144 	FLUSH_DELAYED_ITEMS_NR,
5145 	FLUSH_DELAYED_ITEMS,
5146 	ALLOC_CHUNK,
5147 };
5148 
5149 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
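/*
 * Priority flushers don't wait on the async reclaim worker; they run a reduced
 * set of flush states synchronously until their ticket is satisfied or the
 * states are exhausted.
 */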
5150 					    struct btrfs_space_info *space_info,
5151 					    struct reserve_ticket *ticket)
5152 {
5153 	u64 to_reclaim;
5154 	int flush_state;
5155 
5156 	spin_lock(&space_info->lock);
5157 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5158 						      false);
5159 	if (!to_reclaim) {
5160 		spin_unlock(&space_info->lock);
5161 		return;
5162 	}
5163 	spin_unlock(&space_info->lock);
5164 
5165 	flush_state = 0;
5166 	do {
5167 		flush_space(fs_info, space_info, to_reclaim,
5168 			    priority_flush_states[flush_state]);
5169 		flush_state++;
5170 		spin_lock(&space_info->lock);
5171 		if (ticket->bytes == 0) {
5172 			spin_unlock(&space_info->lock);
5173 			return;
5174 		}
5175 		spin_unlock(&space_info->lock);
5176 	} while (flush_state < ARRAY_SIZE(priority_flush_states));
5177 }
5178 
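/*
 * Sleep (killable) until the ticket is satisfied or fails.  Any bytes that
 * were granted to a ticket that ultimately failed are returned to the
 * space_info via space_info_add_old_bytes().
 */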
5179 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5180 			       struct btrfs_space_info *space_info,
5181 			       struct reserve_ticket *ticket)
5183 {
5184 	DEFINE_WAIT(wait);
5185 	u64 reclaim_bytes = 0;
5186 	int ret = 0;
5187 
5188 	spin_lock(&space_info->lock);
5189 	while (ticket->bytes > 0 && ticket->error == 0) {
5190 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5191 		if (ret) {
5192 			ret = -EINTR;
5193 			break;
5194 		}
5195 		spin_unlock(&space_info->lock);
5196 
5197 		schedule();
5198 
5199 		finish_wait(&ticket->wait, &wait);
5200 		spin_lock(&space_info->lock);
5201 	}
5202 	if (!ret)
5203 		ret = ticket->error;
5204 	if (!list_empty(&ticket->list))
5205 		list_del_init(&ticket->list);
5206 	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
5207 		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
5208 	spin_unlock(&space_info->lock);
5209 
5210 	if (reclaim_bytes)
5211 		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5212 	return ret;
5213 }
5214 
5215 /**
5216  * __reserve_metadata_bytes - try to reserve bytes from a space_info
5217  * @fs_info - the fs we are working on
5218  * @space_info - the space info we want to allocate from
5219  * @orig_bytes - the number of bytes we want
5220  * @flush - whether or not we can flush to make our reservation
5221  * @system_chunk - whether this is a system chunk reservation
5222  *
5223  * This will reserve orig_bytes number of bytes from the given space info.  If
5224  * there is not enough space it will make an attempt to flush out space to
5225  * make room, by flushing delalloc if possible or committing the transaction.
5226  * If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations
5227  * will be made and this will fail if there is not enough space already.
5228  */
5229 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5230 				    struct btrfs_space_info *space_info,
5231 				    u64 orig_bytes,
5232 				    enum btrfs_reserve_flush_enum flush,
5233 				    bool system_chunk)
5234 {
5235 	struct reserve_ticket ticket;
5236 	u64 used;
5237 	u64 reclaim_bytes = 0;
5238 	int ret = 0;
5239 
5240 	ASSERT(orig_bytes);
5241 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5242 
5243 	spin_lock(&space_info->lock);
5244 	ret = -ENOSPC;
5245 	used = btrfs_space_info_used(space_info, true);
5246 
5247 	/*
5248 	 * If we have enough space then hooray, make our reservation and carry
5249 	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
5250 	 * If not things get more complicated.
5251 	 */
5252 	if (used + orig_bytes <= space_info->total_bytes) {
5253 		update_bytes_may_use(space_info, orig_bytes);
5254 		trace_btrfs_space_reservation(fs_info, "space_info",
5255 					      space_info->flags, orig_bytes, 1);
5256 		ret = 0;
5257 	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5258 				  system_chunk)) {
5259 		update_bytes_may_use(space_info, orig_bytes);
5260 		trace_btrfs_space_reservation(fs_info, "space_info",
5261 					      space_info->flags, orig_bytes, 1);
5262 		ret = 0;
5263 	}
5264 
5265 	/*
5266 	 * If we couldn't make a reservation then setup our reservation ticket
5267 	 * and kick the async worker if it's not already running.
5268 	 *
5269 	 * If we are a priority flusher then we just need to add our ticket to
5270 	 * the list and we will do our own flushing further down.
5271 	 */
5272 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5273 		ticket.orig_bytes = orig_bytes;
5274 		ticket.bytes = orig_bytes;
5275 		ticket.error = 0;
5276 		init_waitqueue_head(&ticket.wait);
5277 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5278 			list_add_tail(&ticket.list, &space_info->tickets);
5279 			if (!space_info->flush) {
5280 				space_info->flush = 1;
5281 				trace_btrfs_trigger_flush(fs_info,
5282 							  space_info->flags,
5283 							  orig_bytes, flush,
5284 							  "enospc");
5285 				queue_work(system_unbound_wq,
5286 					   &fs_info->async_reclaim_work);
5287 			}
5288 		} else {
5289 			list_add_tail(&ticket.list,
5290 				      &space_info->priority_tickets);
5291 		}
5292 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5293 		used += orig_bytes;
5294 		/*
5295 		 * We will do the space reservation dance during log replay,
5296 		 * which means we won't have fs_info->fs_root set, so don't do
5297 		 * the async reclaim as we will panic.
5298 		 */
5299 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5300 		    need_do_async_reclaim(fs_info, space_info,
5301 					  used, system_chunk) &&
5302 		    !work_busy(&fs_info->async_reclaim_work)) {
5303 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
5304 						  orig_bytes, flush, "preempt");
5305 			queue_work(system_unbound_wq,
5306 				   &fs_info->async_reclaim_work);
5307 		}
5308 	}
5309 	spin_unlock(&space_info->lock);
5310 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5311 		return ret;
5312 
5313 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
5314 		return wait_reserve_ticket(fs_info, space_info, &ticket);
5315 
5316 	ret = 0;
5317 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5318 	spin_lock(&space_info->lock);
5319 	if (ticket.bytes) {
5320 		if (ticket.bytes < orig_bytes)
5321 			reclaim_bytes = orig_bytes - ticket.bytes;
5322 		list_del_init(&ticket.list);
5323 		ret = -ENOSPC;
5324 	}
5325 	spin_unlock(&space_info->lock);
5326 
5327 	if (reclaim_bytes)
5328 		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5329 	ASSERT(list_empty(&ticket.list));
5330 	return ret;
5331 }
5332 
5333 /**
5334  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5335  * @root - the root we're allocating for
5336  * @block_rsv - the block_rsv we're allocating for
5337  * @orig_bytes - the number of bytes we want
5338  * @flush - whether or not we can flush to make our reservation
5339  *
5340  * This will reserve orig_bytes number of bytes from the space info associated
5341  * with the block_rsv.  If there is not enough space it will make an attempt to
5342  * flush out space to make room.  It will do this by flushing delalloc if
5343  * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
5344  * then no attempts to regain reservations will be made and this will fail if
5345  * there is not enough space already.
5346  */
5347 static int reserve_metadata_bytes(struct btrfs_root *root,
5348 				  struct btrfs_block_rsv *block_rsv,
5349 				  u64 orig_bytes,
5350 				  enum btrfs_reserve_flush_enum flush)
5351 {
5352 	struct btrfs_fs_info *fs_info = root->fs_info;
5353 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5354 	int ret;
5355 	bool system_chunk = (root == fs_info->chunk_root);
5356 
5357 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5358 				       orig_bytes, flush, system_chunk);
5359 	if (ret == -ENOSPC &&
5360 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5361 		if (block_rsv != global_rsv &&
5362 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
5363 			ret = 0;
5364 	}
5365 	if (ret == -ENOSPC) {
5366 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5367 					      block_rsv->space_info->flags,
5368 					      orig_bytes, 1);
5369 
5370 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5371 			dump_space_info(fs_info, block_rsv->space_info,
5372 					orig_bytes, 0);
5373 	}
5374 	return ret;
5375 }
5376 
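/*
 * reserve_metadata_bytes() only updates the space_info accounting; the caller
 * is still responsible for crediting the bytes to its block_rsv.  A minimal
 * sketch of that pairing (this mirrors what btrfs_block_rsv_add() further
 * down does):
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes,
 *				     BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret)
 *		block_rsv_add_bytes(block_rsv, num_bytes, true);
 */
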
5377 static struct btrfs_block_rsv *get_block_rsv(
5378 					const struct btrfs_trans_handle *trans,
5379 					const struct btrfs_root *root)
5380 {
5381 	struct btrfs_fs_info *fs_info = root->fs_info;
5382 	struct btrfs_block_rsv *block_rsv = NULL;
5383 
5384 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5385 	    (root == fs_info->csum_root && trans->adding_csums) ||
5386 	    (root == fs_info->uuid_root))
5387 		block_rsv = trans->block_rsv;
5388 
5389 	if (!block_rsv)
5390 		block_rsv = root->block_rsv;
5391 
5392 	if (!block_rsv)
5393 		block_rsv = &fs_info->empty_block_rsv;
5394 
5395 	return block_rsv;
5396 }
5397 
5398 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5399 			       u64 num_bytes)
5400 {
5401 	int ret = -ENOSPC;
5402 	spin_lock(&block_rsv->lock);
5403 	if (block_rsv->reserved >= num_bytes) {
5404 		block_rsv->reserved -= num_bytes;
5405 		if (block_rsv->reserved < block_rsv->size)
5406 			block_rsv->full = 0;
5407 		ret = 0;
5408 	}
5409 	spin_unlock(&block_rsv->lock);
5410 	return ret;
5411 }
5412 
5413 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5414 				u64 num_bytes, bool update_size)
5415 {
5416 	spin_lock(&block_rsv->lock);
5417 	block_rsv->reserved += num_bytes;
5418 	if (update_size)
5419 		block_rsv->size += num_bytes;
5420 	else if (block_rsv->reserved >= block_rsv->size)
5421 		block_rsv->full = 1;
5422 	spin_unlock(&block_rsv->lock);
5423 }
5424 
5425 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5426 			     struct btrfs_block_rsv *dest, u64 num_bytes,
5427 			     int min_factor)
5428 {
5429 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5430 	u64 min_bytes;
5431 
5432 	if (global_rsv->space_info != dest->space_info)
5433 		return -ENOSPC;
5434 
5435 	spin_lock(&global_rsv->lock);
5436 	min_bytes = div_factor(global_rsv->size, min_factor);
5437 	if (global_rsv->reserved < min_bytes + num_bytes) {
5438 		spin_unlock(&global_rsv->lock);
5439 		return -ENOSPC;
5440 	}
5441 	global_rsv->reserved -= num_bytes;
5442 	if (global_rsv->reserved < global_rsv->size)
5443 		global_rsv->full = 0;
5444 	spin_unlock(&global_rsv->lock);
5445 
5446 	block_rsv_add_bytes(dest, num_bytes, true);
5447 	return 0;
5448 }
5449 
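/*
 * The min_factor above is in tenths of the global rsv size (see div_factor()),
 * so the migration only succeeds if the global rsv would still hold at least
 * size * min_factor / 10 bytes afterwards.  A sketch with hypothetical caller
 * variables, keeping at least half of the global rsv intact:
 *
 *	ret = btrfs_cond_migrate_bytes(fs_info, dest_rsv, nr_bytes, 5);
 *	if (ret == -ENOSPC)
 *		ret = btrfs_block_rsv_add(root, dest_rsv, nr_bytes,
 *					  BTRFS_RESERVE_FLUSH_ALL);
 */
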
5450 /**
5451  * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5452  * @fs_info - the fs info for our fs.
5453  * @src - the source block rsv to transfer from.
5454  * @num_bytes - the number of bytes to transfer.
5455  *
5456  * This transfers up to the num_bytes amount from the src rsv to the
5457  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
5458  */
5459 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5460 				       struct btrfs_block_rsv *src,
5461 				       u64 num_bytes)
5462 {
5463 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5464 	u64 to_free = 0;
5465 
5466 	spin_lock(&src->lock);
5467 	src->reserved -= num_bytes;
5468 	src->size -= num_bytes;
5469 	spin_unlock(&src->lock);
5470 
5471 	spin_lock(&delayed_refs_rsv->lock);
5472 	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5473 		u64 delta = delayed_refs_rsv->size -
5474 			delayed_refs_rsv->reserved;
5475 		if (num_bytes > delta) {
5476 			to_free = num_bytes - delta;
5477 			num_bytes = delta;
5478 		}
5479 	} else {
5480 		to_free = num_bytes;
5481 		num_bytes = 0;
5482 	}
5483 
5484 	if (num_bytes)
5485 		delayed_refs_rsv->reserved += num_bytes;
5486 	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5487 		delayed_refs_rsv->full = 1;
5488 	spin_unlock(&delayed_refs_rsv->lock);
5489 
5490 	if (num_bytes)
5491 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5492 					      0, num_bytes, 1);
5493 	if (to_free)
5494 		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
5495 					 to_free);
5496 }
5497 
5498 /**
5499  * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5500  * @fs_info - the fs_info for our fs.
5501  * @flush - control how we can flush for this reservation.
5502  *
5503  * This will refill the delayed refs block_rsv up to one item's worth of space
5504  * and will return -ENOSPC if we can't make the reservation.
5505  */
5506 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5507 				  enum btrfs_reserve_flush_enum flush)
5508 {
5509 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5510 	u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5511 	u64 num_bytes = 0;
5512 	int ret = -ENOSPC;
5513 
5514 	spin_lock(&block_rsv->lock);
5515 	if (block_rsv->reserved < block_rsv->size) {
5516 		num_bytes = block_rsv->size - block_rsv->reserved;
5517 		num_bytes = min(num_bytes, limit);
5518 	}
5519 	spin_unlock(&block_rsv->lock);
5520 
5521 	if (!num_bytes)
5522 		return 0;
5523 
5524 	ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5525 				     num_bytes, flush);
5526 	if (ret)
5527 		return ret;
5528 	block_rsv_add_bytes(block_rsv, num_bytes, false);
5529 	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5530 				      0, num_bytes, 1);
5531 	return 0;
5532 }
5533 
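/*
 * A minimal sketch of refilling the delayed refs rsv from a context that is
 * allowed to do limited flushing (the caller shown is hypothetical):
 *
 *	ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_LIMIT);
 *	if (ret)
 *		return ret;
 */
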
5534 /*
5535  * This is for space we already have accounted in space_info->bytes_may_use, so
5536  * basically when we're returning space from block_rsv's.
5537  */
5538 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5539 				     struct btrfs_space_info *space_info,
5540 				     u64 num_bytes)
5541 {
5542 	struct reserve_ticket *ticket;
5543 	struct list_head *head;
5544 	u64 used;
5545 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5546 	bool check_overcommit = false;
5547 
5548 	spin_lock(&space_info->lock);
5549 	head = &space_info->priority_tickets;
5550 
5551 	/*
5552 	 * If we are over our limit then we need to check and see if we can
5553 	 * overcommit, and if we can't then we just need to free up our space
5554 	 * and not satisfy any requests.
5555 	 */
5556 	used = btrfs_space_info_used(space_info, true);
5557 	if (used - num_bytes >= space_info->total_bytes)
5558 		check_overcommit = true;
5559 again:
5560 	while (!list_empty(head) && num_bytes) {
5561 		ticket = list_first_entry(head, struct reserve_ticket,
5562 					  list);
5563 		/*
5564 		 * We use 0 bytes because this space is already reserved, so
5565 		 * adding the ticket space would be a double count.
5566 		 */
5567 		if (check_overcommit &&
5568 		    !can_overcommit(fs_info, space_info, 0, flush, false))
5569 			break;
5570 		if (num_bytes >= ticket->bytes) {
5571 			list_del_init(&ticket->list);
5572 			num_bytes -= ticket->bytes;
5573 			ticket->bytes = 0;
5574 			space_info->tickets_id++;
5575 			wake_up(&ticket->wait);
5576 		} else {
5577 			ticket->bytes -= num_bytes;
5578 			num_bytes = 0;
5579 		}
5580 	}
5581 
5582 	if (num_bytes && head == &space_info->priority_tickets) {
5583 		head = &space_info->tickets;
5584 		flush = BTRFS_RESERVE_FLUSH_ALL;
5585 		goto again;
5586 	}
5587 	update_bytes_may_use(space_info, -num_bytes);
5588 	trace_btrfs_space_reservation(fs_info, "space_info",
5589 				      space_info->flags, num_bytes, 0);
5590 	spin_unlock(&space_info->lock);
5591 }
5592 
5593 /*
5594  * This is for newly allocated space that isn't accounted in
5595  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5596  * we use this helper.
5597  */
5598 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5599 				     struct btrfs_space_info *space_info,
5600 				     u64 num_bytes)
5601 {
5602 	struct reserve_ticket *ticket;
5603 	struct list_head *head = &space_info->priority_tickets;
5604 
5605 again:
5606 	while (!list_empty(head) && num_bytes) {
5607 		ticket = list_first_entry(head, struct reserve_ticket,
5608 					  list);
5609 		if (num_bytes >= ticket->bytes) {
5610 			trace_btrfs_space_reservation(fs_info, "space_info",
5611 						      space_info->flags,
5612 						      ticket->bytes, 1);
5613 			list_del_init(&ticket->list);
5614 			num_bytes -= ticket->bytes;
5615 			update_bytes_may_use(space_info, ticket->bytes);
5616 			ticket->bytes = 0;
5617 			space_info->tickets_id++;
5618 			wake_up(&ticket->wait);
5619 		} else {
5620 			trace_btrfs_space_reservation(fs_info, "space_info",
5621 						      space_info->flags,
5622 						      num_bytes, 1);
5623 			update_bytes_may_use(space_info, num_bytes);
5624 			ticket->bytes -= num_bytes;
5625 			num_bytes = 0;
5626 		}
5627 	}
5628 
5629 	if (num_bytes && head == &space_info->priority_tickets) {
5630 		head = &space_info->tickets;
5631 		goto again;
5632 	}
5633 }
5634 
5635 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5636 				    struct btrfs_block_rsv *block_rsv,
5637 				    struct btrfs_block_rsv *dest, u64 num_bytes,
5638 				    u64 *qgroup_to_release_ret)
5639 {
5640 	struct btrfs_space_info *space_info = block_rsv->space_info;
5641 	u64 qgroup_to_release = 0;
5642 	u64 ret;
5643 
5644 	spin_lock(&block_rsv->lock);
5645 	if (num_bytes == (u64)-1) {
5646 		num_bytes = block_rsv->size;
5647 		qgroup_to_release = block_rsv->qgroup_rsv_size;
5648 	}
5649 	block_rsv->size -= num_bytes;
5650 	if (block_rsv->reserved >= block_rsv->size) {
5651 		num_bytes = block_rsv->reserved - block_rsv->size;
5652 		block_rsv->reserved = block_rsv->size;
5653 		block_rsv->full = 1;
5654 	} else {
5655 		num_bytes = 0;
5656 	}
5657 	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5658 		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5659 				    block_rsv->qgroup_rsv_size;
5660 		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5661 	} else {
5662 		qgroup_to_release = 0;
5663 	}
5664 	spin_unlock(&block_rsv->lock);
5665 
5666 	ret = num_bytes;
5667 	if (num_bytes > 0) {
5668 		if (dest) {
5669 			spin_lock(&dest->lock);
5670 			if (!dest->full) {
5671 				u64 bytes_to_add;
5672 
5673 				bytes_to_add = dest->size - dest->reserved;
5674 				bytes_to_add = min(num_bytes, bytes_to_add);
5675 				dest->reserved += bytes_to_add;
5676 				if (dest->reserved >= dest->size)
5677 					dest->full = 1;
5678 				num_bytes -= bytes_to_add;
5679 			}
5680 			spin_unlock(&dest->lock);
5681 		}
5682 		if (num_bytes)
5683 			space_info_add_old_bytes(fs_info, space_info,
5684 						 num_bytes);
5685 	}
5686 	if (qgroup_to_release_ret)
5687 		*qgroup_to_release_ret = qgroup_to_release;
5688 	return ret;
5689 }
5690 
5691 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5692 			    struct btrfs_block_rsv *dst, u64 num_bytes,
5693 			    bool update_size)
5694 {
5695 	int ret;
5696 
5697 	ret = block_rsv_use_bytes(src, num_bytes);
5698 	if (ret)
5699 		return ret;
5700 
5701 	block_rsv_add_bytes(dst, num_bytes, update_size);
5702 	return 0;
5703 }
5704 
5705 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5706 {
5707 	memset(rsv, 0, sizeof(*rsv));
5708 	spin_lock_init(&rsv->lock);
5709 	rsv->type = type;
5710 }
5711 
5712 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5713 				   struct btrfs_block_rsv *rsv,
5714 				   unsigned short type)
5715 {
5716 	btrfs_init_block_rsv(rsv, type);
5717 	rsv->space_info = __find_space_info(fs_info,
5718 					    BTRFS_BLOCK_GROUP_METADATA);
5719 }
5720 
5721 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5722 					      unsigned short type)
5723 {
5724 	struct btrfs_block_rsv *block_rsv;
5725 
5726 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5727 	if (!block_rsv)
5728 		return NULL;
5729 
5730 	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5731 	return block_rsv;
5732 }
5733 
5734 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5735 			  struct btrfs_block_rsv *rsv)
5736 {
5737 	if (!rsv)
5738 		return;
5739 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5740 	kfree(rsv);
5741 }
5742 
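/*
 * The alloc/free helpers above are typically paired with btrfs_block_rsv_add()
 * below for short lived reservations.  A sketch of that lifecycle, assuming a
 * BTRFS_BLOCK_RSV_TEMP rsv and a one item reservation:
 *
 *	struct btrfs_block_rsv *rsv;
 *
 *	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv,
 *				  btrfs_calc_trans_metadata_size(fs_info, 1),
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret) {
 *		...use the space, e.g. as trans->block_rsv...
 *	}
 *	btrfs_free_block_rsv(fs_info, rsv);
 */
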
5743 int btrfs_block_rsv_add(struct btrfs_root *root,
5744 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5745 			enum btrfs_reserve_flush_enum flush)
5746 {
5747 	int ret;
5748 
5749 	if (num_bytes == 0)
5750 		return 0;
5751 
5752 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5753 	if (!ret)
5754 		block_rsv_add_bytes(block_rsv, num_bytes, true);
5755 
5756 	return ret;
5757 }
5758 
5759 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5760 {
5761 	u64 num_bytes = 0;
5762 	int ret = -ENOSPC;
5763 
5764 	if (!block_rsv)
5765 		return 0;
5766 
5767 	spin_lock(&block_rsv->lock);
5768 	num_bytes = div_factor(block_rsv->size, min_factor);
5769 	if (block_rsv->reserved >= num_bytes)
5770 		ret = 0;
5771 	spin_unlock(&block_rsv->lock);
5772 
5773 	return ret;
5774 }
5775 
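/*
 * As with btrfs_cond_migrate_bytes() above, min_factor is in tenths of the
 * rsv size, so passing 5 asks whether at least 50% of block_rsv->size is
 * currently reserved.  A sketch against the global rsv:
 *
 *	if (btrfs_block_rsv_check(&fs_info->global_block_rsv, 5))
 *		return -ENOSPC;
 */
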
5776 int btrfs_block_rsv_refill(struct btrfs_root *root,
5777 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5778 			   enum btrfs_reserve_flush_enum flush)
5779 {
5780 	u64 num_bytes = 0;
5781 	int ret = -ENOSPC;
5782 
5783 	if (!block_rsv)
5784 		return 0;
5785 
5786 	spin_lock(&block_rsv->lock);
5787 	num_bytes = min_reserved;
5788 	if (block_rsv->reserved >= num_bytes)
5789 		ret = 0;
5790 	else
5791 		num_bytes -= block_rsv->reserved;
5792 	spin_unlock(&block_rsv->lock);
5793 
5794 	if (!ret)
5795 		return 0;
5796 
5797 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5798 	if (!ret) {
5799 		block_rsv_add_bytes(block_rsv, num_bytes, false);
5800 		return 0;
5801 	}
5802 
5803 	return ret;
5804 }
5805 
5806 static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
5807 				u64 *metadata_bytes, u64 *qgroup_bytes)
5808 {
5809 	*metadata_bytes = 0;
5810 	*qgroup_bytes = 0;
5811 
5812 	spin_lock(&block_rsv->lock);
5813 	if (block_rsv->reserved < block_rsv->size)
5814 		*metadata_bytes = block_rsv->size - block_rsv->reserved;
5815 	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5816 		*qgroup_bytes = block_rsv->qgroup_rsv_size -
5817 			block_rsv->qgroup_rsv_reserved;
5818 	spin_unlock(&block_rsv->lock);
5819 }
5820 
5821 /**
5822  * btrfs_inode_rsv_refill - refill the inode block rsv.
5823  * @inode - the inode we are refilling.
5824  * @flush - the flushing restriction.
5825  *
5826  * Essentially the same as btrfs_block_rsv_refill, except it uses the
5827  * block_rsv->size as the minimum size.  We'll either refill the missing amount
5828  * or return if we already have enough space.  This will also handle the reserve
5829  * tracepoint for the reserved amount.
5830  */
5831 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5832 				  enum btrfs_reserve_flush_enum flush)
5833 {
5834 	struct btrfs_root *root = inode->root;
5835 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5836 	u64 num_bytes, last = 0;
5837 	u64 qgroup_num_bytes;
5838 	int ret = -ENOSPC;
5839 
5840 	calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
5841 	if (num_bytes == 0)
5842 		return 0;
5843 
5844 	do {
5845 		ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
5846 							 true);
5847 		if (ret)
5848 			return ret;
5849 		ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5850 		if (ret) {
5851 			btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5852 			last = num_bytes;
5853 			/*
5854 			 * If we are fragmented we can end up with a lot of
5855 			 * outstanding extents which will make our size be much
5856 			 * larger than our reserved amount.
5857 			 *
5858 			 * The reservation made here might end up being much
5859 			 * bigger than what is actually needed once delalloc
5860 			 * flushing has happened.
5861 			 *
5862 			 * If this is the case try and do the reserve again.
5863 			 */
5864 			if (flush == BTRFS_RESERVE_FLUSH_ALL)
5865 				calc_refill_bytes(block_rsv, &num_bytes,
5866 						   &qgroup_num_bytes);
5867 			if (num_bytes == 0)
5868 				return 0;
5869 		}
5870 	} while (ret && last != num_bytes);
5871 
5872 	if (!ret) {
5873 		block_rsv_add_bytes(block_rsv, num_bytes, false);
5874 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
5875 					      btrfs_ino(inode), num_bytes, 1);
5876 
5877 		/* Don't forget to increase qgroup_rsv_reserved */
5878 		spin_lock(&block_rsv->lock);
5879 		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5880 		spin_unlock(&block_rsv->lock);
5881 	}
5882 	return ret;
5883 }
5884 
5885 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5886 				     struct btrfs_block_rsv *block_rsv,
5887 				     u64 num_bytes, u64 *qgroup_to_release)
5888 {
5889 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5890 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5891 	struct btrfs_block_rsv *target = delayed_rsv;
5892 
5893 	if (target->full || target == block_rsv)
5894 		target = global_rsv;
5895 
5896 	if (block_rsv->space_info != target->space_info)
5897 		target = NULL;
5898 
5899 	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5900 				       qgroup_to_release);
5901 }
5902 
5903 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5904 			     struct btrfs_block_rsv *block_rsv,
5905 			     u64 num_bytes)
5906 {
5907 	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5908 }
5909 
5910 /**
5911  * btrfs_inode_rsv_release - release any excessive reservation.
5912  * @inode - the inode we need to release from.
5913  * @qgroup_free - free or convert qgroup meta.
5914  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5915  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5916  *   @qgroup_free is true for error handling, and false for normal release.
5917  *
5918  * This is the same as btrfs_block_rsv_release, except that it handles the
5919  * tracepoint for the reservation.
5920  */
5921 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5922 {
5923 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5924 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5925 	u64 released = 0;
5926 	u64 qgroup_to_release = 0;
5927 
5928 	/*
5929 	 * Since we statically set the block_rsv->size we just want to say we
5930 	 * are releasing 0 bytes, and then we'll just get the reservation over
5931 	 * the size freed.
5932 	 */
5933 	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5934 					     &qgroup_to_release);
5935 	if (released > 0)
5936 		trace_btrfs_space_reservation(fs_info, "delalloc",
5937 					      btrfs_ino(inode), released, 0);
5938 	if (qgroup_free)
5939 		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5940 	else
5941 		btrfs_qgroup_convert_reserved_meta(inode->root,
5942 						   qgroup_to_release);
5943 }
5944 
5945 /**
5946  * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5947  * @fs_info - the fs_info for our fs.
5948  * @nr - the number of items to drop.
5949  *
5950  * This drops the delayed ref head's count from the delayed refs rsv and frees
5951  * any excess reservation we had.
5952  */
5953 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5954 {
5955 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5956 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5957 	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5958 	u64 released = 0;
5959 
5960 	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5961 					   num_bytes, NULL);
5962 	if (released)
5963 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5964 					      0, released, 0);
5965 }
5966 
5967 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5968 {
5969 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5970 	struct btrfs_space_info *sinfo = block_rsv->space_info;
5971 	u64 num_bytes;
5972 
5973 	/*
5974 	 * The global block rsv is based on the size of the extent tree, the
5975 	 * checksum tree and the root tree.  If the fs is empty we want to set
5976 	 * it to a minimal amount for safety.
5977 	 */
5978 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5979 		btrfs_root_used(&fs_info->csum_root->root_item) +
5980 		btrfs_root_used(&fs_info->tree_root->root_item);
5981 	num_bytes = max_t(u64, num_bytes, SZ_16M);
5982 
5983 	spin_lock(&sinfo->lock);
5984 	spin_lock(&block_rsv->lock);
5985 
5986 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5987 
5988 	if (block_rsv->reserved < block_rsv->size) {
5989 		num_bytes = btrfs_space_info_used(sinfo, true);
5990 		if (sinfo->total_bytes > num_bytes) {
5991 			num_bytes = sinfo->total_bytes - num_bytes;
5992 			num_bytes = min(num_bytes,
5993 					block_rsv->size - block_rsv->reserved);
5994 			block_rsv->reserved += num_bytes;
5995 			update_bytes_may_use(sinfo, num_bytes);
5996 			trace_btrfs_space_reservation(fs_info, "space_info",
5997 						      sinfo->flags, num_bytes,
5998 						      1);
5999 		}
6000 	} else if (block_rsv->reserved > block_rsv->size) {
6001 		num_bytes = block_rsv->reserved - block_rsv->size;
6002 		update_bytes_may_use(sinfo, -num_bytes);
6003 		trace_btrfs_space_reservation(fs_info, "space_info",
6004 				      sinfo->flags, num_bytes, 0);
6005 		block_rsv->reserved = block_rsv->size;
6006 	}
6007 
6008 	if (block_rsv->reserved == block_rsv->size)
6009 		block_rsv->full = 1;
6010 	else
6011 		block_rsv->full = 0;
6012 
6013 	spin_unlock(&block_rsv->lock);
6014 	spin_unlock(&sinfo->lock);
6015 }
6016 
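/*
 * A worked example for the sizing above (numbers are made up): if the extent,
 * csum and root trees together use 300M of metadata, block_rsv->size becomes
 * min(max(300M, 16M), 512M) = 300M.  On a nearly empty fs the max_t() keeps
 * the size at 16M, and on a very large fs it is capped at 512M.
 */
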
6017 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
6018 {
6019 	struct btrfs_space_info *space_info;
6020 
6021 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
6022 	fs_info->chunk_block_rsv.space_info = space_info;
6023 
6024 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
6025 	fs_info->global_block_rsv.space_info = space_info;
6026 	fs_info->trans_block_rsv.space_info = space_info;
6027 	fs_info->empty_block_rsv.space_info = space_info;
6028 	fs_info->delayed_block_rsv.space_info = space_info;
6029 	fs_info->delayed_refs_rsv.space_info = space_info;
6030 
6031 	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
6032 	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
6033 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
6034 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
6035 	if (fs_info->quota_root)
6036 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
6037 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
6038 
6039 	update_global_block_rsv(fs_info);
6040 }
6041 
6042 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
6043 {
6044 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
6045 				(u64)-1, NULL);
6046 	WARN_ON(fs_info->trans_block_rsv.size > 0);
6047 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
6048 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
6049 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
6050 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
6051 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
6052 	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
6053 	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
6054 }
6055 
6056 /*
6057  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
6058  * @trans - the trans that may have generated delayed refs
6059  *
6060  * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
6061  * it'll calculate the additional size and add it to the delayed_refs_rsv.
6062  */
6063 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
6064 {
6065 	struct btrfs_fs_info *fs_info = trans->fs_info;
6066 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
6067 	u64 num_bytes;
6068 
6069 	if (!trans->delayed_ref_updates)
6070 		return;
6071 
6072 	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
6073 						   trans->delayed_ref_updates);
6074 	spin_lock(&delayed_rsv->lock);
6075 	delayed_rsv->size += num_bytes;
6076 	delayed_rsv->full = 0;
6077 	spin_unlock(&delayed_rsv->lock);
6078 	trans->delayed_ref_updates = 0;
6079 }
6080 
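/*
 * A sketch of the expected calling pattern, mirroring what update_block_group()
 * further down in this file does (the increment shown is hypothetical):
 *
 *	trans->delayed_ref_updates++;
 *	btrfs_update_delayed_refs_rsv(trans);
 */
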
6081 /*
6082  * To be called after all the new block groups attached to the transaction
6083  * handle have been created (btrfs_create_pending_block_groups()).
6084  */
6085 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
6086 {
6087 	struct btrfs_fs_info *fs_info = trans->fs_info;
6088 
6089 	if (!trans->chunk_bytes_reserved)
6090 		return;
6091 
6092 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
6093 
6094 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
6095 				trans->chunk_bytes_reserved, NULL);
6096 	trans->chunk_bytes_reserved = 0;
6097 }
6098 
6099 /*
6100  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
6101  * root: the root of the parent directory
6102  * rsv: block reservation
6103  * items: the number of items that we need to reserve space for
6104  * use_global_rsv: allow fallback to the global block reservation
6105  *
6106  * This function is used to reserve the space for snapshot/subvolume
6107  * creation and deletion.  Those operations differ from the common
6108  * file/directory operations: they change two fs/file trees and the
6109  * root tree, and the number of items that the qgroups reserve is
6110  * different from the free space reservation.  So we cannot use the
6111  * space reservation mechanism in start_transaction().
6112  */
6113 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
6114 				     struct btrfs_block_rsv *rsv, int items,
6115 				     bool use_global_rsv)
6116 {
6117 	u64 qgroup_num_bytes = 0;
6118 	u64 num_bytes;
6119 	int ret;
6120 	struct btrfs_fs_info *fs_info = root->fs_info;
6121 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6122 
6123 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6124 		/* One for parent inode, two for dir entries */
6125 		qgroup_num_bytes = 3 * fs_info->nodesize;
6126 		ret = btrfs_qgroup_reserve_meta_prealloc(root,
6127 				qgroup_num_bytes, true);
6128 		if (ret)
6129 			return ret;
6130 	}
6131 
6132 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
6133 	rsv->space_info = __find_space_info(fs_info,
6134 					    BTRFS_BLOCK_GROUP_METADATA);
6135 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
6136 				  BTRFS_RESERVE_FLUSH_ALL);
6137 
6138 	if (ret == -ENOSPC && use_global_rsv)
6139 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
6140 
6141 	if (ret && qgroup_num_bytes)
6142 		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
6143 
6144 	return ret;
6145 }
6146 
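/*
 * A sketch of how snapshot/subvolume creation might use the helper above (the
 * item count and the rsv type are illustrative assumptions):
 *
 *	struct btrfs_block_rsv block_rsv;
 *
 *	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, true);
 *	if (ret)
 *		return ret;
 *	...start the transaction and create the subvolume items...
 *	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
 */
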
6147 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
6148 				      struct btrfs_block_rsv *rsv)
6149 {
6150 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
6151 }
6152 
6153 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
6154 						 struct btrfs_inode *inode)
6155 {
6156 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6157 	u64 reserve_size = 0;
6158 	u64 qgroup_rsv_size = 0;
6159 	u64 csum_leaves;
6160 	unsigned outstanding_extents;
6161 
6162 	lockdep_assert_held(&inode->lock);
6163 	outstanding_extents = inode->outstanding_extents;
6164 	if (outstanding_extents)
6165 		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6166 						outstanding_extents + 1);
6167 	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6168 						 inode->csum_bytes);
6169 	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6170 						       csum_leaves);
6171 	/*
6172 	 * For qgroup rsv, the calculation is very simple:
6173 	 * account one nodesize for each outstanding extent
6174 	 *
6175 	 * This is overestimating in most cases.
6176 	 * This is an overestimate in most cases.
6177 	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
6178 
6179 	spin_lock(&block_rsv->lock);
6180 	block_rsv->size = reserve_size;
6181 	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6182 	spin_unlock(&block_rsv->lock);
6183 }
6184 
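/*
 * A worked example of the calculation above, with made up numbers: an inode
 * with 2 outstanding extents and enough dirty csum bytes to need 1 csum leaf
 * gets
 *
 *	reserve_size = btrfs_calc_trans_metadata_size(fs_info, 2 + 1) +
 *		       btrfs_calc_trans_metadata_size(fs_info, 1);
 *
 * while the qgroup reservation is only 2 * nodesize, one node per outstanding
 * extent.
 */
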
6185 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6186 {
6187 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6188 	unsigned nr_extents;
6189 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6190 	int ret = 0;
6191 	bool delalloc_lock = true;
6192 
6193 	/* If we are a free space inode we need to not flush since we will be in
6194 	 * the middle of a transaction commit.  We also don't need the delalloc
6195 	 * mutex since we won't race with anybody.  We need this mostly to make
6196 	 * lockdep shut its filthy mouth.
6197 	 *
6198 	 * If we have a transaction open (can happen if we call truncate_block
6199 	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6200 	 */
6201 	if (btrfs_is_free_space_inode(inode)) {
6202 		flush = BTRFS_RESERVE_NO_FLUSH;
6203 		delalloc_lock = false;
6204 	} else {
6205 		if (current->journal_info)
6206 			flush = BTRFS_RESERVE_FLUSH_LIMIT;
6207 
6208 		if (btrfs_transaction_in_commit(fs_info))
6209 			schedule_timeout(1);
6210 	}
6211 
6212 	if (delalloc_lock)
6213 		mutex_lock(&inode->delalloc_mutex);
6214 
6215 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6216 
6217 	/* Add our new extents and calculate the new rsv size. */
6218 	spin_lock(&inode->lock);
6219 	nr_extents = count_max_extents(num_bytes);
6220 	btrfs_mod_outstanding_extents(inode, nr_extents);
6221 	inode->csum_bytes += num_bytes;
6222 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6223 	spin_unlock(&inode->lock);
6224 
6225 	ret = btrfs_inode_rsv_refill(inode, flush);
6226 	if (unlikely(ret))
6227 		goto out_fail;
6228 
6229 	if (delalloc_lock)
6230 		mutex_unlock(&inode->delalloc_mutex);
6231 	return 0;
6232 
6233 out_fail:
6234 	spin_lock(&inode->lock);
6235 	nr_extents = count_max_extents(num_bytes);
6236 	btrfs_mod_outstanding_extents(inode, -nr_extents);
6237 	inode->csum_bytes -= num_bytes;
6238 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6239 	spin_unlock(&inode->lock);
6240 
6241 	btrfs_inode_rsv_release(inode, true);
6242 	if (delalloc_lock)
6243 		mutex_unlock(&inode->delalloc_mutex);
6244 	return ret;
6245 }
6246 
6247 /**
6248  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6249  * @inode: the inode to release the reservation for.
6250  * @num_bytes: the number of bytes we are releasing.
6251  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6252  *
6253  * This will release the metadata reservation for an inode.  This can be called
6254  * once we complete IO for a given set of bytes to release their metadata
6255  * reservations, or on error for the same reason.
6256  */
6257 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6258 				     bool qgroup_free)
6259 {
6260 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6261 
6262 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6263 	spin_lock(&inode->lock);
6264 	inode->csum_bytes -= num_bytes;
6265 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6266 	spin_unlock(&inode->lock);
6267 
6268 	if (btrfs_is_testing(fs_info))
6269 		return;
6270 
6271 	btrfs_inode_rsv_release(inode, qgroup_free);
6272 }
6273 
6274 /**
6275  * btrfs_delalloc_release_extents - release our outstanding_extents
6276  * @inode: the inode to balance the reservation for.
6277  * @num_bytes: the number of bytes we originally reserved with
6278  * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6279  *
6280  * When we reserve space we increase outstanding_extents for the extents we may
6281  * add.  Once we've set the range as delalloc or created our ordered extents we
6282  * have outstanding_extents to track the real usage, so we use this to free our
6283  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6284  * with btrfs_delalloc_reserve_metadata.
6285  */
6286 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6287 				    bool qgroup_free)
6288 {
6289 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6290 	unsigned num_extents;
6291 
6292 	spin_lock(&inode->lock);
6293 	num_extents = count_max_extents(num_bytes);
6294 	btrfs_mod_outstanding_extents(inode, -num_extents);
6295 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6296 	spin_unlock(&inode->lock);
6297 
6298 	if (btrfs_is_testing(fs_info))
6299 		return;
6300 
6301 	btrfs_inode_rsv_release(inode, qgroup_free);
6302 }
6303 
6304 /**
6305  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6306  * delalloc
6307  * @inode: inode we're writing to
6308  * @start: start range we are writing to
6309  * @len: how long the range we are writing to
6310  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6311  * 	      current reservation.
6312  *
6313  * This will do the following things
6314  *
6315  * o reserve space in data space info for num bytes
6316  *   and reserve precious corresponding qgroup space
6317  *   (Done in check_data_free_space)
6318  *
6319  * o reserve space for metadata space, based on the number of outstanding
6320  *   extents and how much csums will be needed
6321  *   also reserve metadata space in a per root over-reserve method.
6322  * o add to the inode's ->delalloc_bytes
6323  * o add it to the fs_info's delalloc inodes list.
6324  *   (Above 3 all done in delalloc_reserve_metadata)
6325  *
6326  * Return 0 for success
6327  * Return <0 for error(-ENOSPC or -EQUOT)
6328  */
6329 int btrfs_delalloc_reserve_space(struct inode *inode,
6330 			struct extent_changeset **reserved, u64 start, u64 len)
6331 {
6332 	int ret;
6333 
6334 	ret = btrfs_check_data_free_space(inode, reserved, start, len);
6335 	if (ret < 0)
6336 		return ret;
6337 	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6338 	if (ret < 0)
6339 		btrfs_free_reserved_data_space(inode, *reserved, start, len);
6340 	return ret;
6341 }
6342 
6343 /**
6344  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6345  * @inode: inode we're releasing space for
6346  * @reserved: the extent_changeset recording the reserved qgroup ranges
6347  * @start: start position of the space already reserved
6348  * @len: the length of the space already reserved
6349  * @qgroup_free: free the qgroup reservation or convert it to per-trans
6350  *
6351  * This will release the metadata space that was not used, decrement the
6352  * inode's ->delalloc_bytes and remove it from the fs_info delalloc_inodes list
6353  * if there are no delalloc bytes left, and handle the qgroup reserved space.
6354  */
6355 void btrfs_delalloc_release_space(struct inode *inode,
6356 				  struct extent_changeset *reserved,
6357 				  u64 start, u64 len, bool qgroup_free)
6358 {
6359 	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6360 	btrfs_free_reserved_data_space(inode, reserved, start, len);
6361 }
6362 
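/*
 * A sketch of how a buffered write might pair the delalloc helpers above
 * (variable names are hypothetical, and whether the qgroup meta reservation
 * is freed or converted depends on whether the write ultimately succeeds):
 *
 *	struct extent_changeset *data_reserved = NULL;
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, count);
 *	if (ret)
 *		return ret;
 *	...copy the data and mark the range delalloc / create ordered extents...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
 *	extent_changeset_free(data_reserved);
 */
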
6363 static int update_block_group(struct btrfs_trans_handle *trans,
6364 			      struct btrfs_fs_info *info, u64 bytenr,
6365 			      u64 num_bytes, int alloc)
6366 {
6367 	struct btrfs_block_group_cache *cache = NULL;
6368 	u64 total = num_bytes;
6369 	u64 old_val;
6370 	u64 byte_in_group;
6371 	int factor;
6372 	int ret = 0;
6373 
6374 	/* block accounting for super block */
6375 	spin_lock(&info->delalloc_root_lock);
6376 	old_val = btrfs_super_bytes_used(info->super_copy);
6377 	if (alloc)
6378 		old_val += num_bytes;
6379 	else
6380 		old_val -= num_bytes;
6381 	btrfs_set_super_bytes_used(info->super_copy, old_val);
6382 	spin_unlock(&info->delalloc_root_lock);
6383 
6384 	while (total) {
6385 		cache = btrfs_lookup_block_group(info, bytenr);
6386 		if (!cache) {
6387 			ret = -ENOENT;
6388 			break;
6389 		}
6390 		factor = btrfs_bg_type_to_factor(cache->flags);
6391 
6392 		/*
6393 		 * If this block group has free space cache written out, we
6394 		 * need to make sure to load it if we are removing space.  This
6395 		 * is because we need the unpinning stage to actually add the
6396 		 * space back to the block group, otherwise we will leak space.
6397 		 */
6398 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
6399 			cache_block_group(cache, 1);
6400 
6401 		byte_in_group = bytenr - cache->key.objectid;
6402 		WARN_ON(byte_in_group > cache->key.offset);
6403 
6404 		spin_lock(&cache->space_info->lock);
6405 		spin_lock(&cache->lock);
6406 
6407 		if (btrfs_test_opt(info, SPACE_CACHE) &&
6408 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
6409 			cache->disk_cache_state = BTRFS_DC_CLEAR;
6410 
6411 		old_val = btrfs_block_group_used(&cache->item);
6412 		num_bytes = min(total, cache->key.offset - byte_in_group);
6413 		if (alloc) {
6414 			old_val += num_bytes;
6415 			btrfs_set_block_group_used(&cache->item, old_val);
6416 			cache->reserved -= num_bytes;
6417 			cache->space_info->bytes_reserved -= num_bytes;
6418 			cache->space_info->bytes_used += num_bytes;
6419 			cache->space_info->disk_used += num_bytes * factor;
6420 			spin_unlock(&cache->lock);
6421 			spin_unlock(&cache->space_info->lock);
6422 		} else {
6423 			old_val -= num_bytes;
6424 			btrfs_set_block_group_used(&cache->item, old_val);
6425 			cache->pinned += num_bytes;
6426 			update_bytes_pinned(cache->space_info, num_bytes);
6427 			cache->space_info->bytes_used -= num_bytes;
6428 			cache->space_info->disk_used -= num_bytes * factor;
6429 			spin_unlock(&cache->lock);
6430 			spin_unlock(&cache->space_info->lock);
6431 
6432 			trace_btrfs_space_reservation(info, "pinned",
6433 						      cache->space_info->flags,
6434 						      num_bytes, 1);
6435 			percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6436 					   num_bytes,
6437 					   BTRFS_TOTAL_BYTES_PINNED_BATCH);
6438 			set_extent_dirty(info->pinned_extents,
6439 					 bytenr, bytenr + num_bytes - 1,
6440 					 GFP_NOFS | __GFP_NOFAIL);
6441 		}
6442 
6443 		spin_lock(&trans->transaction->dirty_bgs_lock);
6444 		if (list_empty(&cache->dirty_list)) {
6445 			list_add_tail(&cache->dirty_list,
6446 				      &trans->transaction->dirty_bgs);
6447 			trans->transaction->num_dirty_bgs++;
6448 			trans->delayed_ref_updates++;
6449 			btrfs_get_block_group(cache);
6450 		}
6451 		spin_unlock(&trans->transaction->dirty_bgs_lock);
6452 
6453 		/*
6454 		 * No longer have used bytes in this block group, queue it for
6455 		 * deletion. We do this after adding the block group to the
6456 		 * dirty list to avoid races between cleaner kthread and space
6457 		 * cache writeout.
6458 		 */
6459 		if (!alloc && old_val == 0)
6460 			btrfs_mark_bg_unused(cache);
6461 
6462 		btrfs_put_block_group(cache);
6463 		total -= num_bytes;
6464 		bytenr += num_bytes;
6465 	}
6466 
6467 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
6468 	btrfs_update_delayed_refs_rsv(trans);
6469 	return ret;
6470 }
6471 
6472 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6473 {
6474 	struct btrfs_block_group_cache *cache;
6475 	u64 bytenr;
6476 
6477 	spin_lock(&fs_info->block_group_cache_lock);
6478 	bytenr = fs_info->first_logical_byte;
6479 	spin_unlock(&fs_info->block_group_cache_lock);
6480 
6481 	if (bytenr < (u64)-1)
6482 		return bytenr;
6483 
6484 	cache = btrfs_lookup_first_block_group(fs_info, search_start);
6485 	if (!cache)
6486 		return 0;
6487 
6488 	bytenr = cache->key.objectid;
6489 	btrfs_put_block_group(cache);
6490 
6491 	return bytenr;
6492 }
6493 
6494 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6495 			   struct btrfs_block_group_cache *cache,
6496 			   u64 bytenr, u64 num_bytes, int reserved)
6497 {
6498 	spin_lock(&cache->space_info->lock);
6499 	spin_lock(&cache->lock);
6500 	cache->pinned += num_bytes;
6501 	update_bytes_pinned(cache->space_info, num_bytes);
6502 	if (reserved) {
6503 		cache->reserved -= num_bytes;
6504 		cache->space_info->bytes_reserved -= num_bytes;
6505 	}
6506 	spin_unlock(&cache->lock);
6507 	spin_unlock(&cache->space_info->lock);
6508 
6509 	trace_btrfs_space_reservation(fs_info, "pinned",
6510 				      cache->space_info->flags, num_bytes, 1);
6511 	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6512 		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6513 	set_extent_dirty(fs_info->pinned_extents, bytenr,
6514 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6515 	return 0;
6516 }
6517 
6518 /*
6519  * this function must be called within transaction
6520  */
6521 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6522 		     u64 bytenr, u64 num_bytes, int reserved)
6523 {
6524 	struct btrfs_block_group_cache *cache;
6525 
6526 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6527 	BUG_ON(!cache); /* Logic error */
6528 
6529 	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6530 
6531 	btrfs_put_block_group(cache);
6532 	return 0;
6533 }
6534 
6535 /*
6536  * this function must be called within transaction
6537  */
6538 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6539 				    u64 bytenr, u64 num_bytes)
6540 {
6541 	struct btrfs_block_group_cache *cache;
6542 	int ret;
6543 
6544 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6545 	if (!cache)
6546 		return -EINVAL;
6547 
6548 	/*
6549 	 * pull in the free space cache (if any) so that our pin
6550 	 * removes the free space from the cache.  We have load_only set
6551 	 * to one because the slow code to read in the free extents does check
6552 	 * the pinned extents.
6553 	 */
6554 	cache_block_group(cache, 1);
6555 
6556 	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6557 
6558 	/* remove us from the free space cache (if we're there at all) */
6559 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6560 	btrfs_put_block_group(cache);
6561 	return ret;
6562 }
6563 
6564 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6565 				   u64 start, u64 num_bytes)
6566 {
6567 	int ret;
6568 	struct btrfs_block_group_cache *block_group;
6569 	struct btrfs_caching_control *caching_ctl;
6570 
6571 	block_group = btrfs_lookup_block_group(fs_info, start);
6572 	if (!block_group)
6573 		return -EINVAL;
6574 
6575 	cache_block_group(block_group, 0);
6576 	caching_ctl = get_caching_control(block_group);
6577 
6578 	if (!caching_ctl) {
6579 		/* Logic error */
6580 		BUG_ON(!block_group_cache_done(block_group));
6581 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6582 	} else {
6583 		mutex_lock(&caching_ctl->mutex);
6584 
6585 		if (start >= caching_ctl->progress) {
6586 			ret = add_excluded_extent(fs_info, start, num_bytes);
6587 		} else if (start + num_bytes <= caching_ctl->progress) {
6588 			ret = btrfs_remove_free_space(block_group,
6589 						      start, num_bytes);
6590 		} else {
6591 			num_bytes = caching_ctl->progress - start;
6592 			ret = btrfs_remove_free_space(block_group,
6593 						      start, num_bytes);
6594 			if (ret)
6595 				goto out_lock;
6596 
6597 			num_bytes = (start + num_bytes) -
6598 				caching_ctl->progress;
6599 			start = caching_ctl->progress;
6600 			ret = add_excluded_extent(fs_info, start, num_bytes);
6601 		}
6602 out_lock:
6603 		mutex_unlock(&caching_ctl->mutex);
6604 		put_caching_control(caching_ctl);
6605 	}
6606 	btrfs_put_block_group(block_group);
6607 	return ret;
6608 }
6609 
6610 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6611 				 struct extent_buffer *eb)
6612 {
6613 	struct btrfs_file_extent_item *item;
6614 	struct btrfs_key key;
6615 	int found_type;
6616 	int i;
6617 	int ret = 0;
6618 
6619 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6620 		return 0;
6621 
6622 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
6623 		btrfs_item_key_to_cpu(eb, &key, i);
6624 		if (key.type != BTRFS_EXTENT_DATA_KEY)
6625 			continue;
6626 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6627 		found_type = btrfs_file_extent_type(eb, item);
6628 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
6629 			continue;
6630 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6631 			continue;
6632 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6633 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6634 		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6635 		if (ret)
6636 			break;
6637 	}
6638 
6639 	return ret;
6640 }
6641 
6642 static void
6643 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6644 {
6645 	atomic_inc(&bg->reservations);
6646 }
6647 
6648 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6649 					const u64 start)
6650 {
6651 	struct btrfs_block_group_cache *bg;
6652 
6653 	bg = btrfs_lookup_block_group(fs_info, start);
6654 	ASSERT(bg);
6655 	if (atomic_dec_and_test(&bg->reservations))
6656 		wake_up_var(&bg->reservations);
6657 	btrfs_put_block_group(bg);
6658 }
6659 
6660 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6661 {
6662 	struct btrfs_space_info *space_info = bg->space_info;
6663 
6664 	ASSERT(bg->ro);
6665 
6666 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6667 		return;
6668 
6669 	/*
6670 	 * Our block group is read only but before we set it to read only,
6671 	 * some task might have allocated an extent from it already, but it
6672 	 * has not yet created a respective ordered extent (and added it to a
6673 	 * root's list of ordered extents).
6674 	 * Therefore wait for any task currently allocating extents, since the
6675 	 * block group's reservations counter is incremented while a read lock
6676 	 * on the groups' semaphore is held and decremented after releasing
6677 	 * the read access on that semaphore and creating the ordered extent.
6678 	 */
6679 	down_write(&space_info->groups_sem);
6680 	up_write(&space_info->groups_sem);
6681 
6682 	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6683 }
6684 
6685 /**
6686  * btrfs_add_reserved_bytes - update the block_group and space info counters
6687  * @cache:	The cache we are manipulating
6688  * @ram_bytes:  The number of bytes of file content, and will be the same as
6689  *              @num_bytes except for the compress path.
6690  * @num_bytes:	The number of bytes in question
6691  * @delalloc:   The blocks are allocated for the delalloc write
6692  *
6693  * This is called by the allocator when it reserves space. If this is a
6694  * reservation and the block group has become read only we cannot make the
6695  * reservation and return -EAGAIN, otherwise this function always succeeds.
6696  */
6697 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6698 				    u64 ram_bytes, u64 num_bytes, int delalloc)
6699 {
6700 	struct btrfs_space_info *space_info = cache->space_info;
6701 	int ret = 0;
6702 
6703 	spin_lock(&space_info->lock);
6704 	spin_lock(&cache->lock);
6705 	if (cache->ro) {
6706 		ret = -EAGAIN;
6707 	} else {
6708 		cache->reserved += num_bytes;
6709 		space_info->bytes_reserved += num_bytes;
6710 		update_bytes_may_use(space_info, -ram_bytes);
6711 		if (delalloc)
6712 			cache->delalloc_bytes += num_bytes;
6713 	}
6714 	spin_unlock(&cache->lock);
6715 	spin_unlock(&space_info->lock);
6716 	return ret;
6717 }
6718 
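/*
 * btrfs_free_reserved_bytes() below is the counterpart that undoes this
 * accounting when reserved space is handed back without ever being written.
 * A sketch of the pairing from an allocator style caller (names are
 * hypothetical):
 *
 *	ret = btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes, 0);
 *	if (ret == -EAGAIN)
 *		...the block group went read only, try another one...
 *	...later, if the allocation is abandoned...
 *	btrfs_free_reserved_bytes(cache, num_bytes, 0);
 */
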
6719 /**
6720  * btrfs_free_reserved_bytes - update the block_group and space info counters
6721  * @cache:      The cache we are manipulating
6722  * @num_bytes:  The number of bytes in question
6723  * @delalloc:   The blocks are allocated for the delalloc write
6724  *
6725  * This is called by somebody who is freeing space that was never actually used
6726  * on disk.  For example if you reserve some space for a new leaf in transaction
6727  * A and before transaction A commits you free that leaf, you call this to
6728  * clear the reservation.
6729  */
6730 
6731 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6732 				      u64 num_bytes, int delalloc)
6733 {
6734 	struct btrfs_space_info *space_info = cache->space_info;
6735 
6736 	spin_lock(&space_info->lock);
6737 	spin_lock(&cache->lock);
6738 	if (cache->ro)
6739 		space_info->bytes_readonly += num_bytes;
6740 	cache->reserved -= num_bytes;
6741 	space_info->bytes_reserved -= num_bytes;
6742 	space_info->max_extent_size = 0;
6743 
6744 	if (delalloc)
6745 		cache->delalloc_bytes -= num_bytes;
6746 	spin_unlock(&cache->lock);
6747 	spin_unlock(&space_info->lock);
6748 }
6749 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6750 {
6751 	struct btrfs_caching_control *next;
6752 	struct btrfs_caching_control *caching_ctl;
6753 	struct btrfs_block_group_cache *cache;
6754 
6755 	down_write(&fs_info->commit_root_sem);
6756 
6757 	list_for_each_entry_safe(caching_ctl, next,
6758 				 &fs_info->caching_block_groups, list) {
6759 		cache = caching_ctl->block_group;
6760 		if (block_group_cache_done(cache)) {
6761 			cache->last_byte_to_unpin = (u64)-1;
6762 			list_del_init(&caching_ctl->list);
6763 			put_caching_control(caching_ctl);
6764 		} else {
6765 			cache->last_byte_to_unpin = caching_ctl->progress;
6766 		}
6767 	}
6768 
6769 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6770 		fs_info->pinned_extents = &fs_info->freed_extents[1];
6771 	else
6772 		fs_info->pinned_extents = &fs_info->freed_extents[0];
6773 
6774 	up_write(&fs_info->commit_root_sem);
6775 
6776 	update_global_block_rsv(fs_info);
6777 }
6778 
6779 /*
6780  * Returns the free cluster for the given space info and sets empty_cluster to
6781  * what it should be based on the mount options.
6782  */
6783 static struct btrfs_free_cluster *
6784 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6785 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
6786 {
6787 	struct btrfs_free_cluster *ret = NULL;
6788 
6789 	*empty_cluster = 0;
6790 	if (btrfs_mixed_space_info(space_info))
6791 		return ret;
6792 
6793 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6794 		ret = &fs_info->meta_alloc_cluster;
6795 		if (btrfs_test_opt(fs_info, SSD))
6796 			*empty_cluster = SZ_2M;
6797 		else
6798 			*empty_cluster = SZ_64K;
6799 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6800 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
6801 		*empty_cluster = SZ_2M;
6802 		ret = &fs_info->data_alloc_cluster;
6803 	}
6804 
6805 	return ret;
6806 }
6807 
6808 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6809 			      u64 start, u64 end,
6810 			      const bool return_free_space)
6811 {
6812 	struct btrfs_block_group_cache *cache = NULL;
6813 	struct btrfs_space_info *space_info;
6814 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6815 	struct btrfs_free_cluster *cluster = NULL;
6816 	u64 len;
6817 	u64 total_unpinned = 0;
6818 	u64 empty_cluster = 0;
6819 	bool readonly;
6820 
6821 	while (start <= end) {
6822 		readonly = false;
6823 		if (!cache ||
6824 		    start >= cache->key.objectid + cache->key.offset) {
6825 			if (cache)
6826 				btrfs_put_block_group(cache);
6827 			total_unpinned = 0;
6828 			cache = btrfs_lookup_block_group(fs_info, start);
6829 			BUG_ON(!cache); /* Logic error */
6830 
6831 			cluster = fetch_cluster_info(fs_info,
6832 						     cache->space_info,
6833 						     &empty_cluster);
6834 			empty_cluster <<= 1;
6835 		}
6836 
6837 		len = cache->key.objectid + cache->key.offset - start;
6838 		len = min(len, end + 1 - start);
6839 
6840 		if (start < cache->last_byte_to_unpin) {
6841 			len = min(len, cache->last_byte_to_unpin - start);
6842 			if (return_free_space)
6843 				btrfs_add_free_space(cache, start, len);
6844 		}
6845 
6846 		start += len;
6847 		total_unpinned += len;
6848 		space_info = cache->space_info;
6849 
6850 		/*
6851 		 * If this space cluster has been marked as fragmented and we've
6852 		 * unpinned enough in this block group to potentially allow a
6853 		 * cluster to be created inside of it go ahead and clear the
6854 		 * fragmented check.
6855 		 */
6856 		if (cluster && cluster->fragmented &&
6857 		    total_unpinned > empty_cluster) {
6858 			spin_lock(&cluster->lock);
6859 			cluster->fragmented = 0;
6860 			spin_unlock(&cluster->lock);
6861 		}
6862 
6863 		spin_lock(&space_info->lock);
6864 		spin_lock(&cache->lock);
6865 		cache->pinned -= len;
6866 		update_bytes_pinned(space_info, -len);
6867 
6868 		trace_btrfs_space_reservation(fs_info, "pinned",
6869 					      space_info->flags, len, 0);
6870 		space_info->max_extent_size = 0;
6871 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
6872 			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6873 		if (cache->ro) {
6874 			space_info->bytes_readonly += len;
6875 			readonly = true;
6876 		}
6877 		spin_unlock(&cache->lock);
6878 		if (!readonly && return_free_space &&
6879 		    global_rsv->space_info == space_info) {
6880 			u64 to_add = len;
6881 
6882 			spin_lock(&global_rsv->lock);
6883 			if (!global_rsv->full) {
6884 				to_add = min(len, global_rsv->size -
6885 					     global_rsv->reserved);
6886 				global_rsv->reserved += to_add;
6887 				update_bytes_may_use(space_info, to_add);
6888 				if (global_rsv->reserved >= global_rsv->size)
6889 					global_rsv->full = 1;
6890 				trace_btrfs_space_reservation(fs_info,
6891 							      "space_info",
6892 							      space_info->flags,
6893 							      to_add, 1);
6894 				len -= to_add;
6895 			}
6896 			spin_unlock(&global_rsv->lock);
6897 			/* Add to any tickets we may have */
6898 			if (len)
6899 				space_info_add_new_bytes(fs_info, space_info,
6900 							 len);
6901 		}
6902 		spin_unlock(&space_info->lock);
6903 	}
6904 
6905 	if (cache)
6906 		btrfs_put_block_group(cache);
6907 	return 0;
6908 }
6909 
6910 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6911 {
6912 	struct btrfs_fs_info *fs_info = trans->fs_info;
6913 	struct btrfs_block_group_cache *block_group, *tmp;
6914 	struct list_head *deleted_bgs;
6915 	struct extent_io_tree *unpin;
6916 	u64 start;
6917 	u64 end;
6918 	int ret;
6919 
6920 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6921 		unpin = &fs_info->freed_extents[1];
6922 	else
6923 		unpin = &fs_info->freed_extents[0];
6924 
6925 	while (!trans->aborted) {
6926 		struct extent_state *cached_state = NULL;
6927 
6928 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
6929 		ret = find_first_extent_bit(unpin, 0, &start, &end,
6930 					    EXTENT_DIRTY, &cached_state);
6931 		if (ret) {
6932 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6933 			break;
6934 		}
6935 
6936 		if (btrfs_test_opt(fs_info, DISCARD))
6937 			ret = btrfs_discard_extent(fs_info, start,
6938 						   end + 1 - start, NULL);
6939 
6940 		clear_extent_dirty(unpin, start, end, &cached_state);
6941 		unpin_extent_range(fs_info, start, end, true);
6942 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6943 		free_extent_state(cached_state);
6944 		cond_resched();
6945 	}
6946 
6947 	/*
6948 	 * Transaction is finished.  We don't need the lock anymore.  We
6949 	 * do need to clean up the block groups in case of a transaction
6950 	 * abort.
6951 	 */
6952 	deleted_bgs = &trans->transaction->deleted_bgs;
6953 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6954 		u64 trimmed = 0;
6955 
6956 		ret = -EROFS;
6957 		if (!trans->aborted)
6958 			ret = btrfs_discard_extent(fs_info,
6959 						   block_group->key.objectid,
6960 						   block_group->key.offset,
6961 						   &trimmed);
6962 
6963 		list_del_init(&block_group->bg_list);
6964 		btrfs_put_block_group_trimming(block_group);
6965 		btrfs_put_block_group(block_group);
6966 
6967 		if (ret) {
6968 			const char *errstr = btrfs_decode_error(ret);
6969 			btrfs_warn(fs_info,
6970 			   "discard failed while removing blockgroup: errno=%d %s",
6971 				   ret, errstr);
6972 		}
6973 	}
6974 
6975 	return 0;
6976 }
6977 
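/*
 * Drop @refs_to_drop references from the extent described by @node.  This
 * looks up the extent item and its backref, decrements or removes them and,
 * once the last reference is gone, also deletes the extent item, removes the
 * csums (for data), and updates the free space tree and block group counters.
 */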
6978 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6979 			       struct btrfs_delayed_ref_node *node, u64 parent,
6980 			       u64 root_objectid, u64 owner_objectid,
6981 			       u64 owner_offset, int refs_to_drop,
6982 			       struct btrfs_delayed_extent_op *extent_op)
6983 {
6984 	struct btrfs_fs_info *info = trans->fs_info;
6985 	struct btrfs_key key;
6986 	struct btrfs_path *path;
6987 	struct btrfs_root *extent_root = info->extent_root;
6988 	struct extent_buffer *leaf;
6989 	struct btrfs_extent_item *ei;
6990 	struct btrfs_extent_inline_ref *iref;
6991 	int ret;
6992 	int is_data;
6993 	int extent_slot = 0;
6994 	int found_extent = 0;
6995 	int num_to_del = 1;
6996 	u32 item_size;
6997 	u64 refs;
6998 	u64 bytenr = node->bytenr;
6999 	u64 num_bytes = node->num_bytes;
7000 	int last_ref = 0;
7001 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
7002 
7003 	path = btrfs_alloc_path();
7004 	if (!path)
7005 		return -ENOMEM;
7006 
7007 	path->reada = READA_FORWARD;
7008 	path->leave_spinning = 1;
7009 
7010 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
7011 	BUG_ON(!is_data && refs_to_drop != 1);
7012 
7013 	if (is_data)
7014 		skinny_metadata = false;
7015 
7016 	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
7017 				    parent, root_objectid, owner_objectid,
7018 				    owner_offset);
7019 	if (ret == 0) {
7020 		extent_slot = path->slots[0];
7021 		while (extent_slot >= 0) {
7022 			btrfs_item_key_to_cpu(path->nodes[0], &key,
7023 					      extent_slot);
7024 			if (key.objectid != bytenr)
7025 				break;
7026 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
7027 			    key.offset == num_bytes) {
7028 				found_extent = 1;
7029 				break;
7030 			}
7031 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
7032 			    key.offset == owner_objectid) {
7033 				found_extent = 1;
7034 				break;
7035 			}
7036 			if (path->slots[0] - extent_slot > 5)
7037 				break;
7038 			extent_slot--;
7039 		}
7040 
7041 		if (!found_extent) {
7042 			BUG_ON(iref);
7043 			ret = remove_extent_backref(trans, path, NULL,
7044 						    refs_to_drop,
7045 						    is_data, &last_ref);
7046 			if (ret) {
7047 				btrfs_abort_transaction(trans, ret);
7048 				goto out;
7049 			}
7050 			btrfs_release_path(path);
7051 			path->leave_spinning = 1;
7052 
7053 			key.objectid = bytenr;
7054 			key.type = BTRFS_EXTENT_ITEM_KEY;
7055 			key.offset = num_bytes;
7056 
7057 			if (!is_data && skinny_metadata) {
7058 				key.type = BTRFS_METADATA_ITEM_KEY;
7059 				key.offset = owner_objectid;
7060 			}
7061 
7062 			ret = btrfs_search_slot(trans, extent_root,
7063 						&key, path, -1, 1);
7064 			if (ret > 0 && skinny_metadata && path->slots[0]) {
7065 				/*
7066 				 * Couldn't find our skinny metadata item,
7067 				 * see if we have ye olde extent item.
7068 				 */
7069 				path->slots[0]--;
7070 				btrfs_item_key_to_cpu(path->nodes[0], &key,
7071 						      path->slots[0]);
7072 				if (key.objectid == bytenr &&
7073 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
7074 				    key.offset == num_bytes)
7075 					ret = 0;
7076 			}
7077 
7078 			if (ret > 0 && skinny_metadata) {
7079 				skinny_metadata = false;
7080 				key.objectid = bytenr;
7081 				key.type = BTRFS_EXTENT_ITEM_KEY;
7082 				key.offset = num_bytes;
7083 				btrfs_release_path(path);
7084 				ret = btrfs_search_slot(trans, extent_root,
7085 							&key, path, -1, 1);
7086 			}
7087 
7088 			if (ret) {
7089 				btrfs_err(info,
7090 					  "umm, got %d back from search, was looking for %llu",
7091 					  ret, bytenr);
7092 				if (ret > 0)
7093 					btrfs_print_leaf(path->nodes[0]);
7094 			}
7095 			if (ret < 0) {
7096 				btrfs_abort_transaction(trans, ret);
7097 				goto out;
7098 			}
7099 			extent_slot = path->slots[0];
7100 		}
7101 	} else if (WARN_ON(ret == -ENOENT)) {
7102 		btrfs_print_leaf(path->nodes[0]);
7103 		btrfs_err(info,
7104 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
7105 			bytenr, parent, root_objectid, owner_objectid,
7106 			owner_offset);
7107 		btrfs_abort_transaction(trans, ret);
7108 		goto out;
7109 	} else {
7110 		btrfs_abort_transaction(trans, ret);
7111 		goto out;
7112 	}
7113 
7114 	leaf = path->nodes[0];
7115 	item_size = btrfs_item_size_nr(leaf, extent_slot);
7116 	if (unlikely(item_size < sizeof(*ei))) {
7117 		ret = -EINVAL;
7118 		btrfs_print_v0_err(info);
7119 		btrfs_abort_transaction(trans, ret);
7120 		goto out;
7121 	}
7122 	ei = btrfs_item_ptr(leaf, extent_slot,
7123 			    struct btrfs_extent_item);
7124 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7125 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
7126 		struct btrfs_tree_block_info *bi;
7127 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7128 		bi = (struct btrfs_tree_block_info *)(ei + 1);
7129 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7130 	}
7131 
7132 	refs = btrfs_extent_refs(leaf, ei);
7133 	if (refs < refs_to_drop) {
7134 		btrfs_err(info,
7135 			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7136 			  refs_to_drop, refs, bytenr);
7137 		ret = -EINVAL;
7138 		btrfs_abort_transaction(trans, ret);
7139 		goto out;
7140 	}
7141 	refs -= refs_to_drop;
7142 
7143 	if (refs > 0) {
7144 		if (extent_op)
7145 			__run_delayed_extent_op(extent_op, leaf, ei);
7146 		/*
7147 		 * In the case of an inline back ref, the reference count will
7148 		 * be updated by remove_extent_backref()
7149 		 */
7150 		if (iref) {
7151 			BUG_ON(!found_extent);
7152 		} else {
7153 			btrfs_set_extent_refs(leaf, ei, refs);
7154 			btrfs_mark_buffer_dirty(leaf);
7155 		}
7156 		if (found_extent) {
7157 			ret = remove_extent_backref(trans, path, iref,
7158 						    refs_to_drop, is_data,
7159 						    &last_ref);
7160 			if (ret) {
7161 				btrfs_abort_transaction(trans, ret);
7162 				goto out;
7163 			}
7164 		}
7165 	} else {
7166 		if (found_extent) {
7167 			BUG_ON(is_data && refs_to_drop !=
7168 			       extent_data_ref_count(path, iref));
7169 			if (iref) {
7170 				BUG_ON(path->slots[0] != extent_slot);
7171 			} else {
7172 				BUG_ON(path->slots[0] != extent_slot + 1);
7173 				path->slots[0] = extent_slot;
7174 				num_to_del = 2;
7175 			}
7176 		}
7177 
7178 		last_ref = 1;
7179 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7180 				      num_to_del);
7181 		if (ret) {
7182 			btrfs_abort_transaction(trans, ret);
7183 			goto out;
7184 		}
7185 		btrfs_release_path(path);
7186 
7187 		if (is_data) {
7188 			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7189 			if (ret) {
7190 				btrfs_abort_transaction(trans, ret);
7191 				goto out;
7192 			}
7193 		}
7194 
7195 		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
7196 		if (ret) {
7197 			btrfs_abort_transaction(trans, ret);
7198 			goto out;
7199 		}
7200 
7201 		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7202 		if (ret) {
7203 			btrfs_abort_transaction(trans, ret);
7204 			goto out;
7205 		}
7206 	}
7207 	btrfs_release_path(path);
7208 
7209 out:
7210 	btrfs_free_path(path);
7211 	return ret;
7212 }
7213 
7214 /*
7215  * when we free a block, it is possible (and likely) that we free the last
7216  * delayed ref for that extent as well.  This searches the delayed ref tree for
7217  * a given extent, and if there are no other delayed refs to be processed, it
7218  * removes it from the tree.
7219  */
7220 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7221 				      u64 bytenr)
7222 {
7223 	struct btrfs_delayed_ref_head *head;
7224 	struct btrfs_delayed_ref_root *delayed_refs;
7225 	int ret = 0;
7226 
7227 	delayed_refs = &trans->transaction->delayed_refs;
7228 	spin_lock(&delayed_refs->lock);
7229 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7230 	if (!head)
7231 		goto out_delayed_unlock;
7232 
7233 	spin_lock(&head->lock);
7234 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
7235 		goto out;
7236 
7237 	if (cleanup_extent_op(head) != NULL)
7238 		goto out;
7239 
7240 	/*
7241 	 * Waiting for the lock here would deadlock.  If someone else has it
7242 	 * locked, they are already in the process of dropping it anyway.
7243 	 */
7244 	if (!mutex_trylock(&head->mutex))
7245 		goto out;
7246 
7247 	btrfs_delete_ref_head(delayed_refs, head);
7248 	head->processing = 0;
7249 
7250 	spin_unlock(&head->lock);
7251 	spin_unlock(&delayed_refs->lock);
7252 
7253 	BUG_ON(head->extent_op);
7254 	if (head->must_insert_reserved)
7255 		ret = 1;
7256 
7257 	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
7258 	mutex_unlock(&head->mutex);
7259 	btrfs_put_delayed_ref_head(head);
7260 	return ret;
7261 out:
7262 	spin_unlock(&head->lock);
7263 
7264 out_delayed_unlock:
7265 	spin_unlock(&delayed_refs->lock);
7266 	return 0;
7267 }
7268 
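/*
 * Free a tree block.  Outside of the log tree this queues a delayed ref
 * drop; when the block was allocated in the current transaction and this was
 * the last reference, the block can be handed straight back to the free
 * space instead of staying pinned until the transaction commits.
 */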
7269 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7270 			   struct btrfs_root *root,
7271 			   struct extent_buffer *buf,
7272 			   u64 parent, int last_ref)
7273 {
7274 	struct btrfs_fs_info *fs_info = root->fs_info;
7275 	int pin = 1;
7276 	int ret;
7277 
7278 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7279 		int old_ref_mod, new_ref_mod;
7280 
7281 		btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7282 				   root->root_key.objectid,
7283 				   btrfs_header_level(buf), 0,
7284 				   BTRFS_DROP_DELAYED_REF);
7285 		ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7286 						 buf->len, parent,
7287 						 root->root_key.objectid,
7288 						 btrfs_header_level(buf),
7289 						 BTRFS_DROP_DELAYED_REF, NULL,
7290 						 &old_ref_mod, &new_ref_mod);
7291 		BUG_ON(ret); /* -ENOMEM */
7292 		pin = old_ref_mod >= 0 && new_ref_mod < 0;
7293 	}
7294 
7295 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7296 		struct btrfs_block_group_cache *cache;
7297 
7298 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7299 			ret = check_ref_cleanup(trans, buf->start);
7300 			if (!ret)
7301 				goto out;
7302 		}
7303 
7304 		pin = 0;
7305 		cache = btrfs_lookup_block_group(fs_info, buf->start);
7306 
7307 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7308 			pin_down_extent(fs_info, cache, buf->start,
7309 					buf->len, 1);
7310 			btrfs_put_block_group(cache);
7311 			goto out;
7312 		}
7313 
7314 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7315 
7316 		btrfs_add_free_space(cache, buf->start, buf->len);
7317 		btrfs_free_reserved_bytes(cache, buf->len, 0);
7318 		btrfs_put_block_group(cache);
7319 		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7320 	}
7321 out:
7322 	if (pin)
7323 		add_pinned_bytes(fs_info, buf->len, true,
7324 				 root->root_key.objectid);
7325 
7326 	if (last_ref) {
7327 		/*
7328 		 * Deleting the buffer, clear the corrupt flag since it doesn't
7329 		 * matter anymore.
7330 		 */
7331 		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7332 	}
7333 }
7334 
7335 /* Can return -ENOMEM */
7336 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7337 		      struct btrfs_root *root,
7338 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7339 		      u64 owner, u64 offset)
7340 {
7341 	struct btrfs_fs_info *fs_info = root->fs_info;
7342 	int old_ref_mod, new_ref_mod;
7343 	int ret;
7344 
7345 	if (btrfs_is_testing(fs_info))
7346 		return 0;
7347 
7348 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7349 		btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7350 				   root_objectid, owner, offset,
7351 				   BTRFS_DROP_DELAYED_REF);
7352 
7353 	/*
7354 	 * tree log blocks never actually go into the extent allocation
7355 	 * tree, so just update the pinning info and exit early.
7356 	 */
7357 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7358 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7359 		/* unlocks the pinned mutex */
7360 		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7361 		old_ref_mod = new_ref_mod = 0;
7362 		ret = 0;
7363 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7364 		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7365 						 num_bytes, parent,
7366 						 root_objectid, (int)owner,
7367 						 BTRFS_DROP_DELAYED_REF, NULL,
7368 						 &old_ref_mod, &new_ref_mod);
7369 	} else {
7370 		ret = btrfs_add_delayed_data_ref(trans, bytenr,
7371 						 num_bytes, parent,
7372 						 root_objectid, owner, offset,
7373 						 0, BTRFS_DROP_DELAYED_REF,
7374 						 &old_ref_mod, &new_ref_mod);
7375 	}
7376 
7377 	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7378 		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
7379 
7380 		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7381 	}
7382 
7383 	return ret;
7384 }
7385 
7386 /*
7387  * when we wait for progress in the block group caching, it's because
7388  * our allocation attempt failed at least once.  So, we must sleep
7389  * and let some progress happen before we try again.
7390  *
7391  * This function will sleep at least once waiting for new free space to
7392  * show up, and then it will check the block group free space numbers
7393  * for our min num_bytes.  Another option is to have it go ahead
7394  * and look in the rbtree for a free extent of a given size, but this
7395  * is a good start.
7396  *
7397  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7398  * any of the information in this block group.
7399  */
7400 static noinline void
7401 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7402 				u64 num_bytes)
7403 {
7404 	struct btrfs_caching_control *caching_ctl;
7405 
7406 	caching_ctl = get_caching_control(cache);
7407 	if (!caching_ctl)
7408 		return;
7409 
7410 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7411 		   (cache->free_space_ctl->free_space >= num_bytes));
7412 
7413 	put_caching_control(caching_ctl);
7414 }
7415 
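/*
 * Wait until the block group is fully cached.  Returns -EIO if caching
 * ended in BTRFS_CACHE_ERROR, 0 otherwise.
 */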
7416 static noinline int
7417 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7418 {
7419 	struct btrfs_caching_control *caching_ctl;
7420 	int ret = 0;
7421 
7422 	caching_ctl = get_caching_control(cache);
7423 	if (!caching_ctl)
7424 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7425 
7426 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
7427 	if (cache->cached == BTRFS_CACHE_ERROR)
7428 		ret = -EIO;
7429 	put_caching_control(caching_ctl);
7430 	return ret;
7431 }
7432 
7433 enum btrfs_loop_type {
7434 	LOOP_CACHING_NOWAIT = 0,
7435 	LOOP_CACHING_WAIT = 1,
7436 	LOOP_ALLOC_CHUNK = 2,
7437 	LOOP_NO_EMPTY_SIZE = 3,
7438 };
7439 
7440 static inline void
7441 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7442 		       int delalloc)
7443 {
7444 	if (delalloc)
7445 		down_read(&cache->data_rwsem);
7446 }
7447 
7448 static inline void
7449 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7450 		       int delalloc)
7451 {
7452 	btrfs_get_block_group(cache);
7453 	if (delalloc)
7454 		down_read(&cache->data_rwsem);
7455 }
7456 
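/*
 * Grab a reference on the block group currently backing @cluster and, for
 * delalloc allocations, take its data_rwsem as well.  Returns with
 * cluster->refill_lock held; the loop copes with the cluster switching
 * block groups while the lock is dropped to wait for data_rwsem.
 */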
7457 static struct btrfs_block_group_cache *
7458 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7459 		   struct btrfs_free_cluster *cluster,
7460 		   int delalloc)
7461 {
7462 	struct btrfs_block_group_cache *used_bg = NULL;
7463 
7464 	spin_lock(&cluster->refill_lock);
7465 	while (1) {
7466 		used_bg = cluster->block_group;
7467 		if (!used_bg)
7468 			return NULL;
7469 
7470 		if (used_bg == block_group)
7471 			return used_bg;
7472 
7473 		btrfs_get_block_group(used_bg);
7474 
7475 		if (!delalloc)
7476 			return used_bg;
7477 
7478 		if (down_read_trylock(&used_bg->data_rwsem))
7479 			return used_bg;
7480 
7481 		spin_unlock(&cluster->refill_lock);
7482 
7483 		/* We should only have one level of nesting. */
7484 		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7485 
7486 		spin_lock(&cluster->refill_lock);
7487 		if (used_bg == cluster->block_group)
7488 			return used_bg;
7489 
7490 		up_read(&used_bg->data_rwsem);
7491 		btrfs_put_block_group(used_bg);
7492 	}
7493 }
7494 
7495 static inline void
7496 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7497 			 int delalloc)
7498 {
7499 	if (delalloc)
7500 		up_read(&cache->data_rwsem);
7501 	btrfs_put_block_group(cache);
7502 }
7503 
7504 /*
7505  * Structure used internally for find_free_extent() function.  Wraps needed
7506  * parameters.
7507  */
7508 struct find_free_extent_ctl {
7509 	/* Basic allocation info */
7510 	u64 ram_bytes;
7511 	u64 num_bytes;
7512 	u64 empty_size;
7513 	u64 flags;
7514 	int delalloc;
7515 
7516 	/* Where to start the search inside the bg */
7517 	u64 search_start;
7518 
7519 	/* For clustered allocation */
7520 	u64 empty_cluster;
7521 
7522 	bool have_caching_bg;
7523 	bool orig_have_caching_bg;
7524 
7525 	/* RAID index, converted from flags */
7526 	int index;
7527 
7528 	/*
7529 	 * Current loop number, check find_free_extent_update_loop() for details
7530 	 */
7531 	int loop;
7532 
7533 	/*
7534 	 * Whether we're refilling a cluster; if true, re-search the current
7535 	 * block group but don't try to refill the cluster again.
7536 	 */
7537 	bool retry_clustered;
7538 
7539 	/*
7540 	 * Whether we're updating the free space cache; if true, re-search the
7541 	 * current block group but don't try updating the free space cache again.
7542 	 */
7543 	bool retry_unclustered;
7544 
7545 	/* If current block group is cached */
7546 	int cached;
7547 
7548 	/* Max contiguous hole found */
7549 	u64 max_extent_size;
7550 
7551 	/* Total free space from free space cache, not always contiguous */
7552 	u64 total_free_space;
7553 
7554 	/* Found result */
7555 	u64 found_offset;
7556 };
7557 
7558 
7559 /*
7560  * Helper function for find_free_extent().
7561  *
7562  * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
7563  * Return -EAGAIN to inform the caller that we need to re-search this block group.
7564  * Return >0 to inform the caller that we found nothing.
7565  * Return 0 when we have found a location and set ffe_ctl->found_offset.
7566  */
7567 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7568 		struct btrfs_free_cluster *last_ptr,
7569 		struct find_free_extent_ctl *ffe_ctl,
7570 		struct btrfs_block_group_cache **cluster_bg_ret)
7571 {
7572 	struct btrfs_fs_info *fs_info = bg->fs_info;
7573 	struct btrfs_block_group_cache *cluster_bg;
7574 	u64 aligned_cluster;
7575 	u64 offset;
7576 	int ret;
7577 
7578 	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
7579 	if (!cluster_bg)
7580 		goto refill_cluster;
7581 	if (cluster_bg != bg && (cluster_bg->ro ||
7582 	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
7583 		goto release_cluster;
7584 
7585 	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7586 			ffe_ctl->num_bytes, cluster_bg->key.objectid,
7587 			&ffe_ctl->max_extent_size);
7588 	if (offset) {
7589 		/* We have a block, we're done */
7590 		spin_unlock(&last_ptr->refill_lock);
7591 		trace_btrfs_reserve_extent_cluster(cluster_bg,
7592 				ffe_ctl->search_start, ffe_ctl->num_bytes);
7593 		*cluster_bg_ret = cluster_bg;
7594 		ffe_ctl->found_offset = offset;
7595 		return 0;
7596 	}
7597 	WARN_ON(last_ptr->block_group != cluster_bg);
7598 
7599 release_cluster:
7600 	/*
7601 	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
7602 	 * let's just skip it and let the allocator find whatever block it can
7603 	 * find. If we reach this point, we will have tried the cluster
7604 	 * allocator plenty of times and not have found anything, so we are
7605 	 * likely way too fragmented for the clustering stuff to find anything.
7606 	 *
7607 	 * However, if the cluster is taken from the current block group,
7608 	 * release the cluster first, so that we stand a better chance of
7609 	 * succeeding in the unclustered allocation.
7610 	 */
7611 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7612 		spin_unlock(&last_ptr->refill_lock);
7613 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7614 		return -ENOENT;
7615 	}
7616 
7617 	/* This cluster didn't work out, free it and start over */
7618 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
7619 
7620 	if (cluster_bg != bg)
7621 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7622 
7623 refill_cluster:
7624 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7625 		spin_unlock(&last_ptr->refill_lock);
7626 		return -ENOENT;
7627 	}
7628 
7629 	aligned_cluster = max_t(u64,
7630 			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7631 			bg->full_stripe_len);
7632 	ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
7633 			ffe_ctl->search_start, ffe_ctl->num_bytes,
7634 			aligned_cluster);
7635 	if (ret == 0) {
7636 		/* Now pull our allocation out of this cluster */
7637 		offset = btrfs_alloc_from_cluster(bg, last_ptr,
7638 				ffe_ctl->num_bytes, ffe_ctl->search_start,
7639 				&ffe_ctl->max_extent_size);
7640 		if (offset) {
7641 			/* We found one, proceed */
7642 			spin_unlock(&last_ptr->refill_lock);
7643 			trace_btrfs_reserve_extent_cluster(bg,
7644 					ffe_ctl->search_start,
7645 					ffe_ctl->num_bytes);
7646 			ffe_ctl->found_offset = offset;
7647 			return 0;
7648 		}
7649 	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7650 		   !ffe_ctl->retry_clustered) {
7651 		spin_unlock(&last_ptr->refill_lock);
7652 
7653 		ffe_ctl->retry_clustered = true;
7654 		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7655 				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7656 		return -EAGAIN;
7657 	}
7658 	/*
7659 	 * At this point we either didn't find a cluster or we weren't able to
7660 	 * allocate a block from our cluster.  Free the cluster we've been
7661 	 * trying to use, and go to the next block group.
7662 	 */
7663 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
7664 	spin_unlock(&last_ptr->refill_lock);
7665 	return 1;
7666 }
7667 
7668 /*
7669  * Return >0 to inform the caller that we found nothing.
7670  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
7671  * Return -EAGAIN to inform the caller that we need to re-search this block group.
7672  */
7673 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7674 		struct btrfs_free_cluster *last_ptr,
7675 		struct find_free_extent_ctl *ffe_ctl)
7676 {
7677 	u64 offset;
7678 
7679 	/*
7680 	 * We are doing an unclustered allocation; set the fragmented flag so
7681 	 * we don't bother trying to set up a cluster again until we get more
7682 	 * space.
7683 	 */
7684 	if (unlikely(last_ptr)) {
7685 		spin_lock(&last_ptr->lock);
7686 		last_ptr->fragmented = 1;
7687 		spin_unlock(&last_ptr->lock);
7688 	}
7689 	if (ffe_ctl->cached) {
7690 		struct btrfs_free_space_ctl *free_space_ctl;
7691 
7692 		free_space_ctl = bg->free_space_ctl;
7693 		spin_lock(&free_space_ctl->tree_lock);
7694 		if (free_space_ctl->free_space <
7695 		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7696 		    ffe_ctl->empty_size) {
7697 			ffe_ctl->total_free_space = max_t(u64,
7698 					ffe_ctl->total_free_space,
7699 					free_space_ctl->free_space);
7700 			spin_unlock(&free_space_ctl->tree_lock);
7701 			return 1;
7702 		}
7703 		spin_unlock(&free_space_ctl->tree_lock);
7704 	}
7705 
7706 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7707 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
7708 			&ffe_ctl->max_extent_size);
7709 
7710 	/*
7711 	 * If we didn't find a chunk, and we haven't failed on this block group
7712 	 * before, and this block group is in the middle of caching and we are
7713 	 * ok with waiting, then go ahead and wait for progress to be made, and
7714 	 * set @retry_unclustered to true.
7715 	 *
7716 	 * If @retry_unclustered is true then we've already waited on this
7717 	 * block group once and should move on to the next block group.
7718 	 */
7719 	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7720 	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7721 		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7722 						ffe_ctl->empty_size);
7723 		ffe_ctl->retry_unclustered = true;
7724 		return -EAGAIN;
7725 	} else if (!offset) {
7726 		return 1;
7727 	}
7728 	ffe_ctl->found_offset = offset;
7729 	return 0;
7730 }
7731 
7732 /*
7733  * Return >0 means caller needs to re-search for free extent
7734  * Return 0 means we have the needed free extent.
7735  * Return <0 means we failed to locate any free extent.
7736  */
7737 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7738 					struct btrfs_free_cluster *last_ptr,
7739 					struct btrfs_key *ins,
7740 					struct find_free_extent_ctl *ffe_ctl,
7741 					int full_search, bool use_cluster)
7742 {
7743 	struct btrfs_root *root = fs_info->extent_root;
7744 	int ret;
7745 
7746 	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7747 	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7748 		ffe_ctl->orig_have_caching_bg = true;
7749 
7750 	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7751 	    ffe_ctl->have_caching_bg)
7752 		return 1;
7753 
7754 	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7755 		return 1;
7756 
7757 	if (ins->objectid) {
7758 		if (!use_cluster && last_ptr) {
7759 			spin_lock(&last_ptr->lock);
7760 			last_ptr->window_start = ins->objectid;
7761 			spin_unlock(&last_ptr->lock);
7762 		}
7763 		return 0;
7764 	}
7765 
7766 	/*
7767 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7768 	 *			caching kthreads as we move along
7769 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7770 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7771 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7772 	 *		       again
7773 	 */
7774 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7775 		ffe_ctl->index = 0;
7776 		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7777 			/*
7778 			 * We want to skip the LOOP_CACHING_WAIT step if we
7779 			 * don't have any uncached bgs and we've already done a
7780 			 * full search through.
7781 			 */
7782 			if (ffe_ctl->orig_have_caching_bg || !full_search)
7783 				ffe_ctl->loop = LOOP_CACHING_WAIT;
7784 			else
7785 				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7786 		} else {
7787 			ffe_ctl->loop++;
7788 		}
7789 
7790 		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7791 			struct btrfs_trans_handle *trans;
7792 			int exist = 0;
7793 
7794 			trans = current->journal_info;
7795 			if (trans)
7796 				exist = 1;
7797 			else
7798 				trans = btrfs_join_transaction(root);
7799 
7800 			if (IS_ERR(trans)) {
7801 				ret = PTR_ERR(trans);
7802 				return ret;
7803 			}
7804 
7805 			ret = do_chunk_alloc(trans, ffe_ctl->flags,
7806 					     CHUNK_ALLOC_FORCE);
7807 
7808 			/*
7809 			 * If we can't allocate a new chunk we've already looped
7810 			 * through at least once, so move on to the NO_EMPTY_SIZE
7811 			 * case.
7812 			 */
7813 			if (ret == -ENOSPC)
7814 				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7815 
7816 			/* Do not bail out on ENOSPC since we can do more. */
7817 			if (ret < 0 && ret != -ENOSPC)
7818 				btrfs_abort_transaction(trans, ret);
7819 			else
7820 				ret = 0;
7821 			if (!exist)
7822 				btrfs_end_transaction(trans);
7823 			if (ret)
7824 				return ret;
7825 		}
7826 
7827 		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7828 			/*
7829 			 * Don't loop again if we already have no empty_size and
7830 			 * no empty_cluster.
7831 			 */
7832 			if (ffe_ctl->empty_size == 0 &&
7833 			    ffe_ctl->empty_cluster == 0)
7834 				return -ENOSPC;
7835 			ffe_ctl->empty_size = 0;
7836 			ffe_ctl->empty_cluster = 0;
7837 		}
7838 		return 1;
7839 	}
7840 	return -ENOSPC;
7841 }
7842 
7843 /*
7844  * walks the btree of allocated extents and finds a hole of a given size.
7845  * The key ins is changed to record the hole:
7846  * ins->objectid == start position
7847  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7848  * ins->offset == the size of the hole.
7849  * Any available blocks before search_start are skipped.
7850  *
7851  * If there is no suitable free space, we will record the size of the
7852  * largest free space extent found during the search.
7853  *
7854  * The overall logic and call chain:
7855  *
7856  * find_free_extent()
7857  * |- Iterate through all block groups
7858  * |  |- Get a valid block group
7859  * |  |- Try to do clustered allocation in that block group
7860  * |  |- Try to do unclustered allocation in that block group
7861  * |  |- Check if the result is valid
7862  * |  |  |- If valid, then exit
7863  * |  |- Jump to next block group
7864  * |
7865  * |- Push harder to find free extents
7866  *    |- If not found, re-iterate all block groups
7867  */
7868 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7869 				u64 ram_bytes, u64 num_bytes, u64 empty_size,
7870 				u64 hint_byte, struct btrfs_key *ins,
7871 				u64 flags, int delalloc)
7872 {
7873 	int ret = 0;
7874 	struct btrfs_free_cluster *last_ptr = NULL;
7875 	struct btrfs_block_group_cache *block_group = NULL;
7876 	struct find_free_extent_ctl ffe_ctl = {0};
7877 	struct btrfs_space_info *space_info;
7878 	bool use_cluster = true;
7879 	bool full_search = false;
7880 
7881 	WARN_ON(num_bytes < fs_info->sectorsize);
7882 
7883 	ffe_ctl.ram_bytes = ram_bytes;
7884 	ffe_ctl.num_bytes = num_bytes;
7885 	ffe_ctl.empty_size = empty_size;
7886 	ffe_ctl.flags = flags;
7887 	ffe_ctl.search_start = 0;
7888 	ffe_ctl.retry_clustered = false;
7889 	ffe_ctl.retry_unclustered = false;
7890 	ffe_ctl.delalloc = delalloc;
7891 	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7892 	ffe_ctl.have_caching_bg = false;
7893 	ffe_ctl.orig_have_caching_bg = false;
7894 	ffe_ctl.found_offset = 0;
7895 
7896 	ins->type = BTRFS_EXTENT_ITEM_KEY;
7897 	ins->objectid = 0;
7898 	ins->offset = 0;
7899 
7900 	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7901 
7902 	space_info = __find_space_info(fs_info, flags);
7903 	if (!space_info) {
7904 		btrfs_err(fs_info, "No space info for %llu", flags);
7905 		return -ENOSPC;
7906 	}
7907 
7908 	/*
7909 	 * If our free space is heavily fragmented we may not be able to make
7910 	 * big contiguous allocations, so instead of doing the expensive search
7911 	 * for free space, simply return ENOSPC with our max_extent_size so we
7912 	 * can go ahead and search for a more manageable chunk.
7913 	 *
7914 	 * If our max_extent_size is large enough for our allocation simply
7915 	 * disable clustering since we will likely not be able to find enough
7916 	 * space to create a cluster and induce latency trying.
7917 	 */
7918 	if (unlikely(space_info->max_extent_size)) {
7919 		spin_lock(&space_info->lock);
7920 		if (space_info->max_extent_size &&
7921 		    num_bytes > space_info->max_extent_size) {
7922 			ins->offset = space_info->max_extent_size;
7923 			spin_unlock(&space_info->lock);
7924 			return -ENOSPC;
7925 		} else if (space_info->max_extent_size) {
7926 			use_cluster = false;
7927 		}
7928 		spin_unlock(&space_info->lock);
7929 	}
7930 
7931 	last_ptr = fetch_cluster_info(fs_info, space_info,
7932 				      &ffe_ctl.empty_cluster);
7933 	if (last_ptr) {
7934 		spin_lock(&last_ptr->lock);
7935 		if (last_ptr->block_group)
7936 			hint_byte = last_ptr->window_start;
7937 		if (last_ptr->fragmented) {
7938 			/*
7939 			 * We still set window_start so we can keep track of the
7940 			 * last place we found an allocation to try and save
7941 			 * some time.
7942 			 */
7943 			hint_byte = last_ptr->window_start;
7944 			use_cluster = false;
7945 		}
7946 		spin_unlock(&last_ptr->lock);
7947 	}
7948 
7949 	ffe_ctl.search_start = max(ffe_ctl.search_start,
7950 				   first_logical_byte(fs_info, 0));
7951 	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
7952 	if (ffe_ctl.search_start == hint_byte) {
7953 		block_group = btrfs_lookup_block_group(fs_info,
7954 						       ffe_ctl.search_start);
7955 		/*
7956 		 * we don't want to use the block group if it doesn't match our
7957 		 * allocation bits, or if it's not cached.
7958 		 *
7959 		 * However if we are re-searching with an ideal block group
7960 		 * picked out then we don't care that the block group is cached.
7961 		 */
7962 		if (block_group && block_group_bits(block_group, flags) &&
7963 		    block_group->cached != BTRFS_CACHE_NO) {
7964 			down_read(&space_info->groups_sem);
7965 			if (list_empty(&block_group->list) ||
7966 			    block_group->ro) {
7967 				/*
7968 				 * Someone is removing this block group, so
7969 				 * we can't jump to the have_block_group
7970 				 * label because our list pointers are not
7971 				 * valid.
7972 				 */
7973 				btrfs_put_block_group(block_group);
7974 				up_read(&space_info->groups_sem);
7975 			} else {
7976 				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7977 						block_group->flags);
7978 				btrfs_lock_block_group(block_group, delalloc);
7979 				goto have_block_group;
7980 			}
7981 		} else if (block_group) {
7982 			btrfs_put_block_group(block_group);
7983 		}
7984 	}
7985 search:
7986 	ffe_ctl.have_caching_bg = false;
7987 	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7988 	    ffe_ctl.index == 0)
7989 		full_search = true;
7990 	down_read(&space_info->groups_sem);
7991 	list_for_each_entry(block_group,
7992 			    &space_info->block_groups[ffe_ctl.index], list) {
7993 		/* If the block group is read-only, we can skip it entirely. */
7994 		if (unlikely(block_group->ro))
7995 			continue;
7996 
7997 		btrfs_grab_block_group(block_group, delalloc);
7998 		ffe_ctl.search_start = block_group->key.objectid;
7999 
8000 		/*
8001 		 * this can happen if we end up cycling through all the
8002 		 * raid types, but we want to make sure we only allocate
8003 		 * for the proper type.
8004 		 */
8005 		if (!block_group_bits(block_group, flags)) {
8006 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
8007 				BTRFS_BLOCK_GROUP_RAID1 |
8008 				BTRFS_BLOCK_GROUP_RAID5 |
8009 				BTRFS_BLOCK_GROUP_RAID6 |
8010 				BTRFS_BLOCK_GROUP_RAID10;
8011 
8012 			/*
8013 			 * if they asked for extra copies and this block group
8014 			 * doesn't provide them, bail.  This does allow us to
8015 			 * fill raid0 from raid1.
8016 			 */
8017 			if ((flags & extra) && !(block_group->flags & extra))
8018 				goto loop;
8019 		}
8020 
8021 have_block_group:
8022 		ffe_ctl.cached = block_group_cache_done(block_group);
8023 		if (unlikely(!ffe_ctl.cached)) {
8024 			ffe_ctl.have_caching_bg = true;
8025 			ret = cache_block_group(block_group, 0);
8026 			BUG_ON(ret < 0);
8027 			ret = 0;
8028 		}
8029 
8030 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
8031 			goto loop;
8032 
8033 		/*
8034 		 * Ok, we want to try and use the cluster allocator, so
8035 		 * let's look there.
8036 		 */
8037 		if (last_ptr && use_cluster) {
8038 			struct btrfs_block_group_cache *cluster_bg = NULL;
8039 
8040 			ret = find_free_extent_clustered(block_group, last_ptr,
8041 							 &ffe_ctl, &cluster_bg);
8042 
8043 			if (ret == 0) {
8044 				if (cluster_bg && cluster_bg != block_group) {
8045 					btrfs_release_block_group(block_group,
8046 								  delalloc);
8047 					block_group = cluster_bg;
8048 				}
8049 				goto checks;
8050 			} else if (ret == -EAGAIN) {
8051 				goto have_block_group;
8052 			} else if (ret > 0) {
8053 				goto loop;
8054 			}
8055 			/* ret == -ENOENT case falls through */
8056 		}
8057 
8058 		ret = find_free_extent_unclustered(block_group, last_ptr,
8059 						   &ffe_ctl);
8060 		if (ret == -EAGAIN)
8061 			goto have_block_group;
8062 		else if (ret > 0)
8063 			goto loop;
8064 		/* ret == 0 case falls through */
8065 checks:
8066 		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
8067 					     fs_info->stripesize);
8068 
8069 		/* move on to the next group */
8070 		if (ffe_ctl.search_start + num_bytes >
8071 		    block_group->key.objectid + block_group->key.offset) {
8072 			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8073 					     num_bytes);
8074 			goto loop;
8075 		}
8076 
8077 		if (ffe_ctl.found_offset < ffe_ctl.search_start)
8078 			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8079 				ffe_ctl.search_start - ffe_ctl.found_offset);
8080 
8081 		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
8082 				num_bytes, delalloc);
8083 		if (ret == -EAGAIN) {
8084 			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8085 					     num_bytes);
8086 			goto loop;
8087 		}
8088 		btrfs_inc_block_group_reservations(block_group);
8089 
8090 		/* we are all good, let's return */
8091 		ins->objectid = ffe_ctl.search_start;
8092 		ins->offset = num_bytes;
8093 
8094 		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
8095 					   num_bytes);
8096 		btrfs_release_block_group(block_group, delalloc);
8097 		break;
8098 loop:
8099 		ffe_ctl.retry_clustered = false;
8100 		ffe_ctl.retry_unclustered = false;
8101 		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
8102 		       ffe_ctl.index);
8103 		btrfs_release_block_group(block_group, delalloc);
8104 		cond_resched();
8105 	}
8106 	up_read(&space_info->groups_sem);
8107 
8108 	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
8109 					   full_search, use_cluster);
8110 	if (ret > 0)
8111 		goto search;
8112 
8113 	if (ret == -ENOSPC) {
8114 		/*
8115 		 * Use ffe_ctl->total_free_space as fallback if we can't find
8116 		 * any contiguous hole.
8117 		 */
8118 		if (!ffe_ctl.max_extent_size)
8119 			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
8120 		spin_lock(&space_info->lock);
8121 		space_info->max_extent_size = ffe_ctl.max_extent_size;
8122 		spin_unlock(&space_info->lock);
8123 		ins->offset = ffe_ctl.max_extent_size;
8124 	}
8125 	return ret;
8126 }
8127 
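/*
 * Helper for dump_space_info(): print the size and reserved bytes of one
 * of the fs_info block reserves.
 */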
8128 #define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
8129 do {									\
8130 	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
8131 	spin_lock(&__rsv->lock);					\
8132 	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
8133 		   __rsv->size, __rsv->reserved);			\
8134 	spin_unlock(&__rsv->lock);					\
8135 } while (0)
8136 
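/*
 * Dump the state of a space_info and the global block reserves for ENOSPC
 * debugging; with @dump_block_groups set, also dump every block group of
 * that space_info together with its free space entries.
 */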
8137 static void dump_space_info(struct btrfs_fs_info *fs_info,
8138 			    struct btrfs_space_info *info, u64 bytes,
8139 			    int dump_block_groups)
8140 {
8141 	struct btrfs_block_group_cache *cache;
8142 	int index = 0;
8143 
8144 	spin_lock(&info->lock);
8145 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
8146 		   info->flags,
8147 		   info->total_bytes - btrfs_space_info_used(info, true),
8148 		   info->full ? "" : "not ");
8149 	btrfs_info(fs_info,
8150 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
8151 		info->total_bytes, info->bytes_used, info->bytes_pinned,
8152 		info->bytes_reserved, info->bytes_may_use,
8153 		info->bytes_readonly);
8154 	spin_unlock(&info->lock);
8155 
8156 	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
8157 	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
8158 	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
8159 	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
8160 	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
8161 
8162 	if (!dump_block_groups)
8163 		return;
8164 
8165 	down_read(&info->groups_sem);
8166 again:
8167 	list_for_each_entry(cache, &info->block_groups[index], list) {
8168 		spin_lock(&cache->lock);
8169 		btrfs_info(fs_info,
8170 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8171 			cache->key.objectid, cache->key.offset,
8172 			btrfs_block_group_used(&cache->item), cache->pinned,
8173 			cache->reserved, cache->ro ? "[readonly]" : "");
8174 		btrfs_dump_free_space(cache, bytes);
8175 		spin_unlock(&cache->lock);
8176 	}
8177 	if (++index < BTRFS_NR_RAID_TYPES)
8178 		goto again;
8179 	up_read(&info->groups_sem);
8180 }
8181 
8182 /*
8183  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
8184  *			  hole that is at least as big as @num_bytes.
8185  *
8186  * @root           -	The root that will contain this extent
8187  *
8188  * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
8189  *			is used for accounting purposes. This value differs
8190  *			from @num_bytes only in the case of compressed extents.
8191  *
8192  * @num_bytes      -	Number of bytes to allocate on-disk.
8193  *
8194  * @min_alloc_size -	Indicates the minimum amount of space that the
8195  *			allocator should try to satisfy. In some cases
8196  *			@num_bytes may be larger than what is required and if
8197  *			the filesystem is fragmented then allocation fails.
8198  *			However, the presence of @min_alloc_size gives a
8199  *			chance to try and satisfy the smaller allocation.
8200  *
8201  * @empty_size     -	A hint that you plan on doing more COW. This is the
8202  *			size in bytes the allocator should try to find free
8203  *			next to the block it returns.  This is just a hint and
8204  *			may be ignored by the allocator.
8205  *
8206  * @hint_byte      -	Hint to the allocator to start searching above the byte
8207  *			address passed. It might be ignored.
8208  *
8209  * @ins            -	This key is modified to record the found hole. It will
8210  *			have the following values:
8211  *			ins->objectid == start position
8212  *			ins->flags = BTRFS_EXTENT_ITEM_KEY
8213  *			ins->offset == the size of the hole.
8214  *
8215  * @is_data        -	Boolean flag indicating whether an extent is
8216  *			allocated for data (true) or metadata (false)
8217  *
8218  * @delalloc       -	Boolean flag indicating whether this allocation is for
8219  *			delalloc or not. If 'true' data_rwsem of block groups
8220  *			is going to be acquired.
8221  *
8222  *
8223  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
8224  * case -ENOSPC is returned then @ins->offset will contain the size of the
8225  * largest available hole the allocator managed to find.
8226  */
8227 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8228 			 u64 num_bytes, u64 min_alloc_size,
8229 			 u64 empty_size, u64 hint_byte,
8230 			 struct btrfs_key *ins, int is_data, int delalloc)
8231 {
8232 	struct btrfs_fs_info *fs_info = root->fs_info;
8233 	bool final_tried = num_bytes == min_alloc_size;
8234 	u64 flags;
8235 	int ret;
8236 
8237 	flags = get_alloc_profile_by_root(root, is_data);
8238 again:
8239 	WARN_ON(num_bytes < fs_info->sectorsize);
8240 	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8241 			       hint_byte, ins, flags, delalloc);
8242 	if (!ret && !is_data) {
8243 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8244 	} else if (ret == -ENOSPC) {
8245 		if (!final_tried && ins->offset) {
8246 			num_bytes = min(num_bytes >> 1, ins->offset);
8247 			num_bytes = round_down(num_bytes,
8248 					       fs_info->sectorsize);
8249 			num_bytes = max(num_bytes, min_alloc_size);
8250 			ram_bytes = num_bytes;
8251 			if (num_bytes == min_alloc_size)
8252 				final_tried = true;
8253 			goto again;
8254 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8255 			struct btrfs_space_info *sinfo;
8256 
8257 			sinfo = __find_space_info(fs_info, flags);
8258 			btrfs_err(fs_info,
8259 				  "allocation failed flags %llu, wanted %llu",
8260 				  flags, num_bytes);
8261 			if (sinfo)
8262 				dump_space_info(fs_info, sinfo, num_bytes, 1);
8263 		}
8264 	}
8265 
8266 	return ret;
8267 }
8268 
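/*
 * Common helper for returning a reserved but unused extent: either pin it
 * down or hand it straight back to the block group's free space, issuing a
 * discard first when the DISCARD mount option is enabled.
 */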
8269 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8270 					u64 start, u64 len,
8271 					int pin, int delalloc)
8272 {
8273 	struct btrfs_block_group_cache *cache;
8274 	int ret = 0;
8275 
8276 	cache = btrfs_lookup_block_group(fs_info, start);
8277 	if (!cache) {
8278 		btrfs_err(fs_info, "Unable to find block group for %llu",
8279 			  start);
8280 		return -ENOSPC;
8281 	}
8282 
8283 	if (pin)
8284 		pin_down_extent(fs_info, cache, start, len, 1);
8285 	else {
8286 		if (btrfs_test_opt(fs_info, DISCARD))
8287 			ret = btrfs_discard_extent(fs_info, start, len, NULL);
8288 		btrfs_add_free_space(cache, start, len);
8289 		btrfs_free_reserved_bytes(cache, len, delalloc);
8290 		trace_btrfs_reserved_extent_free(fs_info, start, len);
8291 	}
8292 
8293 	btrfs_put_block_group(cache);
8294 	return ret;
8295 }
8296 
8297 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8298 			       u64 start, u64 len, int delalloc)
8299 {
8300 	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8301 }
8302 
8303 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8304 				       u64 start, u64 len)
8305 {
8306 	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8307 }
8308 
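/*
 * Insert the extent item for a newly allocated data extent, with an inline
 * shared or regular data backref, then remove the range from the free space
 * tree and update the owning block group's usage counters.
 */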
8309 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8310 				      u64 parent, u64 root_objectid,
8311 				      u64 flags, u64 owner, u64 offset,
8312 				      struct btrfs_key *ins, int ref_mod)
8313 {
8314 	struct btrfs_fs_info *fs_info = trans->fs_info;
8315 	int ret;
8316 	struct btrfs_extent_item *extent_item;
8317 	struct btrfs_extent_inline_ref *iref;
8318 	struct btrfs_path *path;
8319 	struct extent_buffer *leaf;
8320 	int type;
8321 	u32 size;
8322 
8323 	if (parent > 0)
8324 		type = BTRFS_SHARED_DATA_REF_KEY;
8325 	else
8326 		type = BTRFS_EXTENT_DATA_REF_KEY;
8327 
8328 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8329 
8330 	path = btrfs_alloc_path();
8331 	if (!path)
8332 		return -ENOMEM;
8333 
8334 	path->leave_spinning = 1;
8335 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8336 				      ins, size);
8337 	if (ret) {
8338 		btrfs_free_path(path);
8339 		return ret;
8340 	}
8341 
8342 	leaf = path->nodes[0];
8343 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8344 				     struct btrfs_extent_item);
8345 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8346 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8347 	btrfs_set_extent_flags(leaf, extent_item,
8348 			       flags | BTRFS_EXTENT_FLAG_DATA);
8349 
8350 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8351 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
8352 	if (parent > 0) {
8353 		struct btrfs_shared_data_ref *ref;
8354 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
8355 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8356 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8357 	} else {
8358 		struct btrfs_extent_data_ref *ref;
8359 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8360 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8361 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8362 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8363 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8364 	}
8365 
8366 	btrfs_mark_buffer_dirty(path->nodes[0]);
8367 	btrfs_free_path(path);
8368 
8369 	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8370 	if (ret)
8371 		return ret;
8372 
8373 	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8374 	if (ret) { /* -ENOENT, logic error */
8375 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8376 			ins->objectid, ins->offset);
8377 		BUG();
8378 	}
8379 	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8380 	return ret;
8381 }
8382 
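/*
 * Counterpart of alloc_reserved_file_extent() for tree blocks: insert either
 * a skinny metadata item or a full extent item plus tree_block_info, with an
 * inline shared or tree block backref, and update the free space tree and
 * block group accounting.
 */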
8383 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8384 				     struct btrfs_delayed_ref_node *node,
8385 				     struct btrfs_delayed_extent_op *extent_op)
8386 {
8387 	struct btrfs_fs_info *fs_info = trans->fs_info;
8388 	int ret;
8389 	struct btrfs_extent_item *extent_item;
8390 	struct btrfs_key extent_key;
8391 	struct btrfs_tree_block_info *block_info;
8392 	struct btrfs_extent_inline_ref *iref;
8393 	struct btrfs_path *path;
8394 	struct extent_buffer *leaf;
8395 	struct btrfs_delayed_tree_ref *ref;
8396 	u32 size = sizeof(*extent_item) + sizeof(*iref);
8397 	u64 num_bytes;
8398 	u64 flags = extent_op->flags_to_set;
8399 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8400 
8401 	ref = btrfs_delayed_node_to_tree_ref(node);
8402 
8403 	extent_key.objectid = node->bytenr;
8404 	if (skinny_metadata) {
8405 		extent_key.offset = ref->level;
8406 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
8407 		num_bytes = fs_info->nodesize;
8408 	} else {
8409 		extent_key.offset = node->num_bytes;
8410 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8411 		size += sizeof(*block_info);
8412 		num_bytes = node->num_bytes;
8413 	}
8414 
8415 	path = btrfs_alloc_path();
8416 	if (!path)
8417 		return -ENOMEM;
8418 
8419 	path->leave_spinning = 1;
8420 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8421 				      &extent_key, size);
8422 	if (ret) {
8423 		btrfs_free_path(path);
8424 		return ret;
8425 	}
8426 
8427 	leaf = path->nodes[0];
8428 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8429 				     struct btrfs_extent_item);
8430 	btrfs_set_extent_refs(leaf, extent_item, 1);
8431 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8432 	btrfs_set_extent_flags(leaf, extent_item,
8433 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8434 
8435 	if (skinny_metadata) {
8436 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8437 	} else {
8438 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8439 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8440 		btrfs_set_tree_block_level(leaf, block_info, ref->level);
8441 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8442 	}
8443 
8444 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8445 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8446 		btrfs_set_extent_inline_ref_type(leaf, iref,
8447 						 BTRFS_SHARED_BLOCK_REF_KEY);
8448 		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8449 	} else {
8450 		btrfs_set_extent_inline_ref_type(leaf, iref,
8451 						 BTRFS_TREE_BLOCK_REF_KEY);
8452 		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8453 	}
8454 
8455 	btrfs_mark_buffer_dirty(leaf);
8456 	btrfs_free_path(path);
8457 
8458 	ret = remove_from_free_space_tree(trans, extent_key.objectid,
8459 					  num_bytes);
8460 	if (ret)
8461 		return ret;
8462 
8463 	ret = update_block_group(trans, fs_info, extent_key.objectid,
8464 				 fs_info->nodesize, 1);
8465 	if (ret) { /* -ENOENT, logic error */
8466 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8467 			extent_key.objectid, extent_key.offset);
8468 		BUG();
8469 	}
8470 
8471 	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8472 					  fs_info->nodesize);
8473 	return ret;
8474 }
8475 
8476 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8477 				     struct btrfs_root *root, u64 owner,
8478 				     u64 offset, u64 ram_bytes,
8479 				     struct btrfs_key *ins)
8480 {
8481 	int ret;
8482 
8483 	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8484 
8485 	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8486 			   root->root_key.objectid, owner, offset,
8487 			   BTRFS_ADD_DELAYED_EXTENT);
8488 
8489 	ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8490 					 ins->offset, 0,
8491 					 root->root_key.objectid, owner,
8492 					 offset, ram_bytes,
8493 					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8494 	return ret;
8495 }
8496 
8497 /*
8498  * this is used by the tree logging recovery code.  It records that
8499  * an extent has been allocated and makes sure to clear the free
8500  * space cache bits as well
8501  */
8502 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8503 				   u64 root_objectid, u64 owner, u64 offset,
8504 				   struct btrfs_key *ins)
8505 {
8506 	struct btrfs_fs_info *fs_info = trans->fs_info;
8507 	int ret;
8508 	struct btrfs_block_group_cache *block_group;
8509 	struct btrfs_space_info *space_info;
8510 
8511 	/*
8512 	 * Mixed block groups will be excluded before processing the log, so we
8513 	 * only need to do the exclude dance if this fs isn't mixed.
8514 	 */
8515 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8516 		ret = __exclude_logged_extent(fs_info, ins->objectid,
8517 					      ins->offset);
8518 		if (ret)
8519 			return ret;
8520 	}
8521 
8522 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8523 	if (!block_group)
8524 		return -EINVAL;
8525 
8526 	space_info = block_group->space_info;
8527 	spin_lock(&space_info->lock);
8528 	spin_lock(&block_group->lock);
8529 	space_info->bytes_reserved += ins->offset;
8530 	block_group->reserved += ins->offset;
8531 	spin_unlock(&block_group->lock);
8532 	spin_unlock(&space_info->lock);
8533 
8534 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8535 					 offset, ins, 1);
8536 	btrfs_put_block_group(block_group);
8537 	return ret;
8538 }
8539 
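/*
 * Set up the extent buffer for a freshly allocated tree block: lock it,
 * initialize the header fields and fsid, and mark it dirty in the right
 * extent io tree (log trees track their dirty pages separately).  Returns
 * the buffer locked for blocking, or an ERR_PTR on failure.
 */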
8540 static struct extent_buffer *
8541 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8542 		      u64 bytenr, int level, u64 owner)
8543 {
8544 	struct btrfs_fs_info *fs_info = root->fs_info;
8545 	struct extent_buffer *buf;
8546 
8547 	buf = btrfs_find_create_tree_block(fs_info, bytenr);
8548 	if (IS_ERR(buf))
8549 		return buf;
8550 
8551 	/*
8552 	 * Extra safety check in case the extent tree is corrupted and the
8553 	 * extent allocator chooses to use a tree block which is already used and
8554 	 * locked.
8555 	 */
8556 	if (buf->lock_owner == current->pid) {
8557 		btrfs_err_rl(fs_info,
8558 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8559 			buf->start, btrfs_header_owner(buf), current->pid);
8560 		free_extent_buffer(buf);
8561 		return ERR_PTR(-EUCLEAN);
8562 	}
8563 
8564 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8565 	btrfs_tree_lock(buf);
8566 	clean_tree_block(fs_info, buf);
8567 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8568 
8569 	btrfs_set_lock_blocking_write(buf);
8570 	set_extent_buffer_uptodate(buf);
8571 
8572 	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8573 	btrfs_set_header_level(buf, level);
8574 	btrfs_set_header_bytenr(buf, buf->start);
8575 	btrfs_set_header_generation(buf, trans->transid);
8576 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8577 	btrfs_set_header_owner(buf, owner);
8578 	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8579 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8580 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8581 		buf->log_index = root->log_transid % 2;
8582 		/*
8583 		 * we allow two log transactions at a time, so use different
8584 		 * EXTENT bits to differentiate their dirty pages.
8585 		 */
8586 		if (buf->log_index == 0)
8587 			set_extent_dirty(&root->dirty_log_pages, buf->start,
8588 					buf->start + buf->len - 1, GFP_NOFS);
8589 		else
8590 			set_extent_new(&root->dirty_log_pages, buf->start,
8591 					buf->start + buf->len - 1);
8592 	} else {
8593 		buf->log_index = -1;
8594 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8595 			 buf->start + buf->len - 1, GFP_NOFS);
8596 	}
8597 	trans->dirty = true;
8598 	/* this returns a buffer locked for blocking */
8599 	return buf;
8600 }
8601 
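/*
 * Reserve @blocksize bytes of metadata for a new tree block.  Try the root's
 * block rsv first, then fall back to a fresh NO_FLUSH reservation, and as a
 * last resort borrow from the global reserve when the space info matches.
 */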
8602 static struct btrfs_block_rsv *
8603 use_block_rsv(struct btrfs_trans_handle *trans,
8604 	      struct btrfs_root *root, u32 blocksize)
8605 {
8606 	struct btrfs_fs_info *fs_info = root->fs_info;
8607 	struct btrfs_block_rsv *block_rsv;
8608 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8609 	int ret;
8610 	bool global_updated = false;
8611 
8612 	block_rsv = get_block_rsv(trans, root);
8613 
8614 	if (unlikely(block_rsv->size == 0))
8615 		goto try_reserve;
8616 again:
8617 	ret = block_rsv_use_bytes(block_rsv, blocksize);
8618 	if (!ret)
8619 		return block_rsv;
8620 
8621 	if (block_rsv->failfast)
8622 		return ERR_PTR(ret);
8623 
8624 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8625 		global_updated = true;
8626 		update_global_block_rsv(fs_info);
8627 		goto again;
8628 	}
8629 
8630 	/*
8631 	 * The global reserve still exists to save us from ourselves, so don't
8632 	 * warn_on if we are short on our delayed refs reserve.
8633 	 */
8634 	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8635 	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8636 		static DEFINE_RATELIMIT_STATE(_rs,
8637 				DEFAULT_RATELIMIT_INTERVAL * 10,
8638 				/*DEFAULT_RATELIMIT_BURST*/ 1);
8639 		if (__ratelimit(&_rs))
8640 			WARN(1, KERN_DEBUG
8641 				"BTRFS: block rsv returned %d\n", ret);
8642 	}
8643 try_reserve:
8644 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8645 				     BTRFS_RESERVE_NO_FLUSH);
8646 	if (!ret)
8647 		return block_rsv;
8648 	/*
8649 	 * If we couldn't reserve metadata bytes, try to use some from
8650 	 * the global reserve, but only if this rsv's space info is the
8651 	 * same as the global reserve's.
8652 	 */
8653 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8654 	    block_rsv->space_info == global_rsv->space_info) {
8655 		ret = block_rsv_use_bytes(global_rsv, blocksize);
8656 		if (!ret)
8657 			return global_rsv;
8658 	}
8659 	return ERR_PTR(ret);
8660 }
8661 
8662 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8663 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
8664 {
8665 	block_rsv_add_bytes(block_rsv, blocksize, false);
8666 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8667 }
8668 
8669 /*
8670  * finds a free extent and does all the dirty work required for allocation;
8671  * returns the tree buffer or an ERR_PTR on error.
8672  */
8673 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8674 					     struct btrfs_root *root,
8675 					     u64 parent, u64 root_objectid,
8676 					     const struct btrfs_disk_key *key,
8677 					     int level, u64 hint,
8678 					     u64 empty_size)
8679 {
8680 	struct btrfs_fs_info *fs_info = root->fs_info;
8681 	struct btrfs_key ins;
8682 	struct btrfs_block_rsv *block_rsv;
8683 	struct extent_buffer *buf;
8684 	struct btrfs_delayed_extent_op *extent_op;
8685 	u64 flags = 0;
8686 	int ret;
8687 	u32 blocksize = fs_info->nodesize;
8688 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8689 
8690 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8691 	if (btrfs_is_testing(fs_info)) {
8692 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8693 					    level, root_objectid);
8694 		if (!IS_ERR(buf))
8695 			root->alloc_bytenr += blocksize;
8696 		return buf;
8697 	}
8698 #endif
8699 
8700 	block_rsv = use_block_rsv(trans, root, blocksize);
8701 	if (IS_ERR(block_rsv))
8702 		return ERR_CAST(block_rsv);
8703 
8704 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8705 				   empty_size, hint, &ins, 0, 0);
8706 	if (ret)
8707 		goto out_unuse;
8708 
8709 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8710 				    root_objectid);
8711 	if (IS_ERR(buf)) {
8712 		ret = PTR_ERR(buf);
8713 		goto out_free_reserved;
8714 	}
8715 
8716 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8717 		if (parent == 0)
8718 			parent = ins.objectid;
8719 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8720 	} else
8721 		BUG_ON(parent > 0);
8722 
8723 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8724 		extent_op = btrfs_alloc_delayed_extent_op();
8725 		if (!extent_op) {
8726 			ret = -ENOMEM;
8727 			goto out_free_buf;
8728 		}
8729 		if (key)
8730 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
8731 		else
8732 			memset(&extent_op->key, 0, sizeof(extent_op->key));
8733 		extent_op->flags_to_set = flags;
8734 		extent_op->update_key = skinny_metadata ? false : true;
8735 		extent_op->update_flags = true;
8736 		extent_op->is_data = false;
8737 		extent_op->level = level;
8738 
8739 		btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8740 				   root_objectid, level, 0,
8741 				   BTRFS_ADD_DELAYED_EXTENT);
8742 		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8743 						 ins.offset, parent,
8744 						 root_objectid, level,
8745 						 BTRFS_ADD_DELAYED_EXTENT,
8746 						 extent_op, NULL, NULL);
8747 		if (ret)
8748 			goto out_free_delayed;
8749 	}
8750 	return buf;
8751 
8752 out_free_delayed:
8753 	btrfs_free_delayed_extent_op(extent_op);
8754 out_free_buf:
8755 	free_extent_buffer(buf);
8756 out_free_reserved:
8757 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8758 out_unuse:
8759 	unuse_block_rsv(fs_info, block_rsv, blocksize);
8760 	return ERR_PTR(ret);
8761 }
8762 
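/*
 * Walk state shared by btrfs_drop_snapshot() and btrfs_drop_subtree().
 * The walk runs in two stages: DROP_REFERENCE drops our reference on each
 * block that is only referenced by the tree being deleted, while
 * UPDATE_BACKREF is entered for a shared subtree whose back refs must be
 * updated before DROP_REFERENCE can resume.  drop_progress/drop_level record
 * how far the drop has gotten so it can be checkpointed in the root item.
 */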
8763 struct walk_control {
8764 	u64 refs[BTRFS_MAX_LEVEL];
8765 	u64 flags[BTRFS_MAX_LEVEL];
8766 	struct btrfs_key update_progress;
8767 	struct btrfs_key drop_progress;
8768 	int drop_level;
8769 	int stage;
8770 	int level;
8771 	int shared_level;
8772 	int update_ref;
8773 	int keep_locks;
8774 	int reada_slot;
8775 	int reada_count;
8776 	int restarted;
8777 };
8778 
8779 #define DROP_REFERENCE	1
8780 #define UPDATE_BACKREF	2
8781 
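/*
 * Read ahead the children of the node at wc->level.  The readahead window
 * (wc->reada_count) shrinks to 2/3 while the walker is still behind the last
 * readahead position and grows by 3/2 (capped at the number of node pointers
 * per block) once it has caught up; blocks the walk would skip anyway are not
 * read ahead.
 */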
8782 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8783 				     struct btrfs_root *root,
8784 				     struct walk_control *wc,
8785 				     struct btrfs_path *path)
8786 {
8787 	struct btrfs_fs_info *fs_info = root->fs_info;
8788 	u64 bytenr;
8789 	u64 generation;
8790 	u64 refs;
8791 	u64 flags;
8792 	u32 nritems;
8793 	struct btrfs_key key;
8794 	struct extent_buffer *eb;
8795 	int ret;
8796 	int slot;
8797 	int nread = 0;
8798 
8799 	if (path->slots[wc->level] < wc->reada_slot) {
8800 		wc->reada_count = wc->reada_count * 2 / 3;
8801 		wc->reada_count = max(wc->reada_count, 2);
8802 	} else {
8803 		wc->reada_count = wc->reada_count * 3 / 2;
8804 		wc->reada_count = min_t(int, wc->reada_count,
8805 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8806 	}
8807 
8808 	eb = path->nodes[wc->level];
8809 	nritems = btrfs_header_nritems(eb);
8810 
8811 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8812 		if (nread >= wc->reada_count)
8813 			break;
8814 
8815 		cond_resched();
8816 		bytenr = btrfs_node_blockptr(eb, slot);
8817 		generation = btrfs_node_ptr_generation(eb, slot);
8818 
8819 		if (slot == path->slots[wc->level])
8820 			goto reada;
8821 
8822 		if (wc->stage == UPDATE_BACKREF &&
8823 		    generation <= root->root_key.offset)
8824 			continue;
8825 
8826 		/* We don't lock the tree block, it's OK to be racy here */
8827 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8828 					       wc->level - 1, 1, &refs,
8829 					       &flags);
8830 		/* We don't care about errors in readahead. */
8831 		if (ret < 0)
8832 			continue;
8833 		BUG_ON(refs == 0);
8834 
8835 		if (wc->stage == DROP_REFERENCE) {
8836 			if (refs == 1)
8837 				goto reada;
8838 
8839 			if (wc->level == 1 &&
8840 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8841 				continue;
8842 			if (!wc->update_ref ||
8843 			    generation <= root->root_key.offset)
8844 				continue;
8845 			btrfs_node_key_to_cpu(eb, &key, slot);
8846 			ret = btrfs_comp_cpu_keys(&key,
8847 						  &wc->update_progress);
8848 			if (ret < 0)
8849 				continue;
8850 		} else {
8851 			if (wc->level == 1 &&
8852 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8853 				continue;
8854 		}
8855 reada:
8856 		readahead_tree_block(fs_info, bytenr);
8857 		nread++;
8858 	}
8859 	wc->reada_slot = slot;
8860 }
8861 
8862 /*
8863  * helper to process tree block while walking down the tree.
8864  *
8865  * when wc->stage == UPDATE_BACKREF, this function updates
8866  * back refs for pointers in the block.
8867  *
8868  * NOTE: return value 1 means we should stop walking down.
8869  */
8870 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8871 				   struct btrfs_root *root,
8872 				   struct btrfs_path *path,
8873 				   struct walk_control *wc, int lookup_info)
8874 {
8875 	struct btrfs_fs_info *fs_info = root->fs_info;
8876 	int level = wc->level;
8877 	struct extent_buffer *eb = path->nodes[level];
8878 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8879 	int ret;
8880 
8881 	if (wc->stage == UPDATE_BACKREF &&
8882 	    btrfs_header_owner(eb) != root->root_key.objectid)
8883 		return 1;
8884 
8885 	/*
8886 	 * once the reference count of a tree block is 1, it won't increase
8887 	 * again. and once the full backref flag is set, we never clear it.
8888 	 */
8889 	if (lookup_info &&
8890 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8891 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8892 		BUG_ON(!path->locks[level]);
8893 		ret = btrfs_lookup_extent_info(trans, fs_info,
8894 					       eb->start, level, 1,
8895 					       &wc->refs[level],
8896 					       &wc->flags[level]);
8897 		BUG_ON(ret == -ENOMEM);
8898 		if (ret)
8899 			return ret;
8900 		BUG_ON(wc->refs[level] == 0);
8901 	}
8902 
8903 	if (wc->stage == DROP_REFERENCE) {
8904 		if (wc->refs[level] > 1)
8905 			return 1;
8906 
8907 		if (path->locks[level] && !wc->keep_locks) {
8908 			btrfs_tree_unlock_rw(eb, path->locks[level]);
8909 			path->locks[level] = 0;
8910 		}
8911 		return 0;
8912 	}
8913 
8914 	/* wc->stage == UPDATE_BACKREF */
8915 	if (!(wc->flags[level] & flag)) {
8916 		BUG_ON(!path->locks[level]);
8917 		ret = btrfs_inc_ref(trans, root, eb, 1);
8918 		BUG_ON(ret); /* -ENOMEM */
8919 		ret = btrfs_dec_ref(trans, root, eb, 0);
8920 		BUG_ON(ret); /* -ENOMEM */
8921 		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8922 						  eb->len, flag,
8923 						  btrfs_header_level(eb), 0);
8924 		BUG_ON(ret); /* -ENOMEM */
8925 		wc->flags[level] |= flag;
8926 	}
8927 
8928 	/*
8929 	 * the block is shared by multiple trees, so it's not good to
8930 	 * keep the tree lock
8931 	 */
8932 	if (path->locks[level] && level > 0) {
8933 		btrfs_tree_unlock_rw(eb, path->locks[level]);
8934 		path->locks[level] = 0;
8935 	}
8936 	return 0;
8937 }
8938 
8939 /*
8940  * This is used to verify a ref exists for this root to deal with a bug where we
8941  * would have a drop_progress key that hadn't been updated properly.
8942  */
8943 static int check_ref_exists(struct btrfs_trans_handle *trans,
8944 			    struct btrfs_root *root, u64 bytenr, u64 parent,
8945 			    int level)
8946 {
8947 	struct btrfs_path *path;
8948 	struct btrfs_extent_inline_ref *iref;
8949 	int ret;
8950 
8951 	path = btrfs_alloc_path();
8952 	if (!path)
8953 		return -ENOMEM;
8954 
8955 	ret = lookup_extent_backref(trans, path, &iref, bytenr,
8956 				    root->fs_info->nodesize, parent,
8957 				    root->root_key.objectid, level, 0);
8958 	btrfs_free_path(path);
8959 	if (ret == -ENOENT)
8960 		return 0;
8961 	if (ret < 0)
8962 		return ret;
8963 	return 1;
8964 }
8965 
8966 /*
8967  * helper to process tree block pointer.
8968  *
8969  * when wc->stage == DROP_REFERENCE, this function checks
8970  * reference count of the block pointed to. if the block
8971  * is shared and we need to update back refs for the subtree
8972  * rooted at the block, this function changes wc->stage to
8973  * UPDATE_BACKREF. if the block is shared and there is no
8974  * need to update back refs, this function drops the reference
8975  * to the block.
8976  *
8977  * NOTE: return value 1 means we should stop walking down.
8978  */
8979 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8980 				 struct btrfs_root *root,
8981 				 struct btrfs_path *path,
8982 				 struct walk_control *wc, int *lookup_info)
8983 {
8984 	struct btrfs_fs_info *fs_info = root->fs_info;
8985 	u64 bytenr;
8986 	u64 generation;
8987 	u64 parent;
8988 	struct btrfs_key key;
8989 	struct btrfs_key first_key;
8990 	struct extent_buffer *next;
8991 	int level = wc->level;
8992 	int reada = 0;
8993 	int ret = 0;
8994 	bool need_account = false;
8995 
8996 	generation = btrfs_node_ptr_generation(path->nodes[level],
8997 					       path->slots[level]);
8998 	/*
8999 	 * if the lower level block was created before the snapshot
9000 	 * was created, we know there is no need to update back refs
9001 	 * for the subtree
9002 	 */
9003 	if (wc->stage == UPDATE_BACKREF &&
9004 	    generation <= root->root_key.offset) {
9005 		*lookup_info = 1;
9006 		return 1;
9007 	}
9008 
9009 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
9010 	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
9011 			      path->slots[level]);
9012 
9013 	next = find_extent_buffer(fs_info, bytenr);
9014 	if (!next) {
9015 		next = btrfs_find_create_tree_block(fs_info, bytenr);
9016 		if (IS_ERR(next))
9017 			return PTR_ERR(next);
9018 
9019 		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
9020 					       level - 1);
9021 		reada = 1;
9022 	}
9023 	btrfs_tree_lock(next);
9024 	btrfs_set_lock_blocking_write(next);
9025 
9026 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
9027 				       &wc->refs[level - 1],
9028 				       &wc->flags[level - 1]);
9029 	if (ret < 0)
9030 		goto out_unlock;
9031 
9032 	if (unlikely(wc->refs[level - 1] == 0)) {
9033 		btrfs_err(fs_info, "Missing references.");
9034 		ret = -EIO;
9035 		goto out_unlock;
9036 	}
9037 	*lookup_info = 0;
9038 
9039 	if (wc->stage == DROP_REFERENCE) {
9040 		if (wc->refs[level - 1] > 1) {
9041 			need_account = true;
9042 			if (level == 1 &&
9043 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
9044 				goto skip;
9045 
9046 			if (!wc->update_ref ||
9047 			    generation <= root->root_key.offset)
9048 				goto skip;
9049 
9050 			btrfs_node_key_to_cpu(path->nodes[level], &key,
9051 					      path->slots[level]);
9052 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
9053 			if (ret < 0)
9054 				goto skip;
9055 
9056 			wc->stage = UPDATE_BACKREF;
9057 			wc->shared_level = level - 1;
9058 		}
9059 	} else {
9060 		if (level == 1 &&
9061 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
9062 			goto skip;
9063 	}
9064 
9065 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
9066 		btrfs_tree_unlock(next);
9067 		free_extent_buffer(next);
9068 		next = NULL;
9069 		*lookup_info = 1;
9070 	}
9071 
9072 	if (!next) {
9073 		if (reada && level == 1)
9074 			reada_walk_down(trans, root, wc, path);
9075 		next = read_tree_block(fs_info, bytenr, generation, level - 1,
9076 				       &first_key);
9077 		if (IS_ERR(next)) {
9078 			return PTR_ERR(next);
9079 		} else if (!extent_buffer_uptodate(next)) {
9080 			free_extent_buffer(next);
9081 			return -EIO;
9082 		}
9083 		btrfs_tree_lock(next);
9084 		btrfs_set_lock_blocking_write(next);
9085 	}
9086 
9087 	level--;
9088 	ASSERT(level == btrfs_header_level(next));
9089 	if (level != btrfs_header_level(next)) {
9090 		btrfs_err(root->fs_info, "mismatched level");
9091 		ret = -EIO;
9092 		goto out_unlock;
9093 	}
9094 	path->nodes[level] = next;
9095 	path->slots[level] = 0;
9096 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9097 	wc->level = level;
9098 	if (wc->level == 1)
9099 		wc->reada_slot = 0;
9100 	return 0;
9101 skip:
9102 	wc->refs[level - 1] = 0;
9103 	wc->flags[level - 1] = 0;
9104 	if (wc->stage == DROP_REFERENCE) {
9105 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
9106 			parent = path->nodes[level]->start;
9107 		} else {
9108 			ASSERT(root->root_key.objectid ==
9109 			       btrfs_header_owner(path->nodes[level]));
9110 			if (root->root_key.objectid !=
9111 			    btrfs_header_owner(path->nodes[level])) {
9112 				btrfs_err(root->fs_info,
9113 						"mismatched block owner");
9114 				ret = -EIO;
9115 				goto out_unlock;
9116 			}
9117 			parent = 0;
9118 		}
9119 
9120 		/*
9121 		 * If we had a drop_progress we need to verify the refs are set
9122 		 * as expected.  If we find our ref then we know that from here
9123 		 * on out everything should be correct, and we can clear the
9124 		 * ->restarted flag.
9125 		 */
9126 		if (wc->restarted) {
9127 			ret = check_ref_exists(trans, root, bytenr, parent,
9128 					       level - 1);
9129 			if (ret < 0)
9130 				goto out_unlock;
9131 			if (ret == 0)
9132 				goto no_delete;
9133 			ret = 0;
9134 			wc->restarted = 0;
9135 		}
9136 
9137 		/*
9138 		 * Reloc trees don't contribute to qgroup numbers, and we have
9139 		 * already accounted for them at merge time (replace_path),
9140 		 * so we can skip the expensive subtree trace here.
9141 		 */
9142 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
9143 		    need_account) {
9144 			ret = btrfs_qgroup_trace_subtree(trans, next,
9145 							 generation, level - 1);
9146 			if (ret) {
9147 				btrfs_err_rl(fs_info,
9148 					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
9149 					     ret);
9150 			}
9151 		}
9152 
9153 		/*
9154 		 * We need to update the next key in our walk control so we can
9155 		 * update the drop_progress key accordingly.  We don't care if
9156 		 * find_next_key doesn't find a key because that means we're at
9157 		 * the end and are going to clean up now.
9158 		 */
9159 		wc->drop_level = level;
9160 		find_next_key(path, level, &wc->drop_progress);
9161 
9162 		ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
9163 					parent, root->root_key.objectid,
9164 					level - 1, 0);
9165 		if (ret)
9166 			goto out_unlock;
9167 	}
9168 no_delete:
9169 	*lookup_info = 1;
9170 	ret = 1;
9171 
9172 out_unlock:
9173 	btrfs_tree_unlock(next);
9174 	free_extent_buffer(next);
9175 
9176 	return ret;
9177 }
9178 
9179 /*
9180  * helper to process tree block while walking up the tree.
9181  *
9182  * when wc->stage == DROP_REFERENCE, this function drops
9183  * reference count on the block.
9184  *
9185  * when wc->stage == UPDATE_BACKREF, this function changes
9186  * wc->stage back to DROP_REFERENCE if we changed wc->stage
9187  * to UPDATE_BACKREF previously while processing the block.
9188  *
9189  * NOTE: return value 1 means we should stop walking up.
9190  */
9191 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9192 				 struct btrfs_root *root,
9193 				 struct btrfs_path *path,
9194 				 struct walk_control *wc)
9195 {
9196 	struct btrfs_fs_info *fs_info = root->fs_info;
9197 	int ret;
9198 	int level = wc->level;
9199 	struct extent_buffer *eb = path->nodes[level];
9200 	u64 parent = 0;
9201 
9202 	if (wc->stage == UPDATE_BACKREF) {
9203 		BUG_ON(wc->shared_level < level);
9204 		if (level < wc->shared_level)
9205 			goto out;
9206 
9207 		ret = find_next_key(path, level + 1, &wc->update_progress);
9208 		if (ret > 0)
9209 			wc->update_ref = 0;
9210 
9211 		wc->stage = DROP_REFERENCE;
9212 		wc->shared_level = -1;
9213 		path->slots[level] = 0;
9214 
9215 		/*
9216 		 * check reference count again if the block isn't locked.
9217 		 * we should start walking down the tree again if reference
9218 		 * count is one.
9219 		 */
9220 		if (!path->locks[level]) {
9221 			BUG_ON(level == 0);
9222 			btrfs_tree_lock(eb);
9223 			btrfs_set_lock_blocking_write(eb);
9224 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9225 
9226 			ret = btrfs_lookup_extent_info(trans, fs_info,
9227 						       eb->start, level, 1,
9228 						       &wc->refs[level],
9229 						       &wc->flags[level]);
9230 			if (ret < 0) {
9231 				btrfs_tree_unlock_rw(eb, path->locks[level]);
9232 				path->locks[level] = 0;
9233 				return ret;
9234 			}
9235 			BUG_ON(wc->refs[level] == 0);
9236 			if (wc->refs[level] == 1) {
9237 				btrfs_tree_unlock_rw(eb, path->locks[level]);
9238 				path->locks[level] = 0;
9239 				return 1;
9240 			}
9241 		}
9242 	}
9243 
9244 	/* wc->stage == DROP_REFERENCE */
9245 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9246 
9247 	if (wc->refs[level] == 1) {
9248 		if (level == 0) {
9249 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9250 				ret = btrfs_dec_ref(trans, root, eb, 1);
9251 			else
9252 				ret = btrfs_dec_ref(trans, root, eb, 0);
9253 			BUG_ON(ret); /* -ENOMEM */
9254 			ret = btrfs_qgroup_trace_leaf_items(trans, eb);
9255 			if (ret) {
9256 				btrfs_err_rl(fs_info,
9257 					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
9258 					     ret);
9259 			}
9260 		}
9261 		/* make block locked assertion in clean_tree_block happy */
9262 		if (!path->locks[level] &&
9263 		    btrfs_header_generation(eb) == trans->transid) {
9264 			btrfs_tree_lock(eb);
9265 			btrfs_set_lock_blocking_write(eb);
9266 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9267 		}
9268 		clean_tree_block(fs_info, eb);
9269 	}
9270 
9271 	if (eb == root->node) {
9272 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9273 			parent = eb->start;
9274 		else if (root->root_key.objectid != btrfs_header_owner(eb))
9275 			goto owner_mismatch;
9276 	} else {
9277 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9278 			parent = path->nodes[level + 1]->start;
9279 		else if (root->root_key.objectid !=
9280 			 btrfs_header_owner(path->nodes[level + 1]))
9281 			goto owner_mismatch;
9282 	}
9283 
9284 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9285 out:
9286 	wc->refs[level] = 0;
9287 	wc->flags[level] = 0;
9288 	return 0;
9289 
9290 owner_mismatch:
9291 	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
9292 		     btrfs_header_owner(eb), root->root_key.objectid);
9293 	return -EUCLEAN;
9294 }
9295 
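/*
 * Walk down from wc->level towards the leaves, handing each block to
 * walk_down_proc()/do_walk_down() until one of them tells us to stop
 * (return value > 0), we hit a leaf, or we run out of slots.
 */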
9296 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9297 				   struct btrfs_root *root,
9298 				   struct btrfs_path *path,
9299 				   struct walk_control *wc)
9300 {
9301 	int level = wc->level;
9302 	int lookup_info = 1;
9303 	int ret;
9304 
9305 	while (level >= 0) {
9306 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
9307 		if (ret > 0)
9308 			break;
9309 
9310 		if (level == 0)
9311 			break;
9312 
9313 		if (path->slots[level] >=
9314 		    btrfs_header_nritems(path->nodes[level]))
9315 			break;
9316 
9317 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
9318 		if (ret > 0) {
9319 			path->slots[level]++;
9320 			continue;
9321 		} else if (ret < 0)
9322 			return ret;
9323 		level = wc->level;
9324 	}
9325 	return 0;
9326 }
9327 
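/*
 * Walk back up from wc->level, dropping each block we are finished with via
 * walk_up_proc(), until we either find another slot to descend into
 * (returns 0) or reach @max_level (returns 1).
 */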
9328 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9329 				 struct btrfs_root *root,
9330 				 struct btrfs_path *path,
9331 				 struct walk_control *wc, int max_level)
9332 {
9333 	int level = wc->level;
9334 	int ret;
9335 
9336 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9337 	while (level < max_level && path->nodes[level]) {
9338 		wc->level = level;
9339 		if (path->slots[level] + 1 <
9340 		    btrfs_header_nritems(path->nodes[level])) {
9341 			path->slots[level]++;
9342 			return 0;
9343 		} else {
9344 			ret = walk_up_proc(trans, root, path, wc);
9345 			if (ret > 0)
9346 				return 0;
9347 			if (ret < 0)
9348 				return ret;
9349 
9350 			if (path->locks[level]) {
9351 				btrfs_tree_unlock_rw(path->nodes[level],
9352 						     path->locks[level]);
9353 				path->locks[level] = 0;
9354 			}
9355 			free_extent_buffer(path->nodes[level]);
9356 			path->nodes[level] = NULL;
9357 			level++;
9358 		}
9359 	}
9360 	return 1;
9361 }
9362 
9363 /*
9364  * drop a subvolume tree.
9365  *
9366  * this function traverses the tree freeing any blocks that are only
9367  * referenced by the tree.
9368  *
9369  * when a shared tree block is found, this function decreases its
9370  * reference count by one. if update_ref is true, this function
9371  * also makes sure backrefs for the shared block and all lower level
9372  * blocks are properly updated.
9373  *
9374  * If called with for_reloc == 0, may exit early with -EAGAIN
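 *
 * For illustration only (callers live outside this file): dropping a deleted
 * subvolume from the cleaner thread is expected to look roughly like
 * btrfs_drop_snapshot(root, NULL, 1, 0), letting this function start its own
 * transactions and checkpoint drop_progress so an interrupted drop can be
 * resumed on the next mount.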
9375  */
9376 int btrfs_drop_snapshot(struct btrfs_root *root,
9377 			 struct btrfs_block_rsv *block_rsv, int update_ref,
9378 			 int for_reloc)
9379 {
9380 	struct btrfs_fs_info *fs_info = root->fs_info;
9381 	struct btrfs_path *path;
9382 	struct btrfs_trans_handle *trans;
9383 	struct btrfs_root *tree_root = fs_info->tree_root;
9384 	struct btrfs_root_item *root_item = &root->root_item;
9385 	struct walk_control *wc;
9386 	struct btrfs_key key;
9387 	int err = 0;
9388 	int ret;
9389 	int level;
9390 	bool root_dropped = false;
9391 
9392 	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
9393 
9394 	path = btrfs_alloc_path();
9395 	if (!path) {
9396 		err = -ENOMEM;
9397 		goto out;
9398 	}
9399 
9400 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9401 	if (!wc) {
9402 		btrfs_free_path(path);
9403 		err = -ENOMEM;
9404 		goto out;
9405 	}
9406 
9407 	trans = btrfs_start_transaction(tree_root, 0);
9408 	if (IS_ERR(trans)) {
9409 		err = PTR_ERR(trans);
9410 		goto out_free;
9411 	}
9412 
9413 	err = btrfs_run_delayed_items(trans);
9414 	if (err)
9415 		goto out_end_trans;
9416 
9417 	if (block_rsv)
9418 		trans->block_rsv = block_rsv;
9419 
9420 	/*
9421 	 * This will help us catch people modifying the fs tree while we're
9422 	 * dropping it.  It is unsafe to mess with the fs tree while it's being
9423 	 * dropped as we unlock the root node and parent nodes as we walk down
9424 	 * the tree, assuming nothing will change.  If something does change
9425 	 * then we'll have stale information and drop references to blocks we've
9426 	 * already dropped.
9427 	 */
9428 	set_bit(BTRFS_ROOT_DELETING, &root->state);
9429 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9430 		level = btrfs_header_level(root->node);
9431 		path->nodes[level] = btrfs_lock_root_node(root);
9432 		btrfs_set_lock_blocking_write(path->nodes[level]);
9433 		path->slots[level] = 0;
9434 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9435 		memset(&wc->update_progress, 0,
9436 		       sizeof(wc->update_progress));
9437 	} else {
9438 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9439 		memcpy(&wc->update_progress, &key,
9440 		       sizeof(wc->update_progress));
9441 
9442 		level = root_item->drop_level;
9443 		BUG_ON(level == 0);
9444 		path->lowest_level = level;
9445 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9446 		path->lowest_level = 0;
9447 		if (ret < 0) {
9448 			err = ret;
9449 			goto out_end_trans;
9450 		}
9451 		WARN_ON(ret > 0);
9452 
9453 		/*
9454 		 * unlock our path, this is safe because only this
9455 		 * function is allowed to delete this snapshot
9456 		 */
9457 		btrfs_unlock_up_safe(path, 0);
9458 
9459 		level = btrfs_header_level(root->node);
9460 		while (1) {
9461 			btrfs_tree_lock(path->nodes[level]);
9462 			btrfs_set_lock_blocking_write(path->nodes[level]);
9463 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9464 
9465 			ret = btrfs_lookup_extent_info(trans, fs_info,
9466 						path->nodes[level]->start,
9467 						level, 1, &wc->refs[level],
9468 						&wc->flags[level]);
9469 			if (ret < 0) {
9470 				err = ret;
9471 				goto out_end_trans;
9472 			}
9473 			BUG_ON(wc->refs[level] == 0);
9474 
9475 			if (level == root_item->drop_level)
9476 				break;
9477 
9478 			btrfs_tree_unlock(path->nodes[level]);
9479 			path->locks[level] = 0;
9480 			WARN_ON(wc->refs[level] != 1);
9481 			level--;
9482 		}
9483 	}
9484 
9485 	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
9486 	wc->level = level;
9487 	wc->shared_level = -1;
9488 	wc->stage = DROP_REFERENCE;
9489 	wc->update_ref = update_ref;
9490 	wc->keep_locks = 0;
9491 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9492 
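	/*
	 * Alternate walking down (dropping references and freeing blocks) and
	 * walking back up.  After each pass record drop_progress/drop_level in
	 * the root item and, if the transaction is getting full or the cleaner
	 * needs to sleep, write the root item, end the transaction and start a
	 * new one so a large drop never pins a single transaction.
	 */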
9493 	while (1) {
9494 
9495 		ret = walk_down_tree(trans, root, path, wc);
9496 		if (ret < 0) {
9497 			err = ret;
9498 			break;
9499 		}
9500 
9501 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9502 		if (ret < 0) {
9503 			err = ret;
9504 			break;
9505 		}
9506 
9507 		if (ret > 0) {
9508 			BUG_ON(wc->stage != DROP_REFERENCE);
9509 			break;
9510 		}
9511 
9512 		if (wc->stage == DROP_REFERENCE) {
9513 			wc->drop_level = wc->level;
9514 			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
9515 					      &wc->drop_progress,
9516 					      path->slots[wc->drop_level]);
9517 		}
9518 		btrfs_cpu_key_to_disk(&root_item->drop_progress,
9519 				      &wc->drop_progress);
9520 		root_item->drop_level = wc->drop_level;
9521 
9522 		BUG_ON(wc->level == 0);
9523 		if (btrfs_should_end_transaction(trans) ||
9524 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9525 			ret = btrfs_update_root(trans, tree_root,
9526 						&root->root_key,
9527 						root_item);
9528 			if (ret) {
9529 				btrfs_abort_transaction(trans, ret);
9530 				err = ret;
9531 				goto out_end_trans;
9532 			}
9533 
9534 			btrfs_end_transaction_throttle(trans);
9535 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9536 				btrfs_debug(fs_info,
9537 					    "drop snapshot early exit");
9538 				err = -EAGAIN;
9539 				goto out_free;
9540 			}
9541 
9542 			trans = btrfs_start_transaction(tree_root, 0);
9543 			if (IS_ERR(trans)) {
9544 				err = PTR_ERR(trans);
9545 				goto out_free;
9546 			}
9547 			if (block_rsv)
9548 				trans->block_rsv = block_rsv;
9549 		}
9550 	}
9551 	btrfs_release_path(path);
9552 	if (err)
9553 		goto out_end_trans;
9554 
9555 	ret = btrfs_del_root(trans, &root->root_key);
9556 	if (ret) {
9557 		btrfs_abort_transaction(trans, ret);
9558 		err = ret;
9559 		goto out_end_trans;
9560 	}
9561 
9562 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9563 		ret = btrfs_find_root(tree_root, &root->root_key, path,
9564 				      NULL, NULL);
9565 		if (ret < 0) {
9566 			btrfs_abort_transaction(trans, ret);
9567 			err = ret;
9568 			goto out_end_trans;
9569 		} else if (ret > 0) {
9570 			/* if we fail to delete the orphan item this time
9571 			 * around, it'll get picked up the next time.
9572 			 *
9573 			 * The most common failure here is just -ENOENT.
9574 			 */
9575 			btrfs_del_orphan_item(trans, tree_root,
9576 					      root->root_key.objectid);
9577 		}
9578 	}
9579 
9580 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9581 		btrfs_add_dropped_root(trans, root);
9582 	} else {
9583 		free_extent_buffer(root->node);
9584 		free_extent_buffer(root->commit_root);
9585 		btrfs_put_fs_root(root);
9586 	}
9587 	root_dropped = true;
9588 out_end_trans:
9589 	btrfs_end_transaction_throttle(trans);
9590 out_free:
9591 	kfree(wc);
9592 	btrfs_free_path(path);
9593 out:
9594 	/*
9595 	 * So if we need to stop dropping the snapshot for whatever reason we
9596 	 * need to make sure to add it back to the dead root list so that we
9597 	 * keep trying to do the work later.  This also cleans up roots that
9598 	 * aren't in the radix tree (like when we recover after a power fail
9599 	 * or unmount) so we don't leak memory.
9600 	 */
9601 	if (!for_reloc && !root_dropped)
9602 		btrfs_add_dead_root(root);
9603 	if (err && err != -EAGAIN)
9604 		btrfs_handle_fs_error(fs_info, err, NULL);
9605 	return err;
9606 }
9607 
9608 /*
9609  * drop subtree rooted at tree block 'node'.
9610  *
9611  * NOTE: this function will unlock and release tree block 'node'.
9612  * only used by the relocation code.
9613  */
9614 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9615 			struct btrfs_root *root,
9616 			struct extent_buffer *node,
9617 			struct extent_buffer *parent)
9618 {
9619 	struct btrfs_fs_info *fs_info = root->fs_info;
9620 	struct btrfs_path *path;
9621 	struct walk_control *wc;
9622 	int level;
9623 	int parent_level;
9624 	int ret = 0;
9625 	int wret;
9626 
9627 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9628 
9629 	path = btrfs_alloc_path();
9630 	if (!path)
9631 		return -ENOMEM;
9632 
9633 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9634 	if (!wc) {
9635 		btrfs_free_path(path);
9636 		return -ENOMEM;
9637 	}
9638 
9639 	btrfs_assert_tree_locked(parent);
9640 	parent_level = btrfs_header_level(parent);
9641 	extent_buffer_get(parent);
9642 	path->nodes[parent_level] = parent;
9643 	path->slots[parent_level] = btrfs_header_nritems(parent);
9644 
9645 	btrfs_assert_tree_locked(node);
9646 	level = btrfs_header_level(node);
9647 	path->nodes[level] = node;
9648 	path->slots[level] = 0;
9649 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9650 
9651 	wc->refs[parent_level] = 1;
9652 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9653 	wc->level = level;
9654 	wc->shared_level = -1;
9655 	wc->stage = DROP_REFERENCE;
9656 	wc->update_ref = 0;
9657 	wc->keep_locks = 1;
9658 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9659 
9660 	while (1) {
9661 		wret = walk_down_tree(trans, root, path, wc);
9662 		if (wret < 0) {
9663 			ret = wret;
9664 			break;
9665 		}
9666 
9667 		wret = walk_up_tree(trans, root, path, wc, parent_level);
9668 		if (wret < 0)
9669 			ret = wret;
9670 		if (wret != 0)
9671 			break;
9672 	}
9673 
9674 	kfree(wc);
9675 	btrfs_free_path(path);
9676 	return ret;
9677 }
9678 
9679 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9680 {
9681 	u64 num_devices;
9682 	u64 stripped;
9683 
9684 	/*
9685 	 * if a restripe for this chunk_type is on, pick the target profile
9686 	 * and return it, otherwise do the usual balance conversion
9687 	 */
9688 	stripped = get_restripe_target(fs_info, flags);
9689 	if (stripped)
9690 		return extended_to_chunk(stripped);
9691 
9692 	num_devices = fs_info->fs_devices->rw_devices;
9693 
9694 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
9695 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9696 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9697 
9698 	if (num_devices == 1) {
9699 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9700 		stripped = flags & ~stripped;
9701 
9702 		/* turn raid0 into single device chunks */
9703 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
9704 			return stripped;
9705 
9706 		/* turn mirroring into duplication */
9707 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9708 			     BTRFS_BLOCK_GROUP_RAID10))
9709 			return stripped | BTRFS_BLOCK_GROUP_DUP;
9710 	} else {
9711 		/* they already had raid on here, just return */
9712 		if (flags & stripped)
9713 			return flags;
9714 
9715 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9716 		stripped = flags & ~stripped;
9717 
9718 		/* switch duplicated blocks with raid1 */
9719 		if (flags & BTRFS_BLOCK_GROUP_DUP)
9720 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
9721 
9722 		/* this is drive concat, leave it alone */
9723 	}
9724 
9725 	return flags;
9726 }
9727 
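/*
 * Try to mark @cache read-only.  This only succeeds when the space info can
 * still absorb the block group's unused bytes (plus a small cushion for
 * metadata/system chunk allocation unless @force is set).
 */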
9728 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9729 {
9730 	struct btrfs_space_info *sinfo = cache->space_info;
9731 	u64 num_bytes;
9732 	u64 sinfo_used;
9733 	u64 min_allocable_bytes;
9734 	int ret = -ENOSPC;
9735 
9736 	/*
9737 	 * We need some metadata space and system metadata space for
9738 	 * allocating chunks in some corner cases, so require a minimum of
9739 	 * free space unless we're forced to set the block group read-only.
9740 	 */
9741 	if ((sinfo->flags &
9742 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9743 	    !force)
9744 		min_allocable_bytes = SZ_1M;
9745 	else
9746 		min_allocable_bytes = 0;
9747 
9748 	spin_lock(&sinfo->lock);
9749 	spin_lock(&cache->lock);
9750 
9751 	if (cache->ro) {
9752 		cache->ro++;
9753 		ret = 0;
9754 		goto out;
9755 	}
9756 
9757 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9758 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
9759 	sinfo_used = btrfs_space_info_used(sinfo, true);
9760 
9761 	if (sinfo_used + num_bytes + min_allocable_bytes <=
9762 	    sinfo->total_bytes) {
9763 		sinfo->bytes_readonly += num_bytes;
9764 		cache->ro++;
9765 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9766 		ret = 0;
9767 	}
9768 out:
9769 	spin_unlock(&cache->lock);
9770 	spin_unlock(&sinfo->lock);
9771 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
9772 		btrfs_info(cache->fs_info,
9773 			"unable to make block group %llu ro",
9774 			cache->key.objectid);
9775 		btrfs_info(cache->fs_info,
9776 			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9777 			sinfo_used, num_bytes, min_allocable_bytes);
9778 		dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9779 	}
9780 	return ret;
9781 }
9782 
9783 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9784 
9785 {
9786 	struct btrfs_fs_info *fs_info = cache->fs_info;
9787 	struct btrfs_trans_handle *trans;
9788 	u64 alloc_flags;
9789 	int ret;
9790 
9791 again:
9792 	trans = btrfs_join_transaction(fs_info->extent_root);
9793 	if (IS_ERR(trans))
9794 		return PTR_ERR(trans);
9795 
9796 	/*
9797 	 * we're not allowed to set block groups readonly after the dirty
9798 	 * block groups cache has started writing.  If it already started,
9799 	 * back off and let this transaction commit
9800 	 */
9801 	mutex_lock(&fs_info->ro_block_group_mutex);
9802 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9803 		u64 transid = trans->transid;
9804 
9805 		mutex_unlock(&fs_info->ro_block_group_mutex);
9806 		btrfs_end_transaction(trans);
9807 
9808 		ret = btrfs_wait_for_commit(fs_info, transid);
9809 		if (ret)
9810 			return ret;
9811 		goto again;
9812 	}
9813 
9814 	/*
9815 	 * if we are changing raid levels, try to allocate a corresponding
9816 	 * block group with the new raid level.
9817 	 */
9818 	alloc_flags = update_block_group_flags(fs_info, cache->flags);
9819 	if (alloc_flags != cache->flags) {
9820 		ret = do_chunk_alloc(trans, alloc_flags,
9821 				     CHUNK_ALLOC_FORCE);
9822 		/*
9823 		 * ENOSPC is allowed here, we may have enough space
9824 		 * already allocated at the new raid level to
9825 		 * carry on
9826 		 */
9827 		if (ret == -ENOSPC)
9828 			ret = 0;
9829 		if (ret < 0)
9830 			goto out;
9831 	}
9832 
9833 	ret = inc_block_group_ro(cache, 0);
9834 	if (!ret)
9835 		goto out;
9836 	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9837 	ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9838 	if (ret < 0)
9839 		goto out;
9840 	ret = inc_block_group_ro(cache, 0);
9841 out:
9842 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9843 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
9844 		mutex_lock(&fs_info->chunk_mutex);
9845 		check_system_chunk(trans, alloc_flags);
9846 		mutex_unlock(&fs_info->chunk_mutex);
9847 	}
9848 	mutex_unlock(&fs_info->ro_block_group_mutex);
9849 
9850 	btrfs_end_transaction(trans);
9851 	return ret;
9852 }
9853 
9854 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9855 {
9856 	u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9857 
9858 	return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9859 }
9860 
9861 /*
9862  * helper to account the unused space of all the readonly block group in the
9863  * helper to account the unused space of all the readonly block groups in the
9864  */
9865 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9866 {
9867 	struct btrfs_block_group_cache *block_group;
9868 	u64 free_bytes = 0;
9869 	int factor;
9870 
9871 	/* It's df, we don't care if it's racy */
9872 	if (list_empty(&sinfo->ro_bgs))
9873 		return 0;
9874 
9875 	spin_lock(&sinfo->lock);
9876 	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9877 		spin_lock(&block_group->lock);
9878 
9879 		if (!block_group->ro) {
9880 			spin_unlock(&block_group->lock);
9881 			continue;
9882 		}
9883 
9884 		factor = btrfs_bg_type_to_factor(block_group->flags);
9885 		free_bytes += (block_group->key.offset -
9886 			       btrfs_block_group_used(&block_group->item)) *
9887 			       factor;
9888 
9889 		spin_unlock(&block_group->lock);
9890 	}
9891 	spin_unlock(&sinfo->lock);
9892 
9893 	return free_bytes;
9894 }
9895 
9896 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9897 {
9898 	struct btrfs_space_info *sinfo = cache->space_info;
9899 	u64 num_bytes;
9900 
9901 	BUG_ON(!cache->ro);
9902 
9903 	spin_lock(&sinfo->lock);
9904 	spin_lock(&cache->lock);
9905 	if (!--cache->ro) {
9906 		num_bytes = cache->key.offset - cache->reserved -
9907 			    cache->pinned - cache->bytes_super -
9908 			    btrfs_block_group_used(&cache->item);
9909 		sinfo->bytes_readonly -= num_bytes;
9910 		list_del_init(&cache->ro_list);
9911 	}
9912 	spin_unlock(&cache->lock);
9913 	spin_unlock(&sinfo->lock);
9914 }
9915 
9916 /*
9917  * Checks to see if it's even possible to relocate this block group.
9918  *
9919  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9920  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9921  */
9922 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9923 {
9924 	struct btrfs_root *root = fs_info->extent_root;
9925 	struct btrfs_block_group_cache *block_group;
9926 	struct btrfs_space_info *space_info;
9927 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9928 	struct btrfs_device *device;
9929 	struct btrfs_trans_handle *trans;
9930 	u64 min_free;
9931 	u64 dev_min = 1;
9932 	u64 dev_nr = 0;
9933 	u64 target;
9934 	int debug;
9935 	int index;
9936 	int full = 0;
9937 	int ret = 0;
9938 
9939 	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9940 
9941 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
9942 
9943 	/* odd, couldn't find the block group, leave it alone */
9944 	if (!block_group) {
9945 		if (debug)
9946 			btrfs_warn(fs_info,
9947 				   "can't find block group for bytenr %llu",
9948 				   bytenr);
9949 		return -1;
9950 	}
9951 
9952 	min_free = btrfs_block_group_used(&block_group->item);
9953 
9954 	/* no bytes used, we're good */
9955 	if (!min_free)
9956 		goto out;
9957 
9958 	space_info = block_group->space_info;
9959 	spin_lock(&space_info->lock);
9960 
9961 	full = space_info->full;
9962 
9963 	/*
9964 	 * if this is the last block group we have in this space, we can't
9965 	 * relocate it unless we're able to allocate a new chunk below.
9966 	 *
9967 	 * Otherwise, we need to make sure we have room in the space to handle
9968 	 * all of the extents from this block group.  If we can, we're good
9969 	 */
9970 	if ((space_info->total_bytes != block_group->key.offset) &&
9971 	    (btrfs_space_info_used(space_info, false) + min_free <
9972 	     space_info->total_bytes)) {
9973 		spin_unlock(&space_info->lock);
9974 		goto out;
9975 	}
9976 	spin_unlock(&space_info->lock);
9977 
9978 	/*
9979 	 * ok we don't have enough space, but maybe we have free space on our
9980 	 * devices to allocate new chunks for relocation, so loop through our
9981 	 * alloc devices and guess if we have enough space.  if this block
9982 	 * group is going to be restriped, run checks against the target
9983 	 * profile instead of the current one.
9984 	 */
9985 	ret = -1;
9986 
9987 	/*
9988 	 * index:
9989 	 *      0: raid10
9990 	 *      1: raid1
9991 	 *      2: dup
9992 	 *      3: raid0
9993 	 *      4: single
9994 	 */
9995 	target = get_restripe_target(fs_info, block_group->flags);
9996 	if (target) {
9997 		index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9998 	} else {
9999 		/*
10000 		 * this is just a balance, so if we were marked as full
10001 		 * we know there is no space for a new chunk
10002 		 */
10003 		if (full) {
10004 			if (debug)
10005 				btrfs_warn(fs_info,
10006 					   "no space to alloc new chunk for block group %llu",
10007 					   block_group->key.objectid);
10008 			goto out;
10009 		}
10010 
10011 		index = btrfs_bg_flags_to_raid_index(block_group->flags);
10012 	}
10013 
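	/*
	 * Scale the estimate for the target profile: RAID10 stripes across
	 * mirrored pairs, so with at least 4 devices each one only needs about
	 * half of min_free; RAID1 needs 2 devices; DUP keeps both copies on
	 * one device and so needs twice the space; RAID0 spreads the data
	 * evenly over all rw devices.
	 */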
10014 	if (index == BTRFS_RAID_RAID10) {
10015 		dev_min = 4;
10016 		/* Divide by 2 */
10017 		min_free >>= 1;
10018 	} else if (index == BTRFS_RAID_RAID1) {
10019 		dev_min = 2;
10020 	} else if (index == BTRFS_RAID_DUP) {
10021 		/* Multiply by 2 */
10022 		min_free <<= 1;
10023 	} else if (index == BTRFS_RAID_RAID0) {
10024 		dev_min = fs_devices->rw_devices;
10025 		min_free = div64_u64(min_free, dev_min);
10026 	}
10027 
10028 	/* We need to do this so that we can look at pending chunks */
10029 	trans = btrfs_join_transaction(root);
10030 	if (IS_ERR(trans)) {
10031 		ret = PTR_ERR(trans);
10032 		goto out;
10033 	}
10034 
10035 	mutex_lock(&fs_info->chunk_mutex);
10036 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
10037 		u64 dev_offset;
10038 
10039 		/*
10040 		 * check to make sure we can actually find a chunk with enough
10041 		 * space to fit our block group in.
10042 		 */
10043 		if (device->total_bytes > device->bytes_used + min_free &&
10044 		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
10045 			ret = find_free_dev_extent(trans, device, min_free,
10046 						   &dev_offset, NULL);
10047 			if (!ret)
10048 				dev_nr++;
10049 
10050 			if (dev_nr >= dev_min)
10051 				break;
10052 
10053 			ret = -1;
10054 		}
10055 	}
10056 	if (debug && ret == -1)
10057 		btrfs_warn(fs_info,
10058 			   "no space to allocate a new chunk for block group %llu",
10059 			   block_group->key.objectid);
10060 	mutex_unlock(&fs_info->chunk_mutex);
10061 	btrfs_end_transaction(trans);
10062 out:
10063 	btrfs_put_block_group(block_group);
10064 	return ret;
10065 }
10066 
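/*
 * Find the first BLOCK_GROUP_ITEM at or after @key and sanity check it
 * against its chunk mapping (the chunk must exist and match the block
 * group's start, length and type flags) before returning with the path
 * positioned on the item.
 */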
10067 static int find_first_block_group(struct btrfs_fs_info *fs_info,
10068 				  struct btrfs_path *path,
10069 				  struct btrfs_key *key)
10070 {
10071 	struct btrfs_root *root = fs_info->extent_root;
10072 	int ret = 0;
10073 	struct btrfs_key found_key;
10074 	struct extent_buffer *leaf;
10075 	struct btrfs_block_group_item bg;
10076 	u64 flags;
10077 	int slot;
10078 
10079 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
10080 	if (ret < 0)
10081 		goto out;
10082 
10083 	while (1) {
10084 		slot = path->slots[0];
10085 		leaf = path->nodes[0];
10086 		if (slot >= btrfs_header_nritems(leaf)) {
10087 			ret = btrfs_next_leaf(root, path);
10088 			if (ret == 0)
10089 				continue;
10090 			if (ret < 0)
10091 				goto out;
10092 			break;
10093 		}
10094 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
10095 
10096 		if (found_key.objectid >= key->objectid &&
10097 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
10098 			struct extent_map_tree *em_tree;
10099 			struct extent_map *em;
10100 
10101 			em_tree = &root->fs_info->mapping_tree.map_tree;
10102 			read_lock(&em_tree->lock);
10103 			em = lookup_extent_mapping(em_tree, found_key.objectid,
10104 						   found_key.offset);
10105 			read_unlock(&em_tree->lock);
10106 			if (!em) {
10107 				btrfs_err(fs_info,
10108 			"logical %llu len %llu found bg but no related chunk",
10109 					  found_key.objectid, found_key.offset);
10110 				ret = -ENOENT;
10111 			} else if (em->start != found_key.objectid ||
10112 				   em->len != found_key.offset) {
10113 				btrfs_err(fs_info,
10114 		"block group %llu len %llu mismatch with chunk %llu len %llu",
10115 					  found_key.objectid, found_key.offset,
10116 					  em->start, em->len);
10117 				ret = -EUCLEAN;
10118 			} else {
10119 				read_extent_buffer(leaf, &bg,
10120 					btrfs_item_ptr_offset(leaf, slot),
10121 					sizeof(bg));
10122 				flags = btrfs_block_group_flags(&bg) &
10123 					BTRFS_BLOCK_GROUP_TYPE_MASK;
10124 
10125 				if (flags != (em->map_lookup->type &
10126 					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10127 					btrfs_err(fs_info,
10128 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
10129 						found_key.objectid,
10130 						found_key.offset, flags,
10131 						(BTRFS_BLOCK_GROUP_TYPE_MASK &
10132 						 em->map_lookup->type));
10133 					ret = -EUCLEAN;
10134 				} else {
10135 					ret = 0;
10136 				}
10137 			}
10138 			free_extent_map(em);
10139 			goto out;
10140 		}
10141 		path->slots[0]++;
10142 	}
10143 out:
10144 	return ret;
10145 }
10146 
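/*
 * Release the long-lived free space cache inode reference (block_group->iref)
 * held by each block group, waiting for any in-flight caching to finish first.
 */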
10147 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
10148 {
10149 	struct btrfs_block_group_cache *block_group;
10150 	u64 last = 0;
10151 
10152 	while (1) {
10153 		struct inode *inode;
10154 
10155 		block_group = btrfs_lookup_first_block_group(info, last);
10156 		while (block_group) {
10157 			wait_block_group_cache_done(block_group);
10158 			spin_lock(&block_group->lock);
10159 			if (block_group->iref)
10160 				break;
10161 			spin_unlock(&block_group->lock);
10162 			block_group = next_block_group(info, block_group);
10163 		}
10164 		if (!block_group) {
10165 			if (last == 0)
10166 				break;
10167 			last = 0;
10168 			continue;
10169 		}
10170 
10171 		inode = block_group->inode;
10172 		block_group->iref = 0;
10173 		block_group->inode = NULL;
10174 		spin_unlock(&block_group->lock);
10175 		ASSERT(block_group->io_ctl.inode == NULL);
10176 		iput(inode);
10177 		last = block_group->key.objectid + block_group->key.offset;
10178 		btrfs_put_block_group(block_group);
10179 	}
10180 }
10181 
10182 /*
10183  * Must be called only after stopping all workers, since we could have block
10184  * group caching kthreads running, and therefore they could race with us if we
10185  * freed the block groups before stopping them.
10186  */
10187 int btrfs_free_block_groups(struct btrfs_fs_info *info)
10188 {
10189 	struct btrfs_block_group_cache *block_group;
10190 	struct btrfs_space_info *space_info;
10191 	struct btrfs_caching_control *caching_ctl;
10192 	struct rb_node *n;
10193 
10194 	down_write(&info->commit_root_sem);
10195 	while (!list_empty(&info->caching_block_groups)) {
10196 		caching_ctl = list_entry(info->caching_block_groups.next,
10197 					 struct btrfs_caching_control, list);
10198 		list_del(&caching_ctl->list);
10199 		put_caching_control(caching_ctl);
10200 	}
10201 	up_write(&info->commit_root_sem);
10202 
10203 	spin_lock(&info->unused_bgs_lock);
10204 	while (!list_empty(&info->unused_bgs)) {
10205 		block_group = list_first_entry(&info->unused_bgs,
10206 					       struct btrfs_block_group_cache,
10207 					       bg_list);
10208 		list_del_init(&block_group->bg_list);
10209 		btrfs_put_block_group(block_group);
10210 	}
10211 	spin_unlock(&info->unused_bgs_lock);
10212 
10213 	spin_lock(&info->block_group_cache_lock);
10214 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
10215 		block_group = rb_entry(n, struct btrfs_block_group_cache,
10216 				       cache_node);
10217 		rb_erase(&block_group->cache_node,
10218 			 &info->block_group_cache_tree);
10219 		RB_CLEAR_NODE(&block_group->cache_node);
10220 		spin_unlock(&info->block_group_cache_lock);
10221 
10222 		down_write(&block_group->space_info->groups_sem);
10223 		list_del(&block_group->list);
10224 		up_write(&block_group->space_info->groups_sem);
10225 
10226 		/*
10227 		 * We haven't cached this block group, which means we could
10228 		 * possibly have excluded extents on this block group.
10229 		 */
10230 		if (block_group->cached == BTRFS_CACHE_NO ||
10231 		    block_group->cached == BTRFS_CACHE_ERROR)
10232 			free_excluded_extents(block_group);
10233 
10234 		btrfs_remove_free_space_cache(block_group);
10235 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
10236 		ASSERT(list_empty(&block_group->dirty_list));
10237 		ASSERT(list_empty(&block_group->io_list));
10238 		ASSERT(list_empty(&block_group->bg_list));
10239 		ASSERT(atomic_read(&block_group->count) == 1);
10240 		btrfs_put_block_group(block_group);
10241 
10242 		spin_lock(&info->block_group_cache_lock);
10243 	}
10244 	spin_unlock(&info->block_group_cache_lock);
10245 
10246 	/* now that all the block groups are freed, go through and
10247 	 * free all the space_info structs.  This is only called during
10248 	 * the final stages of unmount, and so we know nobody is
10249 	 * using them.  We call synchronize_rcu() once before we start,
10250 	 * just to be on the safe side.
10251 	 */
10252 	synchronize_rcu();
10253 
10254 	release_global_block_rsv(info);
10255 
10256 	while (!list_empty(&info->space_info)) {
10257 		int i;
10258 
10259 		space_info = list_entry(info->space_info.next,
10260 					struct btrfs_space_info,
10261 					list);
10262 
10263 		/*
10264 		 * Do not hide this behind enospc_debug, this is actually
10265 		 * important and indicates a real bug if this happens.
10266 		 */
10267 		if (WARN_ON(space_info->bytes_pinned > 0 ||
10268 			    space_info->bytes_reserved > 0 ||
10269 			    space_info->bytes_may_use > 0))
10270 			dump_space_info(info, space_info, 0, 0);
10271 		list_del(&space_info->list);
10272 		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10273 			struct kobject *kobj;
10274 			kobj = space_info->block_group_kobjs[i];
10275 			space_info->block_group_kobjs[i] = NULL;
10276 			if (kobj) {
10277 				kobject_del(kobj);
10278 				kobject_put(kobj);
10279 			}
10280 		}
10281 		kobject_del(&space_info->kobj);
10282 		kobject_put(&space_info->kobj);
10283 	}
10284 	return 0;
10285 }
10286 
10287 /* link_block_group() queues up raid kobjects; this adds them once we're reclaim-safe */
10288 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10289 {
10290 	struct btrfs_space_info *space_info;
10291 	struct raid_kobject *rkobj;
10292 	LIST_HEAD(list);
10293 	int index;
10294 	int ret = 0;
10295 
10296 	spin_lock(&fs_info->pending_raid_kobjs_lock);
10297 	list_splice_init(&fs_info->pending_raid_kobjs, &list);
10298 	spin_unlock(&fs_info->pending_raid_kobjs_lock);
10299 
10300 	list_for_each_entry(rkobj, &list, list) {
10301 		space_info = __find_space_info(fs_info, rkobj->flags);
10302 		index = btrfs_bg_flags_to_raid_index(rkobj->flags);
10303 
10304 		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10305 				  "%s", get_raid_name(index));
10306 		if (ret) {
10307 			kobject_put(&rkobj->kobj);
10308 			break;
10309 		}
10310 	}
10311 	if (ret)
10312 		btrfs_warn(fs_info,
10313 			   "failed to add kobject for block cache, ignoring");
10314 }
10315 
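/*
 * Add @cache to its space_info's list for this raid index.  The first block
 * group of a given raid profile also queues a raid kobject, which
 * btrfs_add_raid_kobjects() registers later once it is safe to do so.
 */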
10316 static void link_block_group(struct btrfs_block_group_cache *cache)
10317 {
10318 	struct btrfs_space_info *space_info = cache->space_info;
10319 	struct btrfs_fs_info *fs_info = cache->fs_info;
10320 	int index = btrfs_bg_flags_to_raid_index(cache->flags);
10321 	bool first = false;
10322 
10323 	down_write(&space_info->groups_sem);
10324 	if (list_empty(&space_info->block_groups[index]))
10325 		first = true;
10326 	list_add_tail(&cache->list, &space_info->block_groups[index]);
10327 	up_write(&space_info->groups_sem);
10328 
10329 	if (first) {
10330 		struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10331 		if (!rkobj) {
10332 			btrfs_warn(cache->fs_info,
10333 				"couldn't alloc memory for raid level kobject");
10334 			return;
10335 		}
10336 		rkobj->flags = cache->flags;
10337 		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10338 
10339 		spin_lock(&fs_info->pending_raid_kobjs_lock);
10340 		list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10341 		spin_unlock(&fs_info->pending_raid_kobjs_lock);
10342 		space_info->block_group_kobjs[index] = &rkobj->kobj;
10343 	}
10344 }
10345 
10346 static struct btrfs_block_group_cache *
10347 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10348 			       u64 start, u64 size)
10349 {
10350 	struct btrfs_block_group_cache *cache;
10351 
10352 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
10353 	if (!cache)
10354 		return NULL;
10355 
10356 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10357 					GFP_NOFS);
10358 	if (!cache->free_space_ctl) {
10359 		kfree(cache);
10360 		return NULL;
10361 	}
10362 
10363 	cache->key.objectid = start;
10364 	cache->key.offset = size;
10365 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10366 
10367 	cache->fs_info = fs_info;
10368 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10369 	set_free_space_tree_thresholds(cache);
10370 
10371 	atomic_set(&cache->count, 1);
10372 	spin_lock_init(&cache->lock);
10373 	init_rwsem(&cache->data_rwsem);
10374 	INIT_LIST_HEAD(&cache->list);
10375 	INIT_LIST_HEAD(&cache->cluster_list);
10376 	INIT_LIST_HEAD(&cache->bg_list);
10377 	INIT_LIST_HEAD(&cache->ro_list);
10378 	INIT_LIST_HEAD(&cache->dirty_list);
10379 	INIT_LIST_HEAD(&cache->io_list);
10380 	btrfs_init_free_space_ctl(cache);
10381 	atomic_set(&cache->trimming, 0);
10382 	mutex_init(&cache->free_space_lock);
10383 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10384 
10385 	return cache;
10386 }
10387 
10388 
10389 /*
10390  * Iterate all chunks and verify that each of them has the corresponding block
10391  * group
10392  */
10393 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10394 {
10395 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
10396 	struct extent_map *em;
10397 	struct btrfs_block_group_cache *bg;
10398 	u64 start = 0;
10399 	int ret = 0;
10400 
10401 	while (1) {
10402 		read_lock(&map_tree->map_tree.lock);
10403 		/*
10404 		 * lookup_extent_mapping will return the first extent map
10405 		 * intersecting the range, so setting @len to 1 is enough to
10406 		 * get the first chunk.
10407 		 */
10408 		em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
10409 		read_unlock(&map_tree->map_tree.lock);
10410 		if (!em)
10411 			break;
10412 
10413 		bg = btrfs_lookup_block_group(fs_info, em->start);
10414 		if (!bg) {
10415 			btrfs_err(fs_info,
10416 	"chunk start=%llu len=%llu doesn't have corresponding block group",
10417 				     em->start, em->len);
10418 			ret = -EUCLEAN;
10419 			free_extent_map(em);
10420 			break;
10421 		}
10422 		if (bg->key.objectid != em->start ||
10423 		    bg->key.offset != em->len ||
10424 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10425 		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10426 			btrfs_err(fs_info,
10427 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10428 				em->start, em->len,
10429 				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10430 				bg->key.objectid, bg->key.offset,
10431 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10432 			ret = -EUCLEAN;
10433 			free_extent_map(em);
10434 			btrfs_put_block_group(bg);
10435 			break;
10436 		}
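		/* Advance past this chunk so the next lookup finds the following one. */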
10437 		start = em->start + em->len;
10438 		free_extent_map(em);
10439 		btrfs_put_block_group(bg);
10440 	}
10441 	return ret;
10442 }
10443 
10444 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10445 {
10446 	struct btrfs_path *path;
10447 	int ret;
10448 	struct btrfs_block_group_cache *cache;
10449 	struct btrfs_space_info *space_info;
10450 	struct btrfs_key key;
10451 	struct btrfs_key found_key;
10452 	struct extent_buffer *leaf;
10453 	int need_clear = 0;
10454 	u64 cache_gen;
10455 	u64 feature;
10456 	int mixed;
10457 
10458 	feature = btrfs_super_incompat_flags(info->super_copy);
10459 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10460 
10461 	key.objectid = 0;
10462 	key.offset = 0;
10463 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10464 	path = btrfs_alloc_path();
10465 	if (!path)
10466 		return -ENOMEM;
10467 	path->reada = READA_FORWARD;
10468 
10469 	cache_gen = btrfs_super_cache_generation(info->super_copy);
10470 	if (btrfs_test_opt(info, SPACE_CACHE) &&
10471 	    btrfs_super_generation(info->super_copy) != cache_gen)
10472 		need_clear = 1;
10473 	if (btrfs_test_opt(info, CLEAR_CACHE))
10474 		need_clear = 1;
10475 
10476 	while (1) {
10477 		ret = find_first_block_group(info, path, &key);
10478 		if (ret > 0)
10479 			break;
10480 		if (ret != 0)
10481 			goto error;
10482 
10483 		leaf = path->nodes[0];
10484 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10485 
10486 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
10487 						       found_key.offset);
10488 		if (!cache) {
10489 			ret = -ENOMEM;
10490 			goto error;
10491 		}
10492 
10493 		if (need_clear) {
10494 			/*
10495 			 * When we mount with an old space cache, we need to
10496 			 * set BTRFS_DC_CLEAR and set the dirty flag.
10497 			 *
10498 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10499 			 *    truncate the old free space cache inode and
10500 			 *    set up a new one.
10501 			 * b) Setting the dirty flag makes sure that we flush
10502 			 *    the new space cache info onto disk.
10503 			 */
10504 			if (btrfs_test_opt(info, SPACE_CACHE))
10505 				cache->disk_cache_state = BTRFS_DC_CLEAR;
10506 		}
10507 
10508 		read_extent_buffer(leaf, &cache->item,
10509 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
10510 				   sizeof(cache->item));
10511 		cache->flags = btrfs_block_group_flags(&cache->item);
10512 		if (!mixed &&
10513 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10514 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10515 			btrfs_err(info,
10516 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10517 				  cache->key.objectid);
10518 			ret = -EINVAL;
10519 			goto error;
10520 		}
10521 
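		/* Continue the block group item search just past this one. */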
10522 		key.objectid = found_key.objectid + found_key.offset;
10523 		btrfs_release_path(path);
10524 
10525 		/*
10526 		 * We need to exclude the super stripes now so that the space
10527 		 * info has super bytes accounted for; otherwise we'll think
10528 		 * we have more space than we actually do.
10529 		 */
10530 		ret = exclude_super_stripes(cache);
10531 		if (ret) {
10532 			/*
10533 			 * We may have excluded something, so call this just in
10534 			 * case.
10535 			 */
10536 			free_excluded_extents(cache);
10537 			btrfs_put_block_group(cache);
10538 			goto error;
10539 		}
10540 
10541 		/*
10542 		 * Check for two cases: either we are full, and therefore
10543 		 * don't need to bother with the caching work since we won't
10544 		 * find any space; or we are empty, and we can just add all
10545 		 * the space in and be done with it.  This saves us _a_lot_ of
10546 		 * time, particularly in the full case.
10547 		 */
10548 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10549 			cache->last_byte_to_unpin = (u64)-1;
10550 			cache->cached = BTRFS_CACHE_FINISHED;
10551 			free_excluded_extents(cache);
10552 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10553 			cache->last_byte_to_unpin = (u64)-1;
10554 			cache->cached = BTRFS_CACHE_FINISHED;
10555 			add_new_free_space(cache, found_key.objectid,
10556 					   found_key.objectid +
10557 					   found_key.offset);
10558 			free_excluded_extents(cache);
10559 		}
10560 
10561 		ret = btrfs_add_block_group_cache(info, cache);
10562 		if (ret) {
10563 			btrfs_remove_free_space_cache(cache);
10564 			btrfs_put_block_group(cache);
10565 			goto error;
10566 		}
10567 
10568 		trace_btrfs_add_block_group(info, cache, 0);
10569 		update_space_info(info, cache->flags, found_key.offset,
10570 				  btrfs_block_group_used(&cache->item),
10571 				  cache->bytes_super, &space_info);
10572 
10573 		cache->space_info = space_info;
10574 
10575 		link_block_group(cache);
10576 
10577 		set_avail_alloc_bits(info, cache->flags);
10578 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10579 			inc_block_group_ro(cache, 1);
10580 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10581 			ASSERT(list_empty(&cache->bg_list));
10582 			btrfs_mark_bg_unused(cache);
10583 		}
10584 	}
10585 
10586 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
10587 		if (!(get_alloc_profile(info, space_info->flags) &
10588 		      (BTRFS_BLOCK_GROUP_RAID10 |
10589 		       BTRFS_BLOCK_GROUP_RAID1 |
10590 		       BTRFS_BLOCK_GROUP_RAID5 |
10591 		       BTRFS_BLOCK_GROUP_RAID6 |
10592 		       BTRFS_BLOCK_GROUP_DUP)))
10593 			continue;
10594 		/*
10595 		 * Avoid allocating from un-mirrored block groups if there are
10596 		 * mirrored block groups.
10597 		 */
10598 		list_for_each_entry(cache,
10599 				&space_info->block_groups[BTRFS_RAID_RAID0],
10600 				list)
10601 			inc_block_group_ro(cache, 1);
10602 		list_for_each_entry(cache,
10603 				&space_info->block_groups[BTRFS_RAID_SINGLE],
10604 				list)
10605 			inc_block_group_ro(cache, 1);
10606 	}
10607 
10608 	btrfs_add_raid_kobjects(info);
10609 	init_global_block_rsv(info);
10610 	ret = check_chunk_block_group_mappings(info);
10611 error:
10612 	btrfs_free_path(path);
10613 	return ret;
10614 }
10615 
10616 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10617 {
10618 	struct btrfs_fs_info *fs_info = trans->fs_info;
10619 	struct btrfs_block_group_cache *block_group;
10620 	struct btrfs_root *extent_root = fs_info->extent_root;
10621 	struct btrfs_block_group_item item;
10622 	struct btrfs_key key;
10623 	int ret = 0;
10624 
10625 	if (!trans->can_flush_pending_bgs)
10626 		return;
10627 
10628 	while (!list_empty(&trans->new_bgs)) {
10629 		block_group = list_first_entry(&trans->new_bgs,
10630 					       struct btrfs_block_group_cache,
10631 					       bg_list);
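		/*
		 * If a previous iteration failed (the transaction was already
		 * aborted there), still dequeue the remaining block groups
		 * below so their delayed refs reservation is released.
		 */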
10632 		if (ret)
10633 			goto next;
10634 
10635 		spin_lock(&block_group->lock);
10636 		memcpy(&item, &block_group->item, sizeof(item));
10637 		memcpy(&key, &block_group->key, sizeof(key));
10638 		spin_unlock(&block_group->lock);
10639 
10640 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10641 					sizeof(item));
10642 		if (ret)
10643 			btrfs_abort_transaction(trans, ret);
10644 		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10645 		if (ret)
10646 			btrfs_abort_transaction(trans, ret);
10647 		add_block_group_free_space(trans, block_group);
10648 		/* already aborted the transaction if it failed. */
10649 next:
10650 		btrfs_delayed_refs_rsv_release(fs_info, 1);
10651 		list_del_init(&block_group->bg_list);
10652 	}
10653 	btrfs_trans_release_chunk_metadata(trans);
10654 }
10655 
10656 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10657 			   u64 type, u64 chunk_offset, u64 size)
10658 {
10659 	struct btrfs_fs_info *fs_info = trans->fs_info;
10660 	struct btrfs_block_group_cache *cache;
10661 	int ret;
10662 
10663 	btrfs_set_log_full_commit(fs_info, trans);
10664 
10665 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10666 	if (!cache)
10667 		return -ENOMEM;
10668 
10669 	btrfs_set_block_group_used(&cache->item, bytes_used);
10670 	btrfs_set_block_group_chunk_objectid(&cache->item,
10671 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10672 	btrfs_set_block_group_flags(&cache->item, type);
10673 
10674 	cache->flags = type;
10675 	cache->last_byte_to_unpin = (u64)-1;
10676 	cache->cached = BTRFS_CACHE_FINISHED;
10677 	cache->needs_free_space = 1;
10678 	ret = exclude_super_stripes(cache);
10679 	if (ret) {
10680 		/*
10681 		 * We may have excluded something, so call this just in
10682 		 * case.
10683 		 */
10684 		free_excluded_extents(cache);
10685 		btrfs_put_block_group(cache);
10686 		return ret;
10687 	}
10688 
10689 	add_new_free_space(cache, chunk_offset, chunk_offset + size);
10690 
10691 	free_excluded_extents(cache);
10692 
10693 #ifdef CONFIG_BTRFS_DEBUG
10694 	if (btrfs_should_fragment_free_space(cache)) {
10695 		u64 new_bytes_used = size - bytes_used;
10696 
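		/*
		 * Fragmenting makes roughly half of the remaining free space
		 * unusable, so account for that in bytes_used (see
		 * fragment_free_space()).
		 */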
10697 		bytes_used += new_bytes_used >> 1;
10698 		fragment_free_space(cache);
10699 	}
10700 #endif
10701 	/*
10702 	 * Ensure the corresponding space_info object is created and
10703 	 * assigned to our block group. We want our bg to be added to the rbtree
10704 	 * with its ->space_info set.
10705 	 */
10706 	cache->space_info = __find_space_info(fs_info, cache->flags);
10707 	ASSERT(cache->space_info);
10708 
10709 	ret = btrfs_add_block_group_cache(fs_info, cache);
10710 	if (ret) {
10711 		btrfs_remove_free_space_cache(cache);
10712 		btrfs_put_block_group(cache);
10713 		return ret;
10714 	}
10715 
10716 	/*
10717 	 * Now that our block group has its ->space_info set and is inserted in
10718 	 * the rbtree, update the space info's counters.
10719 	 */
10720 	trace_btrfs_add_block_group(fs_info, cache, 1);
10721 	update_space_info(fs_info, cache->flags, size, bytes_used,
10722 				cache->bytes_super, &cache->space_info);
10723 	update_global_block_rsv(fs_info);
10724 
10725 	link_block_group(cache);
10726 
10727 	list_add_tail(&cache->bg_list, &trans->new_bgs);
10728 	trans->delayed_ref_updates++;
10729 	btrfs_update_delayed_refs_rsv(trans);
10730 
10731 	set_avail_alloc_bits(fs_info, type);
10732 	return 0;
10733 }
10734 
10735 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10736 {
10737 	u64 extra_flags = chunk_to_extended(flags) &
10738 				BTRFS_EXTENDED_PROFILE_MASK;
10739 
10740 	write_seqlock(&fs_info->profiles_lock);
10741 	if (flags & BTRFS_BLOCK_GROUP_DATA)
10742 		fs_info->avail_data_alloc_bits &= ~extra_flags;
10743 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10744 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10745 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10746 		fs_info->avail_system_alloc_bits &= ~extra_flags;
10747 	write_sequnlock(&fs_info->profiles_lock);
10748 }
10749 
10750 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10751 			     u64 group_start, struct extent_map *em)
10752 {
10753 	struct btrfs_fs_info *fs_info = trans->fs_info;
10754 	struct btrfs_root *root = fs_info->extent_root;
10755 	struct btrfs_path *path;
10756 	struct btrfs_block_group_cache *block_group;
10757 	struct btrfs_free_cluster *cluster;
10758 	struct btrfs_root *tree_root = fs_info->tree_root;
10759 	struct btrfs_key key;
10760 	struct inode *inode;
10761 	struct kobject *kobj = NULL;
10762 	int ret;
10763 	int index;
10764 	int factor;
10765 	struct btrfs_caching_control *caching_ctl = NULL;
10766 	bool remove_em;
10767 	bool remove_rsv = false;
10768 
10769 	block_group = btrfs_lookup_block_group(fs_info, group_start);
10770 	BUG_ON(!block_group);
10771 	BUG_ON(!block_group->ro);
10772 
10773 	trace_btrfs_remove_block_group(block_group);
10774 	/*
10775 	 * Free the reserved super bytes from this block group before
10776 	 * removing it.
10777 	 */
10778 	free_excluded_extents(block_group);
10779 	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10780 				  block_group->key.offset);
10781 
10782 	memcpy(&key, &block_group->key, sizeof(key));
10783 	index = btrfs_bg_flags_to_raid_index(block_group->flags);
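	/* Replication factor (e.g. 2 for DUP/RAID1), used for the disk_total accounting below. */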
10784 	factor = btrfs_bg_type_to_factor(block_group->flags);
10785 
10786 	/* make sure this block group isn't part of an allocation cluster */
10787 	cluster = &fs_info->data_alloc_cluster;
10788 	spin_lock(&cluster->refill_lock);
10789 	btrfs_return_cluster_to_free_space(block_group, cluster);
10790 	spin_unlock(&cluster->refill_lock);
10791 
10792 	/*
10793 	 * make sure this block group isn't part of a metadata
10794 	 * allocation cluster
10795 	 */
10796 	cluster = &fs_info->meta_alloc_cluster;
10797 	spin_lock(&cluster->refill_lock);
10798 	btrfs_return_cluster_to_free_space(block_group, cluster);
10799 	spin_unlock(&cluster->refill_lock);
10800 
10801 	path = btrfs_alloc_path();
10802 	if (!path) {
10803 		ret = -ENOMEM;
10804 		goto out;
10805 	}
10806 
10807 	/*
10808 	 * get the inode first so any iput calls done for the io_list
10809 	 * aren't the final iput (no unlinks allowed now)
10810 	 */
10811 	inode = lookup_free_space_inode(fs_info, block_group, path);
10812 
10813 	mutex_lock(&trans->transaction->cache_write_mutex);
10814 	/*
10815 	 * Make sure our free space cache IO is done before removing the
10816 	 * free space inode
10817 	 */
10818 	spin_lock(&trans->transaction->dirty_bgs_lock);
10819 	if (!list_empty(&block_group->io_list)) {
10820 		list_del_init(&block_group->io_list);
10821 
10822 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10823 
10824 		spin_unlock(&trans->transaction->dirty_bgs_lock);
10825 		btrfs_wait_cache_io(trans, block_group, path);
10826 		btrfs_put_block_group(block_group);
10827 		spin_lock(&trans->transaction->dirty_bgs_lock);
10828 	}
10829 
10830 	if (!list_empty(&block_group->dirty_list)) {
10831 		list_del_init(&block_group->dirty_list);
10832 		remove_rsv = true;
10833 		btrfs_put_block_group(block_group);
10834 	}
10835 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10836 	mutex_unlock(&trans->transaction->cache_write_mutex);
10837 
10838 	if (!IS_ERR(inode)) {
10839 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10840 		if (ret) {
10841 			btrfs_add_delayed_iput(inode);
10842 			goto out;
10843 		}
10844 		clear_nlink(inode);
10845 		/* One for the block group's ref */
10846 		spin_lock(&block_group->lock);
10847 		if (block_group->iref) {
10848 			block_group->iref = 0;
10849 			block_group->inode = NULL;
10850 			spin_unlock(&block_group->lock);
10851 			iput(inode);
10852 		} else {
10853 			spin_unlock(&block_group->lock);
10854 		}
10855 		/* One for our lookup ref */
10856 		btrfs_add_delayed_iput(inode);
10857 	}
10858 
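	/* Remove the on-disk free space cache item for this block group from the tree of tree roots, if present. */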
10859 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10860 	key.offset = block_group->key.objectid;
10861 	key.type = 0;
10862 
10863 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10864 	if (ret < 0)
10865 		goto out;
10866 	if (ret > 0)
10867 		btrfs_release_path(path);
10868 	if (ret == 0) {
10869 		ret = btrfs_del_item(trans, tree_root, path);
10870 		if (ret)
10871 			goto out;
10872 		btrfs_release_path(path);
10873 	}
10874 
10875 	spin_lock(&fs_info->block_group_cache_lock);
10876 	rb_erase(&block_group->cache_node,
10877 		 &fs_info->block_group_cache_tree);
10878 	RB_CLEAR_NODE(&block_group->cache_node);
10879 
10880 	if (fs_info->first_logical_byte == block_group->key.objectid)
10881 		fs_info->first_logical_byte = (u64)-1;
10882 	spin_unlock(&fs_info->block_group_cache_lock);
10883 
10884 	down_write(&block_group->space_info->groups_sem);
10885 	/*
10886 	 * We must use list_del_init so anyone can check whether the block
10887 	 * group is still on the list after taking the semaphore.
10888 	 */
10889 	list_del_init(&block_group->list);
10890 	if (list_empty(&block_group->space_info->block_groups[index])) {
10891 		kobj = block_group->space_info->block_group_kobjs[index];
10892 		block_group->space_info->block_group_kobjs[index] = NULL;
10893 		clear_avail_alloc_bits(fs_info, block_group->flags);
10894 	}
10895 	up_write(&block_group->space_info->groups_sem);
10896 	if (kobj) {
10897 		kobject_del(kobj);
10898 		kobject_put(kobj);
10899 	}
10900 
10901 	if (block_group->has_caching_ctl)
10902 		caching_ctl = get_caching_control(block_group);
10903 	if (block_group->cached == BTRFS_CACHE_STARTED)
10904 		wait_block_group_cache_done(block_group);
10905 	if (block_group->has_caching_ctl) {
10906 		down_write(&fs_info->commit_root_sem);
10907 		if (!caching_ctl) {
10908 			struct btrfs_caching_control *ctl;
10909 
10910 			list_for_each_entry(ctl,
10911 				    &fs_info->caching_block_groups, list)
10912 				if (ctl->block_group == block_group) {
10913 					caching_ctl = ctl;
10914 					refcount_inc(&caching_ctl->count);
10915 					break;
10916 				}
10917 		}
10918 		if (caching_ctl)
10919 			list_del_init(&caching_ctl->list);
10920 		up_write(&fs_info->commit_root_sem);
10921 		if (caching_ctl) {
10922 			/* Once for the caching bgs list and once for us. */
10923 			put_caching_control(caching_ctl);
10924 			put_caching_control(caching_ctl);
10925 		}
10926 	}
10927 
10928 	spin_lock(&trans->transaction->dirty_bgs_lock);
10929 	WARN_ON(!list_empty(&block_group->dirty_list));
10930 	WARN_ON(!list_empty(&block_group->io_list));
10931 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10932 
10933 	btrfs_remove_free_space_cache(block_group);
10934 
10935 	spin_lock(&block_group->space_info->lock);
10936 	list_del_init(&block_group->ro_list);
10937 
10938 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10939 		WARN_ON(block_group->space_info->total_bytes
10940 			< block_group->key.offset);
10941 		WARN_ON(block_group->space_info->bytes_readonly
10942 			< block_group->key.offset);
10943 		WARN_ON(block_group->space_info->disk_total
10944 			< block_group->key.offset * factor);
10945 	}
10946 	block_group->space_info->total_bytes -= block_group->key.offset;
10947 	block_group->space_info->bytes_readonly -= block_group->key.offset;
10948 	block_group->space_info->disk_total -= block_group->key.offset * factor;
10949 
10950 	spin_unlock(&block_group->space_info->lock);
10951 
10952 	memcpy(&key, &block_group->key, sizeof(key));
10953 
10954 	mutex_lock(&fs_info->chunk_mutex);
10955 	if (!list_empty(&em->list)) {
10956 		/* We're in the transaction->pending_chunks list. */
10957 		free_extent_map(em);
10958 	}
10959 	spin_lock(&block_group->lock);
10960 	block_group->removed = 1;
10961 	/*
10962 	 * At this point trimming can't start on this block group, because we
10963 	 * removed the block group from the fs_info->block_group_cache_tree, so
10964 	 * no one can find it anymore, and even if someone already got this
10965 	 * block group before we removed it from the rbtree, they have already
10966 	 * incremented block_group->trimming - if they didn't, they won't find
10967 	 * any free space entries because we already removed them all when we
10968 	 * called btrfs_remove_free_space_cache().
10969 	 *
10970 	 * And we must not remove the extent map from the fs_info->mapping_tree
10971 	 * to prevent the same logical address range and physical device space
10972 	 * ranges from being reused for a new block group. This is because our
10973 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10974 	 * completely transactionless, so while it is trimming a range the
10975 	 * currently running transaction might finish and a new one start,
10976 	 * allowing for new block groups to be created that can reuse the same
10977 	 * physical device locations unless we take this special care.
10978 	 *
10979 	 * There may also be an implicit trim operation if the file system
10980 	 * is mounted with -odiscard. The same protections must remain
10981 	 * in place until the extents have been discarded completely when
10982 	 * the transaction commit has completed.
10983 	 */
10984 	remove_em = (atomic_read(&block_group->trimming) == 0);
10985 	/*
10986 	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10987 	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10988 	 * before checking block_group->removed).
10989 	 */
10990 	if (!remove_em) {
10991 		/*
10992 		 * Our em might be in trans->transaction->pending_chunks which
10993 		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10994 		 * and so is the fs_info->pinned_chunks list.
10995 		 *
10996 		 * So at this point we must be holding the chunk_mutex to avoid
10997 		 * any races with chunk allocation (more specifically at
10998 		 * volumes.c:contains_pending_extent()), to ensure it always
10999 		 * sees the em, either in the pending_chunks list or in the
11000 		 * pinned_chunks list.
11001 		 */
11002 		list_move_tail(&em->list, &fs_info->pinned_chunks);
11003 	}
11004 	spin_unlock(&block_group->lock);
11005 
11006 	if (remove_em) {
11007 		struct extent_map_tree *em_tree;
11008 
11009 		em_tree = &fs_info->mapping_tree.map_tree;
11010 		write_lock(&em_tree->lock);
11011 		/*
11012 		 * The em might be in the pending_chunks list, so make sure the
11013 		 * chunk mutex is locked, since remove_extent_mapping() will
11014 		 * delete us from that list.
11015 		 */
11016 		remove_extent_mapping(em_tree, em);
11017 		write_unlock(&em_tree->lock);
11018 		/* once for the tree */
11019 		free_extent_map(em);
11020 	}
11021 
11022 	mutex_unlock(&fs_info->chunk_mutex);
11023 
11024 	ret = remove_block_group_free_space(trans, block_group);
11025 	if (ret)
11026 		goto out;
11027 
11028 	btrfs_put_block_group(block_group);
11029 	btrfs_put_block_group(block_group);
11030 
11031 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
11032 	if (ret > 0)
11033 		ret = -EIO;
11034 	if (ret < 0)
11035 		goto out;
11036 
11037 	ret = btrfs_del_item(trans, root, path);
11038 out:
11039 	if (remove_rsv)
11040 		btrfs_delayed_refs_rsv_release(fs_info, 1);
11041 	btrfs_free_path(path);
11042 	return ret;
11043 }
11044 
11045 struct btrfs_trans_handle *
11046 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
11047 				     const u64 chunk_offset)
11048 {
11049 	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
11050 	struct extent_map *em;
11051 	struct map_lookup *map;
11052 	unsigned int num_items;
11053 
11054 	read_lock(&em_tree->lock);
11055 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
11056 	read_unlock(&em_tree->lock);
11057 	ASSERT(em && em->start == chunk_offset);
11058 
11059 	/*
11060 	 * We need to reserve 3 + N units from the metadata space info in order
11061 	 * to remove a block group (done at btrfs_remove_chunk() and at
11062 	 * btrfs_remove_block_group()), which are used for:
11063 	 *
11064 	 * 1 unit for adding the free space inode's orphan (located in the tree
11065 	 * of tree roots).
11066 	 * 1 unit for deleting the block group item (located in the extent
11067 	 * tree).
11068 	 * 1 unit for deleting the free space item (located in tree of tree
11069 	 * roots).
11070 	 * N units for deleting N device extent items corresponding to each
11071 	 * stripe (located in the device tree).
11072 	 *
11073 	 * In order to remove a block group we also need to reserve units in the
11074 	 * system space info in order to update the chunk tree (update one or
11075 	 * more device items and remove one chunk item), but this is done at
11076 	 * btrfs_remove_chunk() through a call to check_system_chunk().
11077 	 */
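	/*
	 * For example (illustrative numbers only): a block group whose chunk
	 * is striped over two devices needs 3 + 2 = 5 metadata units
	 * reserved here.
	 */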
11078 	map = em->map_lookup;
11079 	num_items = 3 + map->num_stripes;
11080 	free_extent_map(em);
11081 
11082 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
11083 							   num_items, 1);
11084 }
11085 
11086 /*
11087  * Process the unused_bgs list and remove any that don't have any allocated
11088  * space inside of them.
11089  */
11090 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
11091 {
11092 	struct btrfs_block_group_cache *block_group;
11093 	struct btrfs_space_info *space_info;
11094 	struct btrfs_trans_handle *trans;
11095 	int ret = 0;
11096 
11097 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
11098 		return;
11099 
11100 	spin_lock(&fs_info->unused_bgs_lock);
11101 	while (!list_empty(&fs_info->unused_bgs)) {
11102 		u64 start, end;
11103 		int trimming;
11104 
11105 		block_group = list_first_entry(&fs_info->unused_bgs,
11106 					       struct btrfs_block_group_cache,
11107 					       bg_list);
11108 		list_del_init(&block_group->bg_list);
11109 
11110 		space_info = block_group->space_info;
11111 
11112 		if (ret || btrfs_mixed_space_info(space_info)) {
11113 			btrfs_put_block_group(block_group);
11114 			continue;
11115 		}
11116 		spin_unlock(&fs_info->unused_bgs_lock);
11117 
11118 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
11119 
11120 		/* Don't want to race with allocators so take the groups_sem */
11121 		down_write(&space_info->groups_sem);
11122 		spin_lock(&block_group->lock);
11123 		if (block_group->reserved || block_group->pinned ||
11124 		    btrfs_block_group_used(&block_group->item) ||
11125 		    block_group->ro ||
11126 		    list_is_singular(&block_group->list)) {
11127 			/*
11128 			 * We want to bail if we made new allocations or have
11129 			 * outstanding allocations in this block group.  We do
11130 			 * the ro check in case balance is currently acting on
11131 			 * this block group.
11132 			 */
11133 			trace_btrfs_skip_unused_block_group(block_group);
11134 			spin_unlock(&block_group->lock);
11135 			up_write(&space_info->groups_sem);
11136 			goto next;
11137 		}
11138 		spin_unlock(&block_group->lock);
11139 
11140 		/* We don't want to force the issue, only flip if it's ok. */
11141 		ret = inc_block_group_ro(block_group, 0);
11142 		up_write(&space_info->groups_sem);
11143 		if (ret < 0) {
11144 			ret = 0;
11145 			goto next;
11146 		}
11147 
11148 		/*
11149 		 * We want to do this before we do anything else so we can recover
11150 		 * properly if we fail to join the transaction.
11151 		 */
11152 		trans = btrfs_start_trans_remove_block_group(fs_info,
11153 						     block_group->key.objectid);
11154 		if (IS_ERR(trans)) {
11155 			btrfs_dec_block_group_ro(block_group);
11156 			ret = PTR_ERR(trans);
11157 			goto next;
11158 		}
11159 
11160 		/*
11161 		 * We could have pending pinned extents for this block group,
11162 		 * just delete them, we don't care about them anymore.
11163 		 */
11164 		start = block_group->key.objectid;
11165 		end = start + block_group->key.offset - 1;
11166 		/*
11167 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
11168 		 * btrfs_finish_extent_commit(). If we are at transaction N,
11169 		 * another task might be running finish_extent_commit() for the
11170 		 * previous transaction N - 1, and have seen a range belonging
11171 		 * to the block group in freed_extents[] before we were able to
11172 		 * clear the whole block group range from freed_extents[]. This
11173 		 * means that task could look up the block group after we
11174 		 * unpinned it from freed_extents[] and removed it, leading to
11175 		 * a BUG_ON() at btrfs_unpin_extent_range().
11176 		 */
11177 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
11178 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
11179 				  EXTENT_DIRTY);
11180 		if (ret) {
11181 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11182 			btrfs_dec_block_group_ro(block_group);
11183 			goto end_trans;
11184 		}
11185 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
11186 				  EXTENT_DIRTY);
11187 		if (ret) {
11188 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11189 			btrfs_dec_block_group_ro(block_group);
11190 			goto end_trans;
11191 		}
11192 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11193 
11194 		/* Reset pinned so btrfs_put_block_group doesn't complain */
11195 		spin_lock(&space_info->lock);
11196 		spin_lock(&block_group->lock);
11197 
11198 		update_bytes_pinned(space_info, -block_group->pinned);
11199 		space_info->bytes_readonly += block_group->pinned;
11200 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
11201 				   -block_group->pinned,
11202 				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
11203 		block_group->pinned = 0;
11204 
11205 		spin_unlock(&block_group->lock);
11206 		spin_unlock(&space_info->lock);
11207 
11208 		/* DISCARD can flip during remount */
11209 		trimming = btrfs_test_opt(fs_info, DISCARD);
11210 
11211 		/* Implicit trim during transaction commit. */
11212 		if (trimming)
11213 			btrfs_get_block_group_trimming(block_group);
11214 
11215 		/*
11216 		 * btrfs_remove_chunk() will abort the transaction if things go
11217 		 * horribly wrong.
11218 		 */
11219 		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
11220 
11221 		if (ret) {
11222 			if (trimming)
11223 				btrfs_put_block_group_trimming(block_group);
11224 			goto end_trans;
11225 		}
11226 
11227 		/*
11228 		 * If we're not mounted with -odiscard, we can just forget
11229 		 * about this block group. Otherwise we'll need to wait
11230 		 * until transaction commit to do the actual discard.
11231 		 */
11232 		if (trimming) {
11233 			spin_lock(&fs_info->unused_bgs_lock);
11234 			/*
11235 			 * A concurrent scrub might have added us to the list
11236 			 * fs_info->unused_bgs, so use a list_move operation
11237 			 * to add the block group to the deleted_bgs list.
11238 			 */
11239 			list_move(&block_group->bg_list,
11240 				  &trans->transaction->deleted_bgs);
11241 			spin_unlock(&fs_info->unused_bgs_lock);
11242 			btrfs_get_block_group(block_group);
11243 		}
11244 end_trans:
11245 		btrfs_end_transaction(trans);
11246 next:
11247 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
11248 		btrfs_put_block_group(block_group);
11249 		spin_lock(&fs_info->unused_bgs_lock);
11250 	}
11251 	spin_unlock(&fs_info->unused_bgs_lock);
11252 }
11253 
11254 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11255 {
11256 	struct btrfs_super_block *disk_super;
11257 	u64 features;
11258 	u64 flags;
11259 	int mixed = 0;
11260 	int ret;
11261 
11262 	disk_super = fs_info->super_copy;
11263 	if (!btrfs_super_root(disk_super))
11264 		return -EINVAL;
11265 
11266 	features = btrfs_super_incompat_flags(disk_super);
11267 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11268 		mixed = 1;
11269 
11270 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
11271 	ret = create_space_info(fs_info, flags);
11272 	if (ret)
11273 		goto out;
11274 
11275 	if (mixed) {
11276 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11277 		ret = create_space_info(fs_info, flags);
11278 	} else {
11279 		flags = BTRFS_BLOCK_GROUP_METADATA;
11280 		ret = create_space_info(fs_info, flags);
11281 		if (ret)
11282 			goto out;
11283 
11284 		flags = BTRFS_BLOCK_GROUP_DATA;
11285 		ret = create_space_info(fs_info, flags);
11286 	}
11287 out:
11288 	return ret;
11289 }
11290 
11291 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
11292 				   u64 start, u64 end)
11293 {
11294 	return unpin_extent_range(fs_info, start, end, false);
11295 }
11296 
11297 /*
11298  * It used to be that old block groups would be left around forever.
11299  * Iterating over them would be enough to trim unused space.  Since we
11300  * now automatically remove them, we also need to iterate over unallocated
11301  * space.
11302  *
11303  * We don't want a transaction for this since the discard may take a
11304  * substantial amount of time.  We don't require that a transaction be
11305  * running, but we do need to take a running transaction into account
11306  * to ensure that we're not discarding chunks that were released or
11307  * allocated in the current transaction.
11308  *
11309  * Holding the chunks lock will prevent other threads from allocating
11310  * or releasing chunks, but it won't prevent a running transaction
11311  * from committing and releasing the memory that the pending chunks
11312  * list head uses.  For that, we need to take a reference to the
11313  * transaction and hold the commit root sem.  We only need to hold
11314  * it while performing the free space search since we have already
11315  * held back allocations.
11316  */
11317 static int btrfs_trim_free_extents(struct btrfs_device *device,
11318 				   u64 minlen, u64 *trimmed)
11319 {
11320 	u64 start = 0, len = 0;
11321 	int ret;
11322 
11323 	*trimmed = 0;
11324 
11325 	/* Discard not supported = nothing to do. */
11326 	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
11327 		return 0;
11328 
11329 	/* Not writable = nothing to do. */
11330 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
11331 		return 0;
11332 
11333 	/* No free space = nothing to do. */
11334 	if (device->total_bytes <= device->bytes_used)
11335 		return 0;
11336 
11337 	ret = 0;
11338 
11339 	while (1) {
11340 		struct btrfs_fs_info *fs_info = device->fs_info;
11341 		struct btrfs_transaction *trans;
11342 		u64 bytes;
11343 
11344 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11345 		if (ret)
11346 			break;
11347 
11348 		ret = down_read_killable(&fs_info->commit_root_sem);
11349 		if (ret) {
11350 			mutex_unlock(&fs_info->chunk_mutex);
11351 			break;
11352 		}
11353 
11354 		spin_lock(&fs_info->trans_lock);
11355 		trans = fs_info->running_transaction;
11356 		if (trans)
11357 			refcount_inc(&trans->use_count);
11358 		spin_unlock(&fs_info->trans_lock);
11359 
11360 		if (!trans)
11361 			up_read(&fs_info->commit_root_sem);
11362 
11363 		ret = find_free_dev_extent_start(trans, device, minlen, start,
11364 						 &start, &len);
11365 		if (trans) {
11366 			up_read(&fs_info->commit_root_sem);
11367 			btrfs_put_transaction(trans);
11368 		}
11369 
11370 		if (ret) {
11371 			mutex_unlock(&fs_info->chunk_mutex);
11372 			if (ret == -ENOSPC)
11373 				ret = 0;
11374 			break;
11375 		}
11376 
11377 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11378 		mutex_unlock(&fs_info->chunk_mutex);
11379 
11380 		if (ret)
11381 			break;
11382 
11383 		start += len;
11384 		*trimmed += bytes;
11385 
11386 		if (fatal_signal_pending(current)) {
11387 			ret = -ERESTARTSYS;
11388 			break;
11389 		}
11390 
11391 		cond_resched();
11392 	}
11393 
11394 	return ret;
11395 }
11396 
11397 /*
11398  * Trim the whole filesystem by:
11399  * 1) trimming the free space in each block group
11400  * 2) trimming the unallocated space on each device
11401  *
11402  * This will also continue trimming even if a block group or device encounters
11403  * an error.  The return value will be the last error, or 0 if nothing bad
11404  * happens.
11405  */
11406 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11407 {
11408 	struct btrfs_block_group_cache *cache = NULL;
11409 	struct btrfs_device *device;
11410 	struct list_head *devices;
11411 	u64 group_trimmed;
11412 	u64 start;
11413 	u64 end;
11414 	u64 trimmed = 0;
11415 	u64 bg_failed = 0;
11416 	u64 dev_failed = 0;
11417 	int bg_ret = 0;
11418 	int dev_ret = 0;
11419 	int ret = 0;
11420 
11421 	cache = btrfs_lookup_first_block_group(fs_info, range->start);
11422 	for (; cache; cache = next_block_group(fs_info, cache)) {
11423 		if (cache->key.objectid >= (range->start + range->len)) {
11424 			btrfs_put_block_group(cache);
11425 			break;
11426 		}
11427 
11428 		start = max(range->start, cache->key.objectid);
11429 		end = min(range->start + range->len,
11430 				cache->key.objectid + cache->key.offset);
11431 
11432 		if (end - start >= range->minlen) {
11433 			if (!block_group_cache_done(cache)) {
11434 				ret = cache_block_group(cache, 0);
11435 				if (ret) {
11436 					bg_failed++;
11437 					bg_ret = ret;
11438 					continue;
11439 				}
11440 				ret = wait_block_group_cache_done(cache);
11441 				if (ret) {
11442 					bg_failed++;
11443 					bg_ret = ret;
11444 					continue;
11445 				}
11446 			}
11447 			ret = btrfs_trim_block_group(cache,
11448 						     &group_trimmed,
11449 						     start,
11450 						     end,
11451 						     range->minlen);
11452 
11453 			trimmed += group_trimmed;
11454 			if (ret) {
11455 				bg_failed++;
11456 				bg_ret = ret;
11457 				continue;
11458 			}
11459 		}
11460 	}
11461 
11462 	if (bg_failed)
11463 		btrfs_warn(fs_info,
11464 			"failed to trim %llu block group(s), last error %d",
11465 			bg_failed, bg_ret);
11466 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
11467 	devices = &fs_info->fs_devices->devices;
11468 	list_for_each_entry(device, devices, dev_list) {
11469 		ret = btrfs_trim_free_extents(device, range->minlen,
11470 					      &group_trimmed);
11471 		if (ret) {
11472 			dev_failed++;
11473 			dev_ret = ret;
11474 			break;
11475 		}
11476 
11477 		trimmed += group_trimmed;
11478 	}
11479 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11480 
11481 	if (dev_failed)
11482 		btrfs_warn(fs_info,
11483 			"failed to trim %llu device(s), last error %d",
11484 			dev_failed, dev_ret);
11485 	range->len = trimmed;
11486 	if (bg_ret)
11487 		return bg_ret;
11488 	return dev_ret;
11489 }
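
/*
 * Illustrative user-space caller (a sketch, not part of this file): this
 * function is reached via the FITRIM ioctl, roughly:
 *
 *	struct fstrim_range range = {
 *		.start = 0,
 *		.len = ULLONG_MAX,
 *		.minlen = 0,
 *	};
 *	ioctl(fd, FITRIM, &range);   // fd refers to the mounted filesystem
 *
 * On return, range.len holds the number of bytes trimmed (see the
 * assignment to range->len above).
 */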
11490 
11491 /*
11492  * btrfs_{start,end}_write_no_snapshotting() are similar to
11493  * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
11494  * data into the page cache through nocow before the subvolume is snapshotted
11495  * but flushing the data to disk only after the snapshot creation, or to prevent
11496  * operations while snapshotting is ongoing that would cause the snapshot to be
11497  * inconsistent (writes followed by expanding truncates, for example).
11498  */
11499 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11500 {
11501 	percpu_counter_dec(&root->subv_writers->counter);
11502 	cond_wake_up(&root->subv_writers->wait);
11503 }
11504 
11505 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11506 {
11507 	if (atomic_read(&root->will_be_snapshotted))
11508 		return 0;
11509 
11510 	percpu_counter_inc(&root->subv_writers->counter);
11511 	/*
11512 	 * Make sure the counter is updated before we check for snapshot creation.
11513 	 */
11514 	smp_mb();
11515 	if (atomic_read(&root->will_be_snapshotted)) {
11516 		btrfs_end_write_no_snapshotting(root);
11517 		return 0;
11518 	}
11519 	return 1;
11520 }
11521 
11522 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11523 {
11524 	while (true) {
11525 		int ret;
11526 
11527 		ret = btrfs_start_write_no_snapshotting(root);
11528 		if (ret)
11529 			break;
11530 		wait_var_event(&root->will_be_snapshotted,
11531 			       !atomic_read(&root->will_be_snapshotted));
11532 	}
11533 }
11534 
11535 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11536 {
11537 	struct btrfs_fs_info *fs_info = bg->fs_info;
11538 
11539 	spin_lock(&fs_info->unused_bgs_lock);
11540 	if (list_empty(&bg->bg_list)) {
11541 		btrfs_get_block_group(bg);
11542 		trace_btrfs_add_unused_block_group(bg);
11543 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11544 	}
11545 	spin_unlock(&fs_info->unused_bgs_lock);
11546 }
11547