/* xref: /linux/fs/btrfs/extent-tree.c (revision 5c35a02c545a7bbe77f3a1ae337d9e29beed079b) */
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "tree-log.h"
20 #include "disk-io.h"
21 #include "print-tree.h"
22 #include "volumes.h"
23 #include "raid56.h"
24 #include "locking.h"
25 #include "free-space-cache.h"
26 #include "free-space-tree.h"
27 #include "math.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 
32 #undef SCRAMBLE_DELAYED_REFS
33 
34 /*
35  * Control flags for do_chunk_alloc's force field.
36  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
37  * if we really need one.
38  *
39  * CHUNK_ALLOC_LIMITED means to only try to allocate one
40  * if we have very few chunks already allocated.  This is
41  * used as part of the clustering code to help make sure
42  * we have a good pool of storage to cluster in, without
43  * filling the FS with empty chunks.
44  *
45  * CHUNK_ALLOC_FORCE means it must try to allocate one.
46  *
47  */
48 enum {
49 	CHUNK_ALLOC_NO_FORCE = 0,
50 	CHUNK_ALLOC_LIMITED = 1,
51 	CHUNK_ALLOC_FORCE = 2,
52 };
53 
54 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
55 			       struct btrfs_fs_info *fs_info,
56 				struct btrfs_delayed_ref_node *node, u64 parent,
57 				u64 root_objectid, u64 owner_objectid,
58 				u64 owner_offset, int refs_to_drop,
59 				struct btrfs_delayed_extent_op *extra_op);
60 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
61 				    struct extent_buffer *leaf,
62 				    struct btrfs_extent_item *ei);
63 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
64 				      struct btrfs_fs_info *fs_info,
65 				      u64 parent, u64 root_objectid,
66 				      u64 flags, u64 owner, u64 offset,
67 				      struct btrfs_key *ins, int ref_mod);
68 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
69 				     struct btrfs_delayed_ref_node *node,
70 				     struct btrfs_delayed_extent_op *extent_op);
71 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72 			  struct btrfs_fs_info *fs_info, u64 flags,
73 			  int force);
74 static int find_next_key(struct btrfs_path *path, int level,
75 			 struct btrfs_key *key);
76 static void dump_space_info(struct btrfs_fs_info *fs_info,
77 			    struct btrfs_space_info *info, u64 bytes,
78 			    int dump_block_groups);
79 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
80 			       u64 num_bytes);
81 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
82 				     struct btrfs_space_info *space_info,
83 				     u64 num_bytes);
84 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
85 				     struct btrfs_space_info *space_info,
86 				     u64 num_bytes);
87 
88 static noinline int
89 block_group_cache_done(struct btrfs_block_group_cache *cache)
90 {
91 	smp_mb();
92 	return cache->cached == BTRFS_CACHE_FINISHED ||
93 		cache->cached == BTRFS_CACHE_ERROR;
94 }
95 
96 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
97 {
98 	return (cache->flags & bits) == bits;
99 }
100 
101 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
102 {
103 	atomic_inc(&cache->count);
104 }
105 
106 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
107 {
108 	if (atomic_dec_and_test(&cache->count)) {
109 		WARN_ON(cache->pinned > 0);
110 		WARN_ON(cache->reserved > 0);
111 
112 		/*
113 		 * If the tree is not empty, someone is still holding the mutex of
114 		 * a full_stripe_lock, which can only be released by its caller.
115 		 * That will definitely cause a use-after-free when the caller
116 		 * tries to release the full stripe lock.
117 		 *
118 		 * There is no better way to resolve this, so just warn.
119 		 */
120 		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
121 		kfree(cache->free_space_ctl);
122 		kfree(cache);
123 	}
124 }
125 
126 /*
127  * this adds the block group to the fs_info rb tree for the block group
128  * cache
129  */
130 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
131 				struct btrfs_block_group_cache *block_group)
132 {
133 	struct rb_node **p;
134 	struct rb_node *parent = NULL;
135 	struct btrfs_block_group_cache *cache;
136 
137 	spin_lock(&info->block_group_cache_lock);
138 	p = &info->block_group_cache_tree.rb_node;
139 
140 	while (*p) {
141 		parent = *p;
142 		cache = rb_entry(parent, struct btrfs_block_group_cache,
143 				 cache_node);
144 		if (block_group->key.objectid < cache->key.objectid) {
145 			p = &(*p)->rb_left;
146 		} else if (block_group->key.objectid > cache->key.objectid) {
147 			p = &(*p)->rb_right;
148 		} else {
149 			spin_unlock(&info->block_group_cache_lock);
150 			return -EEXIST;
151 		}
152 	}
153 
154 	rb_link_node(&block_group->cache_node, parent, p);
155 	rb_insert_color(&block_group->cache_node,
156 			&info->block_group_cache_tree);
157 
158 	if (info->first_logical_byte > block_group->key.objectid)
159 		info->first_logical_byte = block_group->key.objectid;
160 
161 	spin_unlock(&info->block_group_cache_lock);
162 
163 	return 0;
164 }
165 
166 /*
167  * This will return the block group at or after bytenr if contains is 0, else
168  * it will return the block group that contains the bytenr
169  */
170 static struct btrfs_block_group_cache *
171 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
172 			      int contains)
173 {
174 	struct btrfs_block_group_cache *cache, *ret = NULL;
175 	struct rb_node *n;
176 	u64 end, start;
177 
178 	spin_lock(&info->block_group_cache_lock);
179 	n = info->block_group_cache_tree.rb_node;
180 
181 	while (n) {
182 		cache = rb_entry(n, struct btrfs_block_group_cache,
183 				 cache_node);
184 		end = cache->key.objectid + cache->key.offset - 1;
185 		start = cache->key.objectid;
186 
187 		if (bytenr < start) {
188 			if (!contains && (!ret || start < ret->key.objectid))
189 				ret = cache;
190 			n = n->rb_left;
191 		} else if (bytenr > start) {
192 			if (contains && bytenr <= end) {
193 				ret = cache;
194 				break;
195 			}
196 			n = n->rb_right;
197 		} else {
198 			ret = cache;
199 			break;
200 		}
201 	}
202 	if (ret) {
203 		btrfs_get_block_group(ret);
204 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
205 			info->first_logical_byte = ret->key.objectid;
206 	}
207 	spin_unlock(&info->block_group_cache_lock);
208 
209 	return ret;
210 }
211 
212 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
213 			       u64 start, u64 num_bytes)
214 {
215 	u64 end = start + num_bytes - 1;
216 	set_extent_bits(&fs_info->freed_extents[0],
217 			start, end, EXTENT_UPTODATE);
218 	set_extent_bits(&fs_info->freed_extents[1],
219 			start, end, EXTENT_UPTODATE);
220 	return 0;
221 }
222 
223 static void free_excluded_extents(struct btrfs_fs_info *fs_info,
224 				  struct btrfs_block_group_cache *cache)
225 {
226 	u64 start, end;
227 
228 	start = cache->key.objectid;
229 	end = start + cache->key.offset - 1;
230 
231 	clear_extent_bits(&fs_info->freed_extents[0],
232 			  start, end, EXTENT_UPTODATE);
233 	clear_extent_bits(&fs_info->freed_extents[1],
234 			  start, end, EXTENT_UPTODATE);
235 }
236 
237 static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
238 				 struct btrfs_block_group_cache *cache)
239 {
240 	u64 bytenr;
241 	u64 *logical;
242 	int stripe_len;
243 	int i, nr, ret;
244 
245 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
246 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
247 		cache->bytes_super += stripe_len;
248 		ret = add_excluded_extent(fs_info, cache->key.objectid,
249 					  stripe_len);
250 		if (ret)
251 			return ret;
252 	}
253 
254 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
255 		bytenr = btrfs_sb_offset(i);
256 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
257 				       bytenr, &logical, &nr, &stripe_len);
258 		if (ret)
259 			return ret;
260 
261 		while (nr--) {
262 			u64 start, len;
263 
264 			if (logical[nr] > cache->key.objectid +
265 			    cache->key.offset)
266 				continue;
267 
268 			if (logical[nr] + stripe_len <= cache->key.objectid)
269 				continue;
270 
271 			start = logical[nr];
272 			if (start < cache->key.objectid) {
273 				start = cache->key.objectid;
274 				len = (logical[nr] + stripe_len) - start;
275 			} else {
276 				len = min_t(u64, stripe_len,
277 					    cache->key.objectid +
278 					    cache->key.offset - start);
279 			}
280 
281 			cache->bytes_super += len;
282 			ret = add_excluded_extent(fs_info, start, len);
283 			if (ret) {
284 				kfree(logical);
285 				return ret;
286 			}
287 		}
288 
289 		kfree(logical);
290 	}
291 	return 0;
292 }
293 
294 static struct btrfs_caching_control *
295 get_caching_control(struct btrfs_block_group_cache *cache)
296 {
297 	struct btrfs_caching_control *ctl;
298 
299 	spin_lock(&cache->lock);
300 	if (!cache->caching_ctl) {
301 		spin_unlock(&cache->lock);
302 		return NULL;
303 	}
304 
305 	ctl = cache->caching_ctl;
306 	refcount_inc(&ctl->count);
307 	spin_unlock(&cache->lock);
308 	return ctl;
309 }
310 
311 static void put_caching_control(struct btrfs_caching_control *ctl)
312 {
313 	if (refcount_dec_and_test(&ctl->count))
314 		kfree(ctl);
315 }
316 
317 #ifdef CONFIG_BTRFS_DEBUG
318 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
319 {
320 	struct btrfs_fs_info *fs_info = block_group->fs_info;
321 	u64 start = block_group->key.objectid;
322 	u64 len = block_group->key.offset;
323 	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
324 		fs_info->nodesize : fs_info->sectorsize;
325 	u64 step = chunk << 1;
326 
327 	while (len > chunk) {
328 		btrfs_remove_free_space(block_group, start, chunk);
329 		start += step;
330 		if (len < step)
331 			len = 0;
332 		else
333 			len -= step;
334 	}
335 }
336 #endif
337 
338 /*
339  * This is only called by cache_block_group.  Since we could have freed
340  * extents, we need to check the pinned_extents for any extents that can't be
341  * used yet, since their free space will be released as soon as the transaction commits.
342  */
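/*
 * For example: caching the range [0, 100) while 20 bytes at [30, 50) are
 * still pinned adds [0, 30) and [50, 100) to the free space cache and
 * returns 80; the pinned bytes only become usable once the transaction
 * that pinned them commits.  (The byte values here are purely illustrative.)
 */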
343 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
344 		       u64 start, u64 end)
345 {
346 	struct btrfs_fs_info *info = block_group->fs_info;
347 	u64 extent_start, extent_end, size, total_added = 0;
348 	int ret;
349 
350 	while (start < end) {
351 		ret = find_first_extent_bit(info->pinned_extents, start,
352 					    &extent_start, &extent_end,
353 					    EXTENT_DIRTY | EXTENT_UPTODATE,
354 					    NULL);
355 		if (ret)
356 			break;
357 
358 		if (extent_start <= start) {
359 			start = extent_end + 1;
360 		} else if (extent_start > start && extent_start < end) {
361 			size = extent_start - start;
362 			total_added += size;
363 			ret = btrfs_add_free_space(block_group, start,
364 						   size);
365 			BUG_ON(ret); /* -ENOMEM or logic error */
366 			start = extent_end + 1;
367 		} else {
368 			break;
369 		}
370 	}
371 
372 	if (start < end) {
373 		size = end - start;
374 		total_added += size;
375 		ret = btrfs_add_free_space(block_group, start, size);
376 		BUG_ON(ret); /* -ENOMEM or logic error */
377 	}
378 
379 	return total_added;
380 }
381 
382 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
383 {
384 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
385 	struct btrfs_fs_info *fs_info = block_group->fs_info;
386 	struct btrfs_root *extent_root = fs_info->extent_root;
387 	struct btrfs_path *path;
388 	struct extent_buffer *leaf;
389 	struct btrfs_key key;
390 	u64 total_found = 0;
391 	u64 last = 0;
392 	u32 nritems;
393 	int ret;
394 	bool wakeup = true;
395 
396 	path = btrfs_alloc_path();
397 	if (!path)
398 		return -ENOMEM;
399 
400 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
401 
402 #ifdef CONFIG_BTRFS_DEBUG
403 	/*
404 	 * If we're fragmenting we don't want to make anybody think we can
405 	 * allocate from this block group until we've had a chance to fragment
406 	 * the free space.
407 	 */
408 	if (btrfs_should_fragment_free_space(block_group))
409 		wakeup = false;
410 #endif
411 	/*
412 	 * We don't want to deadlock with somebody trying to allocate a new
413 	 * extent for the extent root while also trying to search the extent
414 	 * root to add free space.  So we skip locking and search the commit
415 	 * root, since it's read-only.
416 	 */
417 	path->skip_locking = 1;
418 	path->search_commit_root = 1;
419 	path->reada = READA_FORWARD;
420 
421 	key.objectid = last;
422 	key.offset = 0;
423 	key.type = BTRFS_EXTENT_ITEM_KEY;
424 
425 next:
426 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
427 	if (ret < 0)
428 		goto out;
429 
430 	leaf = path->nodes[0];
431 	nritems = btrfs_header_nritems(leaf);
432 
433 	while (1) {
434 		if (btrfs_fs_closing(fs_info) > 1) {
435 			last = (u64)-1;
436 			break;
437 		}
438 
439 		if (path->slots[0] < nritems) {
440 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
441 		} else {
442 			ret = find_next_key(path, 0, &key);
443 			if (ret)
444 				break;
445 
446 			if (need_resched() ||
447 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
448 				if (wakeup)
449 					caching_ctl->progress = last;
450 				btrfs_release_path(path);
451 				up_read(&fs_info->commit_root_sem);
452 				mutex_unlock(&caching_ctl->mutex);
453 				cond_resched();
454 				mutex_lock(&caching_ctl->mutex);
455 				down_read(&fs_info->commit_root_sem);
456 				goto next;
457 			}
458 
459 			ret = btrfs_next_leaf(extent_root, path);
460 			if (ret < 0)
461 				goto out;
462 			if (ret)
463 				break;
464 			leaf = path->nodes[0];
465 			nritems = btrfs_header_nritems(leaf);
466 			continue;
467 		}
468 
469 		if (key.objectid < last) {
470 			key.objectid = last;
471 			key.offset = 0;
472 			key.type = BTRFS_EXTENT_ITEM_KEY;
473 
474 			if (wakeup)
475 				caching_ctl->progress = last;
476 			btrfs_release_path(path);
477 			goto next;
478 		}
479 
480 		if (key.objectid < block_group->key.objectid) {
481 			path->slots[0]++;
482 			continue;
483 		}
484 
485 		if (key.objectid >= block_group->key.objectid +
486 		    block_group->key.offset)
487 			break;
488 
489 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
490 		    key.type == BTRFS_METADATA_ITEM_KEY) {
491 			total_found += add_new_free_space(block_group, last,
492 							  key.objectid);
493 			if (key.type == BTRFS_METADATA_ITEM_KEY)
494 				last = key.objectid +
495 					fs_info->nodesize;
496 			else
497 				last = key.objectid + key.offset;
498 
499 			if (total_found > CACHING_CTL_WAKE_UP) {
500 				total_found = 0;
501 				if (wakeup)
502 					wake_up(&caching_ctl->wait);
503 			}
504 		}
505 		path->slots[0]++;
506 	}
507 	ret = 0;
508 
509 	total_found += add_new_free_space(block_group, last,
510 					  block_group->key.objectid +
511 					  block_group->key.offset);
512 	caching_ctl->progress = (u64)-1;
513 
514 out:
515 	btrfs_free_path(path);
516 	return ret;
517 }
518 
519 static noinline void caching_thread(struct btrfs_work *work)
520 {
521 	struct btrfs_block_group_cache *block_group;
522 	struct btrfs_fs_info *fs_info;
523 	struct btrfs_caching_control *caching_ctl;
524 	int ret;
525 
526 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
527 	block_group = caching_ctl->block_group;
528 	fs_info = block_group->fs_info;
529 
530 	mutex_lock(&caching_ctl->mutex);
531 	down_read(&fs_info->commit_root_sem);
532 
533 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
534 		ret = load_free_space_tree(caching_ctl);
535 	else
536 		ret = load_extent_tree_free(caching_ctl);
537 
538 	spin_lock(&block_group->lock);
539 	block_group->caching_ctl = NULL;
540 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
541 	spin_unlock(&block_group->lock);
542 
543 #ifdef CONFIG_BTRFS_DEBUG
544 	if (btrfs_should_fragment_free_space(block_group)) {
545 		u64 bytes_used;
546 
547 		spin_lock(&block_group->space_info->lock);
548 		spin_lock(&block_group->lock);
549 		bytes_used = block_group->key.offset -
550 			btrfs_block_group_used(&block_group->item);
551 		block_group->space_info->bytes_used += bytes_used >> 1;
552 		spin_unlock(&block_group->lock);
553 		spin_unlock(&block_group->space_info->lock);
554 		fragment_free_space(block_group);
555 	}
556 #endif
557 
558 	caching_ctl->progress = (u64)-1;
559 
560 	up_read(&fs_info->commit_root_sem);
561 	free_excluded_extents(fs_info, block_group);
562 	mutex_unlock(&caching_ctl->mutex);
563 
564 	wake_up(&caching_ctl->wait);
565 
566 	put_caching_control(caching_ctl);
567 	btrfs_put_block_group(block_group);
568 }
569 
570 static int cache_block_group(struct btrfs_block_group_cache *cache,
571 			     int load_cache_only)
572 {
573 	DEFINE_WAIT(wait);
574 	struct btrfs_fs_info *fs_info = cache->fs_info;
575 	struct btrfs_caching_control *caching_ctl;
576 	int ret = 0;
577 
578 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
579 	if (!caching_ctl)
580 		return -ENOMEM;
581 
582 	INIT_LIST_HEAD(&caching_ctl->list);
583 	mutex_init(&caching_ctl->mutex);
584 	init_waitqueue_head(&caching_ctl->wait);
585 	caching_ctl->block_group = cache;
586 	caching_ctl->progress = cache->key.objectid;
587 	refcount_set(&caching_ctl->count, 1);
588 	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
589 			caching_thread, NULL, NULL);
590 
591 	spin_lock(&cache->lock);
592 	/*
593 	 * This should be a rare occasion, but it can happen in the
594 	 * case where one thread starts to load the space cache info, and then
595 	 * some other thread starts a transaction commit which tries to do an
596 	 * allocation while the other thread is still loading the space cache
597 	 * info.  The previous loop should have kept us from choosing this block
598 	 * group, but if we've moved to the state where we will wait on caching
599 	 * block groups we need to first check if we're doing a fast load here,
600 	 * so we can wait for it to finish; otherwise we could end up allocating
601 	 * from a block group whose cache gets evicted for one reason or
602 	 * another.
603 	 */
604 	while (cache->cached == BTRFS_CACHE_FAST) {
605 		struct btrfs_caching_control *ctl;
606 
607 		ctl = cache->caching_ctl;
608 		refcount_inc(&ctl->count);
609 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
610 		spin_unlock(&cache->lock);
611 
612 		schedule();
613 
614 		finish_wait(&ctl->wait, &wait);
615 		put_caching_control(ctl);
616 		spin_lock(&cache->lock);
617 	}
618 
619 	if (cache->cached != BTRFS_CACHE_NO) {
620 		spin_unlock(&cache->lock);
621 		kfree(caching_ctl);
622 		return 0;
623 	}
624 	WARN_ON(cache->caching_ctl);
625 	cache->caching_ctl = caching_ctl;
626 	cache->cached = BTRFS_CACHE_FAST;
627 	spin_unlock(&cache->lock);
628 
629 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
630 		mutex_lock(&caching_ctl->mutex);
631 		ret = load_free_space_cache(fs_info, cache);
632 
633 		spin_lock(&cache->lock);
634 		if (ret == 1) {
635 			cache->caching_ctl = NULL;
636 			cache->cached = BTRFS_CACHE_FINISHED;
637 			cache->last_byte_to_unpin = (u64)-1;
638 			caching_ctl->progress = (u64)-1;
639 		} else {
640 			if (load_cache_only) {
641 				cache->caching_ctl = NULL;
642 				cache->cached = BTRFS_CACHE_NO;
643 			} else {
644 				cache->cached = BTRFS_CACHE_STARTED;
645 				cache->has_caching_ctl = 1;
646 			}
647 		}
648 		spin_unlock(&cache->lock);
649 #ifdef CONFIG_BTRFS_DEBUG
650 		if (ret == 1 &&
651 		    btrfs_should_fragment_free_space(cache)) {
652 			u64 bytes_used;
653 
654 			spin_lock(&cache->space_info->lock);
655 			spin_lock(&cache->lock);
656 			bytes_used = cache->key.offset -
657 				btrfs_block_group_used(&cache->item);
658 			cache->space_info->bytes_used += bytes_used >> 1;
659 			spin_unlock(&cache->lock);
660 			spin_unlock(&cache->space_info->lock);
661 			fragment_free_space(cache);
662 		}
663 #endif
664 		mutex_unlock(&caching_ctl->mutex);
665 
666 		wake_up(&caching_ctl->wait);
667 		if (ret == 1) {
668 			put_caching_control(caching_ctl);
669 			free_excluded_extents(fs_info, cache);
670 			return 0;
671 		}
672 	} else {
673 		/*
674 		 * We're either using the free space tree or no caching at all.
675 		 * Set cached to the appropriate value and wake up any waiters.
676 		 */
677 		spin_lock(&cache->lock);
678 		if (load_cache_only) {
679 			cache->caching_ctl = NULL;
680 			cache->cached = BTRFS_CACHE_NO;
681 		} else {
682 			cache->cached = BTRFS_CACHE_STARTED;
683 			cache->has_caching_ctl = 1;
684 		}
685 		spin_unlock(&cache->lock);
686 		wake_up(&caching_ctl->wait);
687 	}
688 
689 	if (load_cache_only) {
690 		put_caching_control(caching_ctl);
691 		return 0;
692 	}
693 
694 	down_write(&fs_info->commit_root_sem);
695 	refcount_inc(&caching_ctl->count);
696 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
697 	up_write(&fs_info->commit_root_sem);
698 
699 	btrfs_get_block_group(cache);
700 
701 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
702 
703 	return ret;
704 }
705 
706 /*
707  * return the block group that starts at or after bytenr
708  */
709 static struct btrfs_block_group_cache *
710 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
711 {
712 	return block_group_cache_tree_search(info, bytenr, 0);
713 }
714 
715 /*
716  * return the block group that contains the given bytenr
717  */
718 struct btrfs_block_group_cache *btrfs_lookup_block_group(
719 						 struct btrfs_fs_info *info,
720 						 u64 bytenr)
721 {
722 	return block_group_cache_tree_search(info, bytenr, 1);
723 }
724 
725 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
726 						  u64 flags)
727 {
728 	struct list_head *head = &info->space_info;
729 	struct btrfs_space_info *found;
730 
731 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
732 
733 	rcu_read_lock();
734 	list_for_each_entry_rcu(found, head, list) {
735 		if (found->flags & flags) {
736 			rcu_read_unlock();
737 			return found;
738 		}
739 	}
740 	rcu_read_unlock();
741 	return NULL;
742 }
743 
744 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
745 			     bool metadata, u64 root_objectid)
746 {
747 	struct btrfs_space_info *space_info;
748 	u64 flags;
749 
750 	if (metadata) {
751 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
752 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
753 		else
754 			flags = BTRFS_BLOCK_GROUP_METADATA;
755 	} else {
756 		flags = BTRFS_BLOCK_GROUP_DATA;
757 	}
758 
759 	space_info = __find_space_info(fs_info, flags);
760 	ASSERT(space_info);
761 	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
762 }
763 
764 /*
765  * after adding space to the filesystem, we need to clear the full flags
766  * on all the space infos.
767  */
768 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
769 {
770 	struct list_head *head = &info->space_info;
771 	struct btrfs_space_info *found;
772 
773 	rcu_read_lock();
774 	list_for_each_entry_rcu(found, head, list)
775 		found->full = 0;
776 	rcu_read_unlock();
777 }
778 
779 /* simple helper to search for an existing data extent at a given offset */
780 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
781 {
782 	int ret;
783 	struct btrfs_key key;
784 	struct btrfs_path *path;
785 
786 	path = btrfs_alloc_path();
787 	if (!path)
788 		return -ENOMEM;
789 
790 	key.objectid = start;
791 	key.offset = len;
792 	key.type = BTRFS_EXTENT_ITEM_KEY;
793 	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
794 	btrfs_free_path(path);
795 	return ret;
796 }
797 
798 /*
799  * Helper function to look up the reference count and flags of a tree block.
800  *
801  * The head node for a delayed ref is used to store the sum of all the
802  * reference count modifications queued up in the rbtree.  The head
803  * node may also store the extent flags to set.  This way you can check
804  * to see what the reference count and extent flags would be once all of
805  * the delayed refs have been processed.
806  */
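/*
 * For example (illustrative numbers): an extent recorded with 2 references
 * in the extent tree that also has a delayed ref head queued with
 * ref_mod == +1 is reported below as having 3 references, provided a
 * transaction handle is passed so the delayed refs are consulted.
 */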
807 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
808 			     struct btrfs_fs_info *fs_info, u64 bytenr,
809 			     u64 offset, int metadata, u64 *refs, u64 *flags)
810 {
811 	struct btrfs_delayed_ref_head *head;
812 	struct btrfs_delayed_ref_root *delayed_refs;
813 	struct btrfs_path *path;
814 	struct btrfs_extent_item *ei;
815 	struct extent_buffer *leaf;
816 	struct btrfs_key key;
817 	u32 item_size;
818 	u64 num_refs;
819 	u64 extent_flags;
820 	int ret;
821 
822 	/*
823 	 * If we don't have skinny metadata, don't bother doing anything
824 	 * different
825 	 */
826 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
827 		offset = fs_info->nodesize;
828 		metadata = 0;
829 	}
830 
831 	path = btrfs_alloc_path();
832 	if (!path)
833 		return -ENOMEM;
834 
835 	if (!trans) {
836 		path->skip_locking = 1;
837 		path->search_commit_root = 1;
838 	}
839 
840 search_again:
841 	key.objectid = bytenr;
842 	key.offset = offset;
843 	if (metadata)
844 		key.type = BTRFS_METADATA_ITEM_KEY;
845 	else
846 		key.type = BTRFS_EXTENT_ITEM_KEY;
847 
848 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
849 	if (ret < 0)
850 		goto out_free;
851 
852 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
853 		if (path->slots[0]) {
854 			path->slots[0]--;
855 			btrfs_item_key_to_cpu(path->nodes[0], &key,
856 					      path->slots[0]);
857 			if (key.objectid == bytenr &&
858 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
859 			    key.offset == fs_info->nodesize)
860 				ret = 0;
861 		}
862 	}
863 
864 	if (ret == 0) {
865 		leaf = path->nodes[0];
866 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
867 		if (item_size >= sizeof(*ei)) {
868 			ei = btrfs_item_ptr(leaf, path->slots[0],
869 					    struct btrfs_extent_item);
870 			num_refs = btrfs_extent_refs(leaf, ei);
871 			extent_flags = btrfs_extent_flags(leaf, ei);
872 		} else {
873 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
874 			struct btrfs_extent_item_v0 *ei0;
875 			BUG_ON(item_size != sizeof(*ei0));
876 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
877 					     struct btrfs_extent_item_v0);
878 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
879 			/* FIXME: this isn't correct for data */
880 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
881 #else
882 			BUG();
883 #endif
884 		}
885 		BUG_ON(num_refs == 0);
886 	} else {
887 		num_refs = 0;
888 		extent_flags = 0;
889 		ret = 0;
890 	}
891 
892 	if (!trans)
893 		goto out;
894 
895 	delayed_refs = &trans->transaction->delayed_refs;
896 	spin_lock(&delayed_refs->lock);
897 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
898 	if (head) {
899 		if (!mutex_trylock(&head->mutex)) {
900 			refcount_inc(&head->refs);
901 			spin_unlock(&delayed_refs->lock);
902 
903 			btrfs_release_path(path);
904 
905 			/*
906 			 * Mutex was contended, block until it's released and try
907 			 * again
908 			 */
909 			mutex_lock(&head->mutex);
910 			mutex_unlock(&head->mutex);
911 			btrfs_put_delayed_ref_head(head);
912 			goto search_again;
913 		}
914 		spin_lock(&head->lock);
915 		if (head->extent_op && head->extent_op->update_flags)
916 			extent_flags |= head->extent_op->flags_to_set;
917 		else
918 			BUG_ON(num_refs == 0);
919 
920 		num_refs += head->ref_mod;
921 		spin_unlock(&head->lock);
922 		mutex_unlock(&head->mutex);
923 	}
924 	spin_unlock(&delayed_refs->lock);
925 out:
926 	WARN_ON(num_refs == 0);
927 	if (refs)
928 		*refs = num_refs;
929 	if (flags)
930 		*flags = extent_flags;
931 out_free:
932 	btrfs_free_path(path);
933 	return ret;
934 }
935 
936 /*
937  * Back reference rules.  Back refs have three main goals:
938  *
939  * 1) differentiate between all holders of references to an extent so that
940  *    when a reference is dropped we can make sure it was a valid reference
941  *    before freeing the extent.
942  *
943  * 2) Provide enough information to quickly find the holders of an extent
944  *    if we notice a given block is corrupted or bad.
945  *
946  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
947  *    maintenance.  This is actually the same as #2, but with a slightly
948  *    different use case.
949  *
950  * There are two kinds of back refs. Implicit back refs are optimized
951  * for pointers in non-shared tree blocks. For a given pointer in a block,
952  * back refs of this kind provide information about the block's owner tree
953  * and the pointer's key. This information allows us to find the block by
954  * b-tree searching. Full back refs are for pointers in tree blocks not
955  * referenced by their owner trees. The location of the tree block is recorded
956  * in the back ref. Full back refs are actually generic and can be
957  * used in all cases where implicit back refs are used. The major shortcoming
958  * of full back refs is their overhead: every time a tree block gets
959  * COWed, we have to update the back ref entries for all pointers in it.
960  *
961  * For a newly allocated tree block, we use implicit back refs for the
962  * pointers in it. This means most tree related operations only involve
963  * implicit back refs. For a tree block created in an old transaction, the
964  * only way to drop a reference to it is to COW it. So we can detect the
965  * event that a tree block loses its owner tree's reference and do the
966  * back ref conversion.
967  *
968  * When a tree block is COWed through a tree, there are four cases:
969  *
970  * The reference count of the block is one and the tree is the block's
971  * owner tree. Nothing to do in this case.
972  *
973  * The reference count of the block is one and the tree is not the
974  * block's owner tree. In this case, full back refs are used for pointers
975  * in the block. Remove these full back refs and add implicit back refs for
976  * every pointer in the new block.
977  *
978  * The reference count of the block is greater than one and the tree is
979  * the block's owner tree. In this case, implicit back refs are used for
980  * pointers in the block. Add full back refs for every pointer in the
981  * block and increase the lower level extents' reference counts. The
982  * original implicit back refs are carried over to the new block.
983  *
984  * The reference count of the block is greater than one and the tree is
985  * not the block's owner tree. Add implicit back refs for every pointer in
986  * the new block and increase the lower level extents' reference counts.
987  *
988  * Back reference key composition:
989  *
990  * The key objectid corresponds to the first byte in the extent,
991  * The key type is used to differentiate between types of back refs.
992  * There are different meanings of the key offset for different types
993  * of back refs.
994  *
995  * File extents can be referenced by:
996  *
997  * - multiple snapshots, subvolumes, or different generations in one subvol
998  * - different files inside a single subvolume
999  * - different offsets inside a file (bookend extents in file.c)
1000  *
1001  * The extent ref structure for the implicit back refs has fields for:
1002  *
1003  * - Objectid of the subvolume root
1004  * - objectid of the file holding the reference
1005  * - original offset in the file
1006  * - how many bookend extents
1007  *
1008  * The key offset for the implicit back refs is a hash of the first
1009  * three fields.
1010  *
1011  * The extent ref structure for the full back refs has a field for:
1012  *
1013  * - number of pointers in the tree leaf
1014  *
1015  * The key offset for the full back refs is the first byte of
1016  * the tree leaf.
1017  *
1018  * When a file extent is allocated, the implicit back refs are used
1019  * and the fields are filled in as:
1020  *
1021  *     (root_key.objectid, inode objectid, offset in file, 1)
1022  *
1023  * When a file extent is removed by file truncation, we find the
1024  * corresponding implicit back refs and check the following fields:
1025  *
1026  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1027  *
1028  * Btree extents can be referenced by:
1029  *
1030  * - Different subvolumes
1031  *
1032  * Both the implicit back refs and the full back refs for tree blocks
1033  * only consist of a key. The key offset for the implicit back refs is the
1034  * objectid of the block's owner tree. The key offset for the full back refs
1035  * is the first byte of the parent block.
1036  *
1037  * When implicit back refs are used, information about the lowest key and
1038  * level of the tree block is required. This information is stored in
1039  * the tree block info structure.
1040  */
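/*
 * Concrete illustration (hypothetical values): a data extent at bytenr X
 * referenced by inode 257 at file offset 0 in the tree of subvolume 5 gets
 * an implicit back ref keyed as
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * where the hash is the crc32c combination computed by
 * hash_extent_data_ref() below.  If the same extent is instead referenced
 * through a shared leaf at bytenr P, the full back ref is keyed as
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, P)
 *
 * Tree blocks follow the same pattern with BTRFS_TREE_BLOCK_REF_KEY
 * (offset == owner root objectid) and BTRFS_SHARED_BLOCK_REF_KEY
 * (offset == parent block bytenr).
 */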
1041 
1042 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1043 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1044 				  struct btrfs_fs_info *fs_info,
1045 				  struct btrfs_path *path,
1046 				  u64 owner, u32 extra_size)
1047 {
1048 	struct btrfs_root *root = fs_info->extent_root;
1049 	struct btrfs_extent_item *item;
1050 	struct btrfs_extent_item_v0 *ei0;
1051 	struct btrfs_extent_ref_v0 *ref0;
1052 	struct btrfs_tree_block_info *bi;
1053 	struct extent_buffer *leaf;
1054 	struct btrfs_key key;
1055 	struct btrfs_key found_key;
1056 	u32 new_size = sizeof(*item);
1057 	u64 refs;
1058 	int ret;
1059 
1060 	leaf = path->nodes[0];
1061 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1062 
1063 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1064 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
1065 			     struct btrfs_extent_item_v0);
1066 	refs = btrfs_extent_refs_v0(leaf, ei0);
1067 
1068 	if (owner == (u64)-1) {
1069 		while (1) {
1070 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1071 				ret = btrfs_next_leaf(root, path);
1072 				if (ret < 0)
1073 					return ret;
1074 				BUG_ON(ret > 0); /* Corruption */
1075 				leaf = path->nodes[0];
1076 			}
1077 			btrfs_item_key_to_cpu(leaf, &found_key,
1078 					      path->slots[0]);
1079 			BUG_ON(key.objectid != found_key.objectid);
1080 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1081 				path->slots[0]++;
1082 				continue;
1083 			}
1084 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1085 					      struct btrfs_extent_ref_v0);
1086 			owner = btrfs_ref_objectid_v0(leaf, ref0);
1087 			break;
1088 		}
1089 	}
1090 	btrfs_release_path(path);
1091 
1092 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1093 		new_size += sizeof(*bi);
1094 
1095 	new_size -= sizeof(*ei0);
1096 	ret = btrfs_search_slot(trans, root, &key, path,
1097 				new_size + extra_size, 1);
1098 	if (ret < 0)
1099 		return ret;
1100 	BUG_ON(ret); /* Corruption */
1101 
1102 	btrfs_extend_item(fs_info, path, new_size);
1103 
1104 	leaf = path->nodes[0];
1105 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1106 	btrfs_set_extent_refs(leaf, item, refs);
1107 	/* FIXME: get real generation */
1108 	btrfs_set_extent_generation(leaf, item, 0);
1109 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1110 		btrfs_set_extent_flags(leaf, item,
1111 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1112 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1113 		bi = (struct btrfs_tree_block_info *)(item + 1);
1114 		/* FIXME: get first key of the block */
1115 		memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
1116 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1117 	} else {
1118 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1119 	}
1120 	btrfs_mark_buffer_dirty(leaf);
1121 	return 0;
1122 }
1123 #endif
1124 
1125 /*
1126  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1127  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1128  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1129  */
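/*
 * For example, asking for BTRFS_REF_TYPE_DATA on an inline ref whose type
 * is BTRFS_TREE_BLOCK_REF_KEY does not match any of the accepted cases, so
 * the function falls through to the error path and returns
 * BTRFS_REF_TYPE_INVALID.
 */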
1130 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1131 				     struct btrfs_extent_inline_ref *iref,
1132 				     enum btrfs_inline_ref_type is_data)
1133 {
1134 	int type = btrfs_extent_inline_ref_type(eb, iref);
1135 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1136 
1137 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1138 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
1139 	    type == BTRFS_SHARED_DATA_REF_KEY ||
1140 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
1141 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
1142 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
1143 				return type;
1144 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1145 				ASSERT(eb->fs_info);
1146 				/*
1147 				 * Every shared ref has a parent tree
1148 				 * block, which must be aligned to
1149 				 * the nodesize.
1150 				 */
1151 				if (offset &&
1152 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1153 					return type;
1154 			}
1155 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
1156 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
1157 				return type;
1158 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
1159 				ASSERT(eb->fs_info);
1160 				/*
1161 				 * Every shared ref has a parent tree
1162 				 * block, which must be aligned to
1163 				 * the nodesize.
1164 				 */
1165 				if (offset &&
1166 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1167 					return type;
1168 			}
1169 		} else {
1170 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1171 			return type;
1172 		}
1173 	}
1174 
1175 	btrfs_print_leaf((struct extent_buffer *)eb);
1176 	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1177 		  eb->start, type);
1178 	WARN_ON(1);
1179 
1180 	return BTRFS_REF_TYPE_INVALID;
1181 }
1182 
1183 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1184 {
1185 	u32 high_crc = ~(u32)0;
1186 	u32 low_crc = ~(u32)0;
1187 	__le64 lenum;
1188 
1189 	lenum = cpu_to_le64(root_objectid);
1190 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1191 	lenum = cpu_to_le64(owner);
1192 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1193 	lenum = cpu_to_le64(offset);
1194 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1195 
1196 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1197 }
1198 
1199 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1200 				     struct btrfs_extent_data_ref *ref)
1201 {
1202 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1203 				    btrfs_extent_data_ref_objectid(leaf, ref),
1204 				    btrfs_extent_data_ref_offset(leaf, ref));
1205 }
1206 
1207 static int match_extent_data_ref(struct extent_buffer *leaf,
1208 				 struct btrfs_extent_data_ref *ref,
1209 				 u64 root_objectid, u64 owner, u64 offset)
1210 {
1211 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1212 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1213 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1214 		return 0;
1215 	return 1;
1216 }
1217 
1218 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1219 					   struct btrfs_fs_info *fs_info,
1220 					   struct btrfs_path *path,
1221 					   u64 bytenr, u64 parent,
1222 					   u64 root_objectid,
1223 					   u64 owner, u64 offset)
1224 {
1225 	struct btrfs_root *root = fs_info->extent_root;
1226 	struct btrfs_key key;
1227 	struct btrfs_extent_data_ref *ref;
1228 	struct extent_buffer *leaf;
1229 	u32 nritems;
1230 	int ret;
1231 	int recow;
1232 	int err = -ENOENT;
1233 
1234 	key.objectid = bytenr;
1235 	if (parent) {
1236 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1237 		key.offset = parent;
1238 	} else {
1239 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1240 		key.offset = hash_extent_data_ref(root_objectid,
1241 						  owner, offset);
1242 	}
1243 again:
1244 	recow = 0;
1245 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1246 	if (ret < 0) {
1247 		err = ret;
1248 		goto fail;
1249 	}
1250 
1251 	if (parent) {
1252 		if (!ret)
1253 			return 0;
1254 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1255 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1256 		btrfs_release_path(path);
1257 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1258 		if (ret < 0) {
1259 			err = ret;
1260 			goto fail;
1261 		}
1262 		if (!ret)
1263 			return 0;
1264 #endif
1265 		goto fail;
1266 	}
1267 
1268 	leaf = path->nodes[0];
1269 	nritems = btrfs_header_nritems(leaf);
1270 	while (1) {
1271 		if (path->slots[0] >= nritems) {
1272 			ret = btrfs_next_leaf(root, path);
1273 			if (ret < 0)
1274 				err = ret;
1275 			if (ret)
1276 				goto fail;
1277 
1278 			leaf = path->nodes[0];
1279 			nritems = btrfs_header_nritems(leaf);
1280 			recow = 1;
1281 		}
1282 
1283 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1284 		if (key.objectid != bytenr ||
1285 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1286 			goto fail;
1287 
1288 		ref = btrfs_item_ptr(leaf, path->slots[0],
1289 				     struct btrfs_extent_data_ref);
1290 
1291 		if (match_extent_data_ref(leaf, ref, root_objectid,
1292 					  owner, offset)) {
1293 			if (recow) {
1294 				btrfs_release_path(path);
1295 				goto again;
1296 			}
1297 			err = 0;
1298 			break;
1299 		}
1300 		path->slots[0]++;
1301 	}
1302 fail:
1303 	return err;
1304 }
1305 
1306 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1307 					   struct btrfs_fs_info *fs_info,
1308 					   struct btrfs_path *path,
1309 					   u64 bytenr, u64 parent,
1310 					   u64 root_objectid, u64 owner,
1311 					   u64 offset, int refs_to_add)
1312 {
1313 	struct btrfs_root *root = fs_info->extent_root;
1314 	struct btrfs_key key;
1315 	struct extent_buffer *leaf;
1316 	u32 size;
1317 	u32 num_refs;
1318 	int ret;
1319 
1320 	key.objectid = bytenr;
1321 	if (parent) {
1322 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1323 		key.offset = parent;
1324 		size = sizeof(struct btrfs_shared_data_ref);
1325 	} else {
1326 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1327 		key.offset = hash_extent_data_ref(root_objectid,
1328 						  owner, offset);
1329 		size = sizeof(struct btrfs_extent_data_ref);
1330 	}
1331 
1332 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1333 	if (ret && ret != -EEXIST)
1334 		goto fail;
1335 
1336 	leaf = path->nodes[0];
1337 	if (parent) {
1338 		struct btrfs_shared_data_ref *ref;
1339 		ref = btrfs_item_ptr(leaf, path->slots[0],
1340 				     struct btrfs_shared_data_ref);
1341 		if (ret == 0) {
1342 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1343 		} else {
1344 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1345 			num_refs += refs_to_add;
1346 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1347 		}
1348 	} else {
1349 		struct btrfs_extent_data_ref *ref;
1350 		while (ret == -EEXIST) {
1351 			ref = btrfs_item_ptr(leaf, path->slots[0],
1352 					     struct btrfs_extent_data_ref);
1353 			if (match_extent_data_ref(leaf, ref, root_objectid,
1354 						  owner, offset))
1355 				break;
1356 			btrfs_release_path(path);
1357 			key.offset++;
1358 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1359 						      size);
1360 			if (ret && ret != -EEXIST)
1361 				goto fail;
1362 
1363 			leaf = path->nodes[0];
1364 		}
1365 		ref = btrfs_item_ptr(leaf, path->slots[0],
1366 				     struct btrfs_extent_data_ref);
1367 		if (ret == 0) {
1368 			btrfs_set_extent_data_ref_root(leaf, ref,
1369 						       root_objectid);
1370 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1371 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1372 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1373 		} else {
1374 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1375 			num_refs += refs_to_add;
1376 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1377 		}
1378 	}
1379 	btrfs_mark_buffer_dirty(leaf);
1380 	ret = 0;
1381 fail:
1382 	btrfs_release_path(path);
1383 	return ret;
1384 }
1385 
1386 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1387 					   struct btrfs_fs_info *fs_info,
1388 					   struct btrfs_path *path,
1389 					   int refs_to_drop, int *last_ref)
1390 {
1391 	struct btrfs_key key;
1392 	struct btrfs_extent_data_ref *ref1 = NULL;
1393 	struct btrfs_shared_data_ref *ref2 = NULL;
1394 	struct extent_buffer *leaf;
1395 	u32 num_refs = 0;
1396 	int ret = 0;
1397 
1398 	leaf = path->nodes[0];
1399 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1400 
1401 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1402 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1403 				      struct btrfs_extent_data_ref);
1404 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1405 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1406 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1407 				      struct btrfs_shared_data_ref);
1408 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1409 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1410 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1411 		struct btrfs_extent_ref_v0 *ref0;
1412 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1413 				      struct btrfs_extent_ref_v0);
1414 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1415 #endif
1416 	} else {
1417 		BUG();
1418 	}
1419 
1420 	BUG_ON(num_refs < refs_to_drop);
1421 	num_refs -= refs_to_drop;
1422 
1423 	if (num_refs == 0) {
1424 		ret = btrfs_del_item(trans, fs_info->extent_root, path);
1425 		*last_ref = 1;
1426 	} else {
1427 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1428 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1429 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1430 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1431 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1432 		else {
1433 			struct btrfs_extent_ref_v0 *ref0;
1434 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1435 					struct btrfs_extent_ref_v0);
1436 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1437 		}
1438 #endif
1439 		btrfs_mark_buffer_dirty(leaf);
1440 	}
1441 	return ret;
1442 }
1443 
1444 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1445 					  struct btrfs_extent_inline_ref *iref)
1446 {
1447 	struct btrfs_key key;
1448 	struct extent_buffer *leaf;
1449 	struct btrfs_extent_data_ref *ref1;
1450 	struct btrfs_shared_data_ref *ref2;
1451 	u32 num_refs = 0;
1452 	int type;
1453 
1454 	leaf = path->nodes[0];
1455 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1456 	if (iref) {
1457 		/*
1458 		 * If type is invalid, we should have bailed out earlier than
1459 		 * this call.
1460 		 */
1461 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1462 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
1463 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1464 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1465 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1466 		} else {
1467 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1468 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1469 		}
1470 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1471 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1472 				      struct btrfs_extent_data_ref);
1473 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1474 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1475 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1476 				      struct btrfs_shared_data_ref);
1477 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1478 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1479 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1480 		struct btrfs_extent_ref_v0 *ref0;
1481 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1482 				      struct btrfs_extent_ref_v0);
1483 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1484 #endif
1485 	} else {
1486 		WARN_ON(1);
1487 	}
1488 	return num_refs;
1489 }
1490 
1491 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1492 					  struct btrfs_fs_info *fs_info,
1493 					  struct btrfs_path *path,
1494 					  u64 bytenr, u64 parent,
1495 					  u64 root_objectid)
1496 {
1497 	struct btrfs_root *root = fs_info->extent_root;
1498 	struct btrfs_key key;
1499 	int ret;
1500 
1501 	key.objectid = bytenr;
1502 	if (parent) {
1503 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1504 		key.offset = parent;
1505 	} else {
1506 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1507 		key.offset = root_objectid;
1508 	}
1509 
1510 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1511 	if (ret > 0)
1512 		ret = -ENOENT;
1513 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1514 	if (ret == -ENOENT && parent) {
1515 		btrfs_release_path(path);
1516 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1517 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1518 		if (ret > 0)
1519 			ret = -ENOENT;
1520 	}
1521 #endif
1522 	return ret;
1523 }
1524 
1525 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1526 					  struct btrfs_fs_info *fs_info,
1527 					  struct btrfs_path *path,
1528 					  u64 bytenr, u64 parent,
1529 					  u64 root_objectid)
1530 {
1531 	struct btrfs_key key;
1532 	int ret;
1533 
1534 	key.objectid = bytenr;
1535 	if (parent) {
1536 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1537 		key.offset = parent;
1538 	} else {
1539 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1540 		key.offset = root_objectid;
1541 	}
1542 
1543 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1544 				      path, &key, 0);
1545 	btrfs_release_path(path);
1546 	return ret;
1547 }
1548 
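/*
 * Map a back ref to its key type: tree blocks (owner below
 * BTRFS_FIRST_FREE_OBJECTID) use BTRFS_SHARED_BLOCK_REF_KEY when a parent
 * block is given and BTRFS_TREE_BLOCK_REF_KEY otherwise; data extents use
 * BTRFS_SHARED_DATA_REF_KEY or BTRFS_EXTENT_DATA_REF_KEY in the same way.
 */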
1549 static inline int extent_ref_type(u64 parent, u64 owner)
1550 {
1551 	int type;
1552 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1553 		if (parent > 0)
1554 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1555 		else
1556 			type = BTRFS_TREE_BLOCK_REF_KEY;
1557 	} else {
1558 		if (parent > 0)
1559 			type = BTRFS_SHARED_DATA_REF_KEY;
1560 		else
1561 			type = BTRFS_EXTENT_DATA_REF_KEY;
1562 	}
1563 	return type;
1564 }
1565 
1566 static int find_next_key(struct btrfs_path *path, int level,
1567 			 struct btrfs_key *key)
1568 
1569 {
1570 	for (; level < BTRFS_MAX_LEVEL; level++) {
1571 		if (!path->nodes[level])
1572 			break;
1573 		if (path->slots[level] + 1 >=
1574 		    btrfs_header_nritems(path->nodes[level]))
1575 			continue;
1576 		if (level == 0)
1577 			btrfs_item_key_to_cpu(path->nodes[level], key,
1578 					      path->slots[level] + 1);
1579 		else
1580 			btrfs_node_key_to_cpu(path->nodes[level], key,
1581 					      path->slots[level] + 1);
1582 		return 0;
1583 	}
1584 	return 1;
1585 }
1586 
1587 /*
1588  * Look for an inline back ref.  If the back ref is found, *ref_ret is set
1589  * to the address of the inline back ref, and 0 is returned.
1590  *
1591  * If the back ref isn't found, *ref_ret is set to the address where it
1592  * should be inserted, and -ENOENT is returned.
1593  *
1594  * If insert is true and there are too many inline back refs, the path
1595  * points to the extent item, and -EAGAIN is returned.
1596  *
1597  * NOTE: inline back refs are ordered in the same way that back ref
1598  *	 items in the tree are ordered.
1599  */
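/*
 * See lookup_extent_backref() further below for a caller that uses
 * insert == 0 and falls back to the keyed back ref items
 * (lookup_tree_block_ref() / lookup_extent_data_ref()) when this function
 * returns -ENOENT.
 */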
1600 static noinline_for_stack
1601 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1602 				 struct btrfs_fs_info *fs_info,
1603 				 struct btrfs_path *path,
1604 				 struct btrfs_extent_inline_ref **ref_ret,
1605 				 u64 bytenr, u64 num_bytes,
1606 				 u64 parent, u64 root_objectid,
1607 				 u64 owner, u64 offset, int insert)
1608 {
1609 	struct btrfs_root *root = fs_info->extent_root;
1610 	struct btrfs_key key;
1611 	struct extent_buffer *leaf;
1612 	struct btrfs_extent_item *ei;
1613 	struct btrfs_extent_inline_ref *iref;
1614 	u64 flags;
1615 	u64 item_size;
1616 	unsigned long ptr;
1617 	unsigned long end;
1618 	int extra_size;
1619 	int type;
1620 	int want;
1621 	int ret;
1622 	int err = 0;
1623 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1624 	int needed;
1625 
1626 	key.objectid = bytenr;
1627 	key.type = BTRFS_EXTENT_ITEM_KEY;
1628 	key.offset = num_bytes;
1629 
1630 	want = extent_ref_type(parent, owner);
1631 	if (insert) {
1632 		extra_size = btrfs_extent_inline_ref_size(want);
1633 		path->keep_locks = 1;
1634 	} else
1635 		extra_size = -1;
1636 
1637 	/*
1638 	 * Owner is our parent level, so we can just add one to get the level
1639 	 * for the block we are interested in.
1640 	 */
1641 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1642 		key.type = BTRFS_METADATA_ITEM_KEY;
1643 		key.offset = owner;
1644 	}
1645 
1646 again:
1647 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1648 	if (ret < 0) {
1649 		err = ret;
1650 		goto out;
1651 	}
1652 
1653 	/*
1654 	 * We may be a newly converted file system which still has the old fat
1655 	 * extent entries for metadata, so try and see if we have one of those.
1656 	 */
1657 	if (ret > 0 && skinny_metadata) {
1658 		skinny_metadata = false;
1659 		if (path->slots[0]) {
1660 			path->slots[0]--;
1661 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1662 					      path->slots[0]);
1663 			if (key.objectid == bytenr &&
1664 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1665 			    key.offset == num_bytes)
1666 				ret = 0;
1667 		}
1668 		if (ret) {
1669 			key.objectid = bytenr;
1670 			key.type = BTRFS_EXTENT_ITEM_KEY;
1671 			key.offset = num_bytes;
1672 			btrfs_release_path(path);
1673 			goto again;
1674 		}
1675 	}
1676 
1677 	if (ret && !insert) {
1678 		err = -ENOENT;
1679 		goto out;
1680 	} else if (WARN_ON(ret)) {
1681 		err = -EIO;
1682 		goto out;
1683 	}
1684 
1685 	leaf = path->nodes[0];
1686 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1687 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1688 	if (item_size < sizeof(*ei)) {
1689 		if (!insert) {
1690 			err = -ENOENT;
1691 			goto out;
1692 		}
1693 		ret = convert_extent_item_v0(trans, fs_info, path, owner,
1694 					     extra_size);
1695 		if (ret < 0) {
1696 			err = ret;
1697 			goto out;
1698 		}
1699 		leaf = path->nodes[0];
1700 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1701 	}
1702 #endif
1703 	BUG_ON(item_size < sizeof(*ei));
1704 
1705 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1706 	flags = btrfs_extent_flags(leaf, ei);
1707 
1708 	ptr = (unsigned long)(ei + 1);
1709 	end = (unsigned long)ei + item_size;
1710 
1711 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1712 		ptr += sizeof(struct btrfs_tree_block_info);
1713 		BUG_ON(ptr > end);
1714 	}
1715 
1716 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1717 		needed = BTRFS_REF_TYPE_DATA;
1718 	else
1719 		needed = BTRFS_REF_TYPE_BLOCK;
1720 
1721 	err = -ENOENT;
1722 	while (1) {
1723 		if (ptr >= end) {
1724 			WARN_ON(ptr > end);
1725 			break;
1726 		}
1727 		iref = (struct btrfs_extent_inline_ref *)ptr;
1728 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1729 		if (type == BTRFS_REF_TYPE_INVALID) {
1730 			err = -EINVAL;
1731 			goto out;
1732 		}
1733 
1734 		if (want < type)
1735 			break;
1736 		if (want > type) {
1737 			ptr += btrfs_extent_inline_ref_size(type);
1738 			continue;
1739 		}
1740 
1741 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1742 			struct btrfs_extent_data_ref *dref;
1743 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1744 			if (match_extent_data_ref(leaf, dref, root_objectid,
1745 						  owner, offset)) {
1746 				err = 0;
1747 				break;
1748 			}
1749 			if (hash_extent_data_ref_item(leaf, dref) <
1750 			    hash_extent_data_ref(root_objectid, owner, offset))
1751 				break;
1752 		} else {
1753 			u64 ref_offset;
1754 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1755 			if (parent > 0) {
1756 				if (parent == ref_offset) {
1757 					err = 0;
1758 					break;
1759 				}
1760 				if (ref_offset < parent)
1761 					break;
1762 			} else {
1763 				if (root_objectid == ref_offset) {
1764 					err = 0;
1765 					break;
1766 				}
1767 				if (ref_offset < root_objectid)
1768 					break;
1769 			}
1770 		}
1771 		ptr += btrfs_extent_inline_ref_size(type);
1772 	}
1773 	if (err == -ENOENT && insert) {
1774 		if (item_size + extra_size >=
1775 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1776 			err = -EAGAIN;
1777 			goto out;
1778 		}
1779 		/*
1780 		 * To add a new inline back ref, we have to make sure
1781 		 * there is no corresponding back ref item.
1782 		 * For simplicity, we just do not add a new inline back
1783 		 * ref if there is any kind of item for this block.
1784 		 */
1785 		if (find_next_key(path, 0, &key) == 0 &&
1786 		    key.objectid == bytenr &&
1787 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1788 			err = -EAGAIN;
1789 			goto out;
1790 		}
1791 	}
1792 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1793 out:
1794 	if (insert) {
1795 		path->keep_locks = 0;
1796 		btrfs_unlock_up_safe(path, 1);
1797 	}
1798 	return err;
1799 }
1800 
1801 /*
1802  * helper to add a new inline back ref
1803  */
1804 static noinline_for_stack
1805 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1806 				 struct btrfs_path *path,
1807 				 struct btrfs_extent_inline_ref *iref,
1808 				 u64 parent, u64 root_objectid,
1809 				 u64 owner, u64 offset, int refs_to_add,
1810 				 struct btrfs_delayed_extent_op *extent_op)
1811 {
1812 	struct extent_buffer *leaf;
1813 	struct btrfs_extent_item *ei;
1814 	unsigned long ptr;
1815 	unsigned long end;
1816 	unsigned long item_offset;
1817 	u64 refs;
1818 	int size;
1819 	int type;
1820 
1821 	leaf = path->nodes[0];
1822 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1823 	item_offset = (unsigned long)iref - (unsigned long)ei;
1824 
1825 	type = extent_ref_type(parent, owner);
1826 	size = btrfs_extent_inline_ref_size(type);
1827 
1828 	btrfs_extend_item(fs_info, path, size);
1829 
1830 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1831 	refs = btrfs_extent_refs(leaf, ei);
1832 	refs += refs_to_add;
1833 	btrfs_set_extent_refs(leaf, ei, refs);
1834 	if (extent_op)
1835 		__run_delayed_extent_op(extent_op, leaf, ei);
1836 
1837 	ptr = (unsigned long)ei + item_offset;
1838 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1839 	if (ptr < end - size)
1840 		memmove_extent_buffer(leaf, ptr + size, ptr,
1841 				      end - size - ptr);
1842 
1843 	iref = (struct btrfs_extent_inline_ref *)ptr;
1844 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1845 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1846 		struct btrfs_extent_data_ref *dref;
1847 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1848 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1849 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1850 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1851 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1852 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1853 		struct btrfs_shared_data_ref *sref;
1854 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1855 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1856 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1857 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1858 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1859 	} else {
1860 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1861 	}
1862 	btrfs_mark_buffer_dirty(leaf);
1863 }
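/*
 * A rough sketch of the extent item layout that setup_inline_extent_backref()
 * above manipulates (illustrative, not to scale):
 *
 *   [btrfs_extent_item][btrfs_tree_block_info (tree blocks, non-skinny only)]
 *   [inline ref 0][inline ref 1]...
 *
 * btrfs_extend_item() grows the item by 'size' bytes, and the memmove shifts
 * every inline ref at or after 'item_offset' toward the end of the item,
 * opening a gap into which the new inline ref is then written.
 */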
1864 
1865 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1866 				 struct btrfs_fs_info *fs_info,
1867 				 struct btrfs_path *path,
1868 				 struct btrfs_extent_inline_ref **ref_ret,
1869 				 u64 bytenr, u64 num_bytes, u64 parent,
1870 				 u64 root_objectid, u64 owner, u64 offset)
1871 {
1872 	int ret;
1873 
1874 	ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
1875 					   bytenr, num_bytes, parent,
1876 					   root_objectid, owner, offset, 0);
1877 	if (ret != -ENOENT)
1878 		return ret;
1879 
1880 	btrfs_release_path(path);
1881 	*ref_ret = NULL;
1882 
1883 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1884 		ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1885 					    parent, root_objectid);
1886 	} else {
1887 		ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1888 					     parent, root_objectid, owner,
1889 					     offset);
1890 	}
1891 	return ret;
1892 }
1893 
1894 /*
1895  * helper to update/remove an inline back ref
1896  */
1897 static noinline_for_stack
1898 void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
1899 				  struct btrfs_path *path,
1900 				  struct btrfs_extent_inline_ref *iref,
1901 				  int refs_to_mod,
1902 				  struct btrfs_delayed_extent_op *extent_op,
1903 				  int *last_ref)
1904 {
1905 	struct extent_buffer *leaf;
1906 	struct btrfs_extent_item *ei;
1907 	struct btrfs_extent_data_ref *dref = NULL;
1908 	struct btrfs_shared_data_ref *sref = NULL;
1909 	unsigned long ptr;
1910 	unsigned long end;
1911 	u32 item_size;
1912 	int size;
1913 	int type;
1914 	u64 refs;
1915 
1916 	leaf = path->nodes[0];
1917 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1918 	refs = btrfs_extent_refs(leaf, ei);
1919 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1920 	refs += refs_to_mod;
1921 	btrfs_set_extent_refs(leaf, ei, refs);
1922 	if (extent_op)
1923 		__run_delayed_extent_op(extent_op, leaf, ei);
1924 
1925 	/*
1926 	 * If type is invalid, we should have bailed out after
1927 	 * lookup_inline_extent_backref().
1928 	 */
1929 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1930 	ASSERT(type != BTRFS_REF_TYPE_INVALID);
1931 
1932 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1933 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1934 		refs = btrfs_extent_data_ref_count(leaf, dref);
1935 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1936 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1937 		refs = btrfs_shared_data_ref_count(leaf, sref);
1938 	} else {
1939 		refs = 1;
1940 		BUG_ON(refs_to_mod != -1);
1941 	}
1942 
1943 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1944 	refs += refs_to_mod;
1945 
1946 	if (refs > 0) {
1947 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1948 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1949 		else
1950 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1951 	} else {
1952 		*last_ref = 1;
1953 		size = btrfs_extent_inline_ref_size(type);
1954 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1955 		ptr = (unsigned long)iref;
1956 		end = (unsigned long)ei + item_size;
1957 		if (ptr + size < end)
1958 			memmove_extent_buffer(leaf, ptr, ptr + size,
1959 					      end - ptr - size);
1960 		item_size -= size;
1961 		btrfs_truncate_item(fs_info, path, item_size, 1);
1962 	}
1963 	btrfs_mark_buffer_dirty(leaf);
1964 }
1965 
1966 static noinline_for_stack
1967 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1968 				 struct btrfs_fs_info *fs_info,
1969 				 struct btrfs_path *path,
1970 				 u64 bytenr, u64 num_bytes, u64 parent,
1971 				 u64 root_objectid, u64 owner,
1972 				 u64 offset, int refs_to_add,
1973 				 struct btrfs_delayed_extent_op *extent_op)
1974 {
1975 	struct btrfs_extent_inline_ref *iref;
1976 	int ret;
1977 
1978 	ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
1979 					   bytenr, num_bytes, parent,
1980 					   root_objectid, owner, offset, 1);
1981 	if (ret == 0) {
1982 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1983 		update_inline_extent_backref(fs_info, path, iref,
1984 					     refs_to_add, extent_op, NULL);
1985 	} else if (ret == -ENOENT) {
1986 		setup_inline_extent_backref(fs_info, path, iref, parent,
1987 					    root_objectid, owner, offset,
1988 					    refs_to_add, extent_op);
1989 		ret = 0;
1990 	}
1991 	return ret;
1992 }
1993 
1994 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1995 				 struct btrfs_fs_info *fs_info,
1996 				 struct btrfs_path *path,
1997 				 u64 bytenr, u64 parent, u64 root_objectid,
1998 				 u64 owner, u64 offset, int refs_to_add)
1999 {
2000 	int ret;
2001 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2002 		BUG_ON(refs_to_add != 1);
2003 		ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
2004 					    parent, root_objectid);
2005 	} else {
2006 		ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
2007 					     parent, root_objectid,
2008 					     owner, offset, refs_to_add);
2009 	}
2010 	return ret;
2011 }
2012 
2013 static int remove_extent_backref(struct btrfs_trans_handle *trans,
2014 				 struct btrfs_fs_info *fs_info,
2015 				 struct btrfs_path *path,
2016 				 struct btrfs_extent_inline_ref *iref,
2017 				 int refs_to_drop, int is_data, int *last_ref)
2018 {
2019 	int ret = 0;
2020 
2021 	BUG_ON(!is_data && refs_to_drop != 1);
2022 	if (iref) {
2023 		update_inline_extent_backref(fs_info, path, iref,
2024 					     -refs_to_drop, NULL, last_ref);
2025 	} else if (is_data) {
2026 		ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
2027 					     last_ref);
2028 	} else {
2029 		*last_ref = 1;
2030 		ret = btrfs_del_item(trans, fs_info->extent_root, path);
2031 	}
2032 	return ret;
2033 }
2034 
2035 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
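/*
 * Illustrative: the range is half-open, [first, first + len).  For example,
 * in_range(4096, 0, 4096) is false, while in_range(4095, 0, 4096) is true.
 */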
2036 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
2037 			       u64 *discarded_bytes)
2038 {
2039 	int j, ret = 0;
2040 	u64 bytes_left, end;
2041 	u64 aligned_start = ALIGN(start, 1 << 9);
2042 
2043 	if (WARN_ON(start != aligned_start)) {
2044 		len -= aligned_start - start;
2045 		len = round_down(len, 1 << 9);
2046 		start = aligned_start;
2047 	}
2048 
2049 	*discarded_bytes = 0;
2050 
2051 	if (!len)
2052 		return 0;
2053 
2054 	end = start + len;
2055 	bytes_left = len;
2056 
2057 	/* Skip any superblocks on this device. */
2058 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
2059 		u64 sb_start = btrfs_sb_offset(j);
2060 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2061 		u64 size = sb_start - start;
2062 
2063 		if (!in_range(sb_start, start, bytes_left) &&
2064 		    !in_range(sb_end, start, bytes_left) &&
2065 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2066 			continue;
2067 
2068 		/*
2069 		 * Superblock spans beginning of range.  Adjust start and
2070 		 * try again.
2071 		 */
2072 		if (sb_start <= start) {
2073 			start += sb_end - start;
2074 			if (start > end) {
2075 				bytes_left = 0;
2076 				break;
2077 			}
2078 			bytes_left = end - start;
2079 			continue;
2080 		}
2081 
2082 		if (size) {
2083 			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2084 						   GFP_NOFS, 0);
2085 			if (!ret)
2086 				*discarded_bytes += size;
2087 			else if (ret != -EOPNOTSUPP)
2088 				return ret;
2089 		}
2090 
2091 		start = sb_end;
2092 		if (start > end) {
2093 			bytes_left = 0;
2094 			break;
2095 		}
2096 		bytes_left = end - start;
2097 	}
2098 
2099 	if (bytes_left) {
2100 		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2101 					   GFP_NOFS, 0);
2102 		if (!ret)
2103 			*discarded_bytes += bytes_left;
2104 	}
2105 	return ret;
2106 }
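/*
 * Worked example of the superblock skipping above, assuming only the primary
 * super block copy at 64KiB (btrfs_sb_offset(0)) falls inside the range: a
 * discard of [0, 1MiB) is issued as [0, 64KiB) and
 * [64KiB + BTRFS_SUPER_INFO_SIZE, 1MiB), so the super block itself is never
 * discarded.
 */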
2107 
2108 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2109 			 u64 num_bytes, u64 *actual_bytes)
2110 {
2111 	int ret;
2112 	u64 discarded_bytes = 0;
2113 	struct btrfs_bio *bbio = NULL;
2114 
2115 
2116 	/*
2117 	 * Avoid races with device replace and make sure our bbio has devices
2118 	 * associated with its stripes that don't go away while we are discarding.
2119 	 */
2120 	btrfs_bio_counter_inc_blocked(fs_info);
2121 	/* Tell the block device(s) that the sectors can be discarded */
2122 	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2123 			      &bbio, 0);
2124 	/* Error condition is -ENOMEM */
2125 	if (!ret) {
2126 		struct btrfs_bio_stripe *stripe = bbio->stripes;
2127 		int i;
2128 
2129 
2130 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2131 			u64 bytes;
2132 			struct request_queue *req_q;
2133 
2134 			if (!stripe->dev->bdev) {
2135 				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2136 				continue;
2137 			}
2138 			req_q = bdev_get_queue(stripe->dev->bdev);
2139 			if (!blk_queue_discard(req_q))
2140 				continue;
2141 
2142 			ret = btrfs_issue_discard(stripe->dev->bdev,
2143 						  stripe->physical,
2144 						  stripe->length,
2145 						  &bytes);
2146 			if (!ret)
2147 				discarded_bytes += bytes;
2148 			else if (ret != -EOPNOTSUPP)
2149 				break; /* Logic errors or -ENOMEM, or -EIO, though it is unclear how -EIO could happen here */
2150 
2151 			/*
2152 			 * In case we get back EOPNOTSUPP for some reason,
2153 			 * ignore the return value so we don't break callers
2154 			 * of btrfs_discard_extent.
2155 			 */
2156 			ret = 0;
2157 		}
2158 		btrfs_put_bbio(bbio);
2159 	}
2160 	btrfs_bio_counter_dec(fs_info);
2161 
2162 	if (actual_bytes)
2163 		*actual_bytes = discarded_bytes;
2164 
2165 
2166 	if (ret == -EOPNOTSUPP)
2167 		ret = 0;
2168 	return ret;
2169 }
2170 
2171 /* Can return -ENOMEM */
2172 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2173 			 struct btrfs_root *root,
2174 			 u64 bytenr, u64 num_bytes, u64 parent,
2175 			 u64 root_objectid, u64 owner, u64 offset)
2176 {
2177 	struct btrfs_fs_info *fs_info = root->fs_info;
2178 	int old_ref_mod, new_ref_mod;
2179 	int ret;
2180 
2181 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2182 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
2183 
2184 	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2185 			   owner, offset, BTRFS_ADD_DELAYED_REF);
2186 
2187 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2188 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2189 						 num_bytes, parent,
2190 						 root_objectid, (int)owner,
2191 						 BTRFS_ADD_DELAYED_REF, NULL,
2192 						 &old_ref_mod, &new_ref_mod);
2193 	} else {
2194 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2195 						 num_bytes, parent,
2196 						 root_objectid, owner, offset,
2197 						 0, BTRFS_ADD_DELAYED_REF,
2198 						 &old_ref_mod, &new_ref_mod);
2199 	}
2200 
2201 	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2202 		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2203 
2204 		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2205 	}
2206 
2207 	return ret;
2208 }
2209 
2210 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2211 				  struct btrfs_fs_info *fs_info,
2212 				  struct btrfs_delayed_ref_node *node,
2213 				  u64 parent, u64 root_objectid,
2214 				  u64 owner, u64 offset, int refs_to_add,
2215 				  struct btrfs_delayed_extent_op *extent_op)
2216 {
2217 	struct btrfs_path *path;
2218 	struct extent_buffer *leaf;
2219 	struct btrfs_extent_item *item;
2220 	struct btrfs_key key;
2221 	u64 bytenr = node->bytenr;
2222 	u64 num_bytes = node->num_bytes;
2223 	u64 refs;
2224 	int ret;
2225 
2226 	path = btrfs_alloc_path();
2227 	if (!path)
2228 		return -ENOMEM;
2229 
2230 	path->reada = READA_FORWARD;
2231 	path->leave_spinning = 1;
2232 	/* this will set up the path even if it fails to insert the back ref */
2233 	ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2234 					   num_bytes, parent, root_objectid,
2235 					   owner, offset,
2236 					   refs_to_add, extent_op);
2237 	if ((ret < 0 && ret != -EAGAIN) || !ret)
2238 		goto out;
2239 
2240 	/*
2241 	 * Ok, we had -EAGAIN, which means we didn't have space to insert an
2242 	 * inline extent ref, so just update the reference count and add a
2243 	 * normal backref.
2244 	 */
2245 	leaf = path->nodes[0];
2246 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2247 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2248 	refs = btrfs_extent_refs(leaf, item);
2249 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2250 	if (extent_op)
2251 		__run_delayed_extent_op(extent_op, leaf, item);
2252 
2253 	btrfs_mark_buffer_dirty(leaf);
2254 	btrfs_release_path(path);
2255 
2256 	path->reada = READA_FORWARD;
2257 	path->leave_spinning = 1;
2258 	/* now insert the actual backref */
2259 	ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2260 				    root_objectid, owner, offset, refs_to_add);
2261 	if (ret)
2262 		btrfs_abort_transaction(trans, ret);
2263 out:
2264 	btrfs_free_path(path);
2265 	return ret;
2266 }
2267 
2268 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2269 				struct btrfs_fs_info *fs_info,
2270 				struct btrfs_delayed_ref_node *node,
2271 				struct btrfs_delayed_extent_op *extent_op,
2272 				int insert_reserved)
2273 {
2274 	int ret = 0;
2275 	struct btrfs_delayed_data_ref *ref;
2276 	struct btrfs_key ins;
2277 	u64 parent = 0;
2278 	u64 ref_root = 0;
2279 	u64 flags = 0;
2280 
2281 	ins.objectid = node->bytenr;
2282 	ins.offset = node->num_bytes;
2283 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2284 
2285 	ref = btrfs_delayed_node_to_data_ref(node);
2286 	trace_run_delayed_data_ref(fs_info, node, ref, node->action);
2287 
2288 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2289 		parent = ref->parent;
2290 	ref_root = ref->root;
2291 
2292 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2293 		if (extent_op)
2294 			flags |= extent_op->flags_to_set;
2295 		ret = alloc_reserved_file_extent(trans, fs_info,
2296 						 parent, ref_root, flags,
2297 						 ref->objectid, ref->offset,
2298 						 &ins, node->ref_mod);
2299 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2300 		ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
2301 					     ref_root, ref->objectid,
2302 					     ref->offset, node->ref_mod,
2303 					     extent_op);
2304 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2305 		ret = __btrfs_free_extent(trans, fs_info, node, parent,
2306 					  ref_root, ref->objectid,
2307 					  ref->offset, node->ref_mod,
2308 					  extent_op);
2309 	} else {
2310 		BUG();
2311 	}
2312 	return ret;
2313 }
2314 
2315 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2316 				    struct extent_buffer *leaf,
2317 				    struct btrfs_extent_item *ei)
2318 {
2319 	u64 flags = btrfs_extent_flags(leaf, ei);
2320 	if (extent_op->update_flags) {
2321 		flags |= extent_op->flags_to_set;
2322 		btrfs_set_extent_flags(leaf, ei, flags);
2323 	}
2324 
2325 	if (extent_op->update_key) {
2326 		struct btrfs_tree_block_info *bi;
2327 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2328 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2329 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2330 	}
2331 }
2332 
2333 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2334 				 struct btrfs_fs_info *fs_info,
2335 				 struct btrfs_delayed_ref_head *head,
2336 				 struct btrfs_delayed_extent_op *extent_op)
2337 {
2338 	struct btrfs_key key;
2339 	struct btrfs_path *path;
2340 	struct btrfs_extent_item *ei;
2341 	struct extent_buffer *leaf;
2342 	u32 item_size;
2343 	int ret;
2344 	int err = 0;
2345 	int metadata = !extent_op->is_data;
2346 
2347 	if (trans->aborted)
2348 		return 0;
2349 
2350 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2351 		metadata = 0;
2352 
2353 	path = btrfs_alloc_path();
2354 	if (!path)
2355 		return -ENOMEM;
2356 
2357 	key.objectid = head->bytenr;
2358 
2359 	if (metadata) {
2360 		key.type = BTRFS_METADATA_ITEM_KEY;
2361 		key.offset = extent_op->level;
2362 	} else {
2363 		key.type = BTRFS_EXTENT_ITEM_KEY;
2364 		key.offset = head->num_bytes;
2365 	}
2366 
2367 again:
2368 	path->reada = READA_FORWARD;
2369 	path->leave_spinning = 1;
2370 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2371 	if (ret < 0) {
2372 		err = ret;
2373 		goto out;
2374 	}
2375 	if (ret > 0) {
2376 		if (metadata) {
2377 			if (path->slots[0] > 0) {
2378 				path->slots[0]--;
2379 				btrfs_item_key_to_cpu(path->nodes[0], &key,
2380 						      path->slots[0]);
2381 				if (key.objectid == head->bytenr &&
2382 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
2383 				    key.offset == head->num_bytes)
2384 					ret = 0;
2385 			}
2386 			if (ret > 0) {
2387 				btrfs_release_path(path);
2388 				metadata = 0;
2389 
2390 				key.objectid = head->bytenr;
2391 				key.offset = head->num_bytes;
2392 				key.type = BTRFS_EXTENT_ITEM_KEY;
2393 				goto again;
2394 			}
2395 		} else {
2396 			err = -EIO;
2397 			goto out;
2398 		}
2399 	}
2400 
2401 	leaf = path->nodes[0];
2402 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2403 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2404 	if (item_size < sizeof(*ei)) {
2405 		ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
2406 		if (ret < 0) {
2407 			err = ret;
2408 			goto out;
2409 		}
2410 		leaf = path->nodes[0];
2411 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2412 	}
2413 #endif
2414 	BUG_ON(item_size < sizeof(*ei));
2415 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2416 	__run_delayed_extent_op(extent_op, leaf, ei);
2417 
2418 	btrfs_mark_buffer_dirty(leaf);
2419 out:
2420 	btrfs_free_path(path);
2421 	return err;
2422 }
2423 
2424 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2425 				struct btrfs_fs_info *fs_info,
2426 				struct btrfs_delayed_ref_node *node,
2427 				struct btrfs_delayed_extent_op *extent_op,
2428 				int insert_reserved)
2429 {
2430 	int ret = 0;
2431 	struct btrfs_delayed_tree_ref *ref;
2432 	u64 parent = 0;
2433 	u64 ref_root = 0;
2434 
2435 	ref = btrfs_delayed_node_to_tree_ref(node);
2436 	trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
2437 
2438 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2439 		parent = ref->parent;
2440 	ref_root = ref->root;
2441 
2442 	if (node->ref_mod != 1) {
2443 		btrfs_err(fs_info,
2444 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2445 			  node->bytenr, node->ref_mod, node->action, ref_root,
2446 			  parent);
2447 		return -EIO;
2448 	}
2449 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2450 		BUG_ON(!extent_op || !extent_op->update_flags);
2451 		ret = alloc_reserved_tree_block(trans, node, extent_op);
2452 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2453 		ret = __btrfs_inc_extent_ref(trans, fs_info, node,
2454 					     parent, ref_root,
2455 					     ref->level, 0, 1,
2456 					     extent_op);
2457 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2458 		ret = __btrfs_free_extent(trans, fs_info, node,
2459 					  parent, ref_root,
2460 					  ref->level, 0, 1, extent_op);
2461 	} else {
2462 		BUG();
2463 	}
2464 	return ret;
2465 }
2466 
2467 /* helper function to actually process a single delayed ref entry */
2468 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2469 			       struct btrfs_fs_info *fs_info,
2470 			       struct btrfs_delayed_ref_node *node,
2471 			       struct btrfs_delayed_extent_op *extent_op,
2472 			       int insert_reserved)
2473 {
2474 	int ret = 0;
2475 
2476 	if (trans->aborted) {
2477 		if (insert_reserved)
2478 			btrfs_pin_extent(fs_info, node->bytenr,
2479 					 node->num_bytes, 1);
2480 		return 0;
2481 	}
2482 
2483 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2484 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2485 		ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
2486 					   insert_reserved);
2487 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2488 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2489 		ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
2490 					   insert_reserved);
2491 	else
2492 		BUG();
2493 	return ret;
2494 }
2495 
2496 static inline struct btrfs_delayed_ref_node *
2497 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2498 {
2499 	struct btrfs_delayed_ref_node *ref;
2500 
2501 	if (RB_EMPTY_ROOT(&head->ref_tree))
2502 		return NULL;
2503 
2504 	/*
2505 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2506 	 * This is to prevent a ref count from going down to zero, which deletes
2507 	 * the extent item from the extent tree, when there still are references
2508 	 * to add, which would fail because they would not find the extent item.
2509 	 */
2510 	if (!list_empty(&head->ref_add_list))
2511 		return list_first_entry(&head->ref_add_list,
2512 				struct btrfs_delayed_ref_node, add_list);
2513 
2514 	ref = rb_entry(rb_first(&head->ref_tree),
2515 		       struct btrfs_delayed_ref_node, ref_node);
2516 	ASSERT(list_empty(&ref->add_list));
2517 	return ref;
2518 }
2519 
2520 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2521 				      struct btrfs_delayed_ref_head *head)
2522 {
2523 	spin_lock(&delayed_refs->lock);
2524 	head->processing = 0;
2525 	delayed_refs->num_heads_ready++;
2526 	spin_unlock(&delayed_refs->lock);
2527 	btrfs_delayed_ref_unlock(head);
2528 }
2529 
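/*
 * A sketch of the return convention used by cleanup_extent_op() below, as
 * consumed by cleanup_ref_head(): 0 means no extent op needed to be run and
 * head->lock is still held; 1 means the op was run successfully and
 * head->lock was dropped; <0 is an error from run_delayed_extent_op(), with
 * head->lock dropped as well.
 */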
2530 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2531 			     struct btrfs_fs_info *fs_info,
2532 			     struct btrfs_delayed_ref_head *head)
2533 {
2534 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2535 	int ret;
2536 
2537 	if (!extent_op)
2538 		return 0;
2539 	head->extent_op = NULL;
2540 	if (head->must_insert_reserved) {
2541 		btrfs_free_delayed_extent_op(extent_op);
2542 		return 0;
2543 	}
2544 	spin_unlock(&head->lock);
2545 	ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
2546 	btrfs_free_delayed_extent_op(extent_op);
2547 	return ret ? ret : 1;
2548 }
2549 
2550 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2551 			    struct btrfs_fs_info *fs_info,
2552 			    struct btrfs_delayed_ref_head *head)
2553 {
2554 	struct btrfs_delayed_ref_root *delayed_refs;
2555 	int ret;
2556 
2557 	delayed_refs = &trans->transaction->delayed_refs;
2558 
2559 	ret = cleanup_extent_op(trans, fs_info, head);
2560 	if (ret < 0) {
2561 		unselect_delayed_ref_head(delayed_refs, head);
2562 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2563 		return ret;
2564 	} else if (ret) {
2565 		return ret;
2566 	}
2567 
2568 	/*
2569 	 * Need to drop our head ref lock and re-acquire the delayed ref lock
2570 	 * and then re-check to make sure nobody got added.
2571 	 */
2572 	spin_unlock(&head->lock);
2573 	spin_lock(&delayed_refs->lock);
2574 	spin_lock(&head->lock);
2575 	if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
2576 		spin_unlock(&head->lock);
2577 		spin_unlock(&delayed_refs->lock);
2578 		return 1;
2579 	}
2580 	delayed_refs->num_heads--;
2581 	rb_erase(&head->href_node, &delayed_refs->href_root);
2582 	RB_CLEAR_NODE(&head->href_node);
2583 	spin_unlock(&head->lock);
2584 	spin_unlock(&delayed_refs->lock);
2585 	atomic_dec(&delayed_refs->num_entries);
2586 
2587 	trace_run_delayed_ref_head(fs_info, head, 0);
2588 
2589 	if (head->total_ref_mod < 0) {
2590 		struct btrfs_space_info *space_info;
2591 		u64 flags;
2592 
2593 		if (head->is_data)
2594 			flags = BTRFS_BLOCK_GROUP_DATA;
2595 		else if (head->is_system)
2596 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
2597 		else
2598 			flags = BTRFS_BLOCK_GROUP_METADATA;
2599 		space_info = __find_space_info(fs_info, flags);
2600 		ASSERT(space_info);
2601 		percpu_counter_add(&space_info->total_bytes_pinned,
2602 				   -head->num_bytes);
2603 
2604 		if (head->is_data) {
2605 			spin_lock(&delayed_refs->lock);
2606 			delayed_refs->pending_csums -= head->num_bytes;
2607 			spin_unlock(&delayed_refs->lock);
2608 		}
2609 	}
2610 
2611 	if (head->must_insert_reserved) {
2612 		btrfs_pin_extent(fs_info, head->bytenr,
2613 				 head->num_bytes, 1);
2614 		if (head->is_data) {
2615 			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2616 					      head->num_bytes);
2617 		}
2618 	}
2619 
2620 	/* Also free its reserved qgroup space */
2621 	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2622 				      head->qgroup_reserved);
2623 	btrfs_delayed_ref_unlock(head);
2624 	btrfs_put_delayed_ref_head(head);
2625 	return 0;
2626 }
2627 
2628 /*
2629  * Returns 0 on success or if called with an already aborted transaction.
2630  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2631  */
2632 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2633 					     unsigned long nr)
2634 {
2635 	struct btrfs_fs_info *fs_info = trans->fs_info;
2636 	struct btrfs_delayed_ref_root *delayed_refs;
2637 	struct btrfs_delayed_ref_node *ref;
2638 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2639 	struct btrfs_delayed_extent_op *extent_op;
2640 	ktime_t start = ktime_get();
2641 	int ret;
2642 	unsigned long count = 0;
2643 	unsigned long actual_count = 0;
2644 	int must_insert_reserved = 0;
2645 
2646 	delayed_refs = &trans->transaction->delayed_refs;
2647 	while (1) {
2648 		if (!locked_ref) {
2649 			if (count >= nr)
2650 				break;
2651 
2652 			spin_lock(&delayed_refs->lock);
2653 			locked_ref = btrfs_select_ref_head(trans);
2654 			if (!locked_ref) {
2655 				spin_unlock(&delayed_refs->lock);
2656 				break;
2657 			}
2658 
2659 			/* grab the lock that says we are going to process
2660 			 * all the refs for this head */
2661 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2662 			spin_unlock(&delayed_refs->lock);
2663 			/*
2664 			 * we may have dropped the spin lock to get the head
2665 			 * mutex lock, and that might have given someone else
2666 			 * time to free the head.  If that's true, it has been
2667 			 * removed from our list and we can move on.
2668 			 */
2669 			if (ret == -EAGAIN) {
2670 				locked_ref = NULL;
2671 				count++;
2672 				continue;
2673 			}
2674 		}
2675 
2676 		/*
2677 		 * We need to try and merge add/drops of the same ref since we
2678 		 * can run into issues with relocate dropping the implicit ref
2679 		 * and then it being added back again before the drop can
2680 		 * finish.  If we merged anything we need to re-loop so we can
2681 		 * get a good ref.
2682 		 * Or we can get node references of the same type that weren't
2683 		 * merged when created due to bumps in the tree mod seq, and
2684 		 * we need to merge them to prevent adding an inline extent
2685 		 * backref before dropping it (triggering a BUG_ON at
2686 		 * insert_inline_extent_backref()).
2687 		 */
2688 		spin_lock(&locked_ref->lock);
2689 		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2690 
2691 		ref = select_delayed_ref(locked_ref);
2692 
2693 		if (ref && ref->seq &&
2694 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
2695 			spin_unlock(&locked_ref->lock);
2696 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2697 			locked_ref = NULL;
2698 			cond_resched();
2699 			count++;
2700 			continue;
2701 		}
2702 
2703 		/*
2704 		 * We're done processing refs in this ref_head, clean everything
2705 		 * up and move on to the next ref_head.
2706 		 */
2707 		if (!ref) {
2708 			ret = cleanup_ref_head(trans, fs_info, locked_ref);
2709 			if (ret > 0) {
2710 				/* We dropped our lock, we need to loop. */
2711 				ret = 0;
2712 				continue;
2713 			} else if (ret) {
2714 				return ret;
2715 			}
2716 			locked_ref = NULL;
2717 			count++;
2718 			continue;
2719 		}
2720 
2721 		actual_count++;
2722 		ref->in_tree = 0;
2723 		rb_erase(&ref->ref_node, &locked_ref->ref_tree);
2724 		RB_CLEAR_NODE(&ref->ref_node);
2725 		if (!list_empty(&ref->add_list))
2726 			list_del(&ref->add_list);
2727 		/*
2728 		 * When we play the delayed ref, also correct the ref_mod on
2729 		 * the head.
2730 		 */
2731 		switch (ref->action) {
2732 		case BTRFS_ADD_DELAYED_REF:
2733 		case BTRFS_ADD_DELAYED_EXTENT:
2734 			locked_ref->ref_mod -= ref->ref_mod;
2735 			break;
2736 		case BTRFS_DROP_DELAYED_REF:
2737 			locked_ref->ref_mod += ref->ref_mod;
2738 			break;
2739 		default:
2740 			WARN_ON(1);
2741 		}
2742 		atomic_dec(&delayed_refs->num_entries);
2743 
2744 		/*
2745 		 * Record the must_insert_reserved flag before we drop the spin
2746 		 * lock.
2747 		 */
2748 		must_insert_reserved = locked_ref->must_insert_reserved;
2749 		locked_ref->must_insert_reserved = 0;
2750 
2751 		extent_op = locked_ref->extent_op;
2752 		locked_ref->extent_op = NULL;
2753 		spin_unlock(&locked_ref->lock);
2754 
2755 		ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
2756 					  must_insert_reserved);
2757 
2758 		btrfs_free_delayed_extent_op(extent_op);
2759 		if (ret) {
2760 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2761 			btrfs_put_delayed_ref(ref);
2762 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2763 				    ret);
2764 			return ret;
2765 		}
2766 
2767 		btrfs_put_delayed_ref(ref);
2768 		count++;
2769 		cond_resched();
2770 	}
2771 
2772 	/*
2773 	 * We don't want to include ref heads, since we can have empty ref heads
2774 	 * and those would drastically skew our runtime down: for them we just
2775 	 * do accounting, no actual extent tree updates.
2776 	 */
2777 	if (actual_count > 0) {
2778 		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2779 		u64 avg;
2780 
2781 		/*
2782 		 * We weigh the current average higher than our current runtime
2783 		 * to avoid large swings in the average.
2784 		 */
2785 		spin_lock(&delayed_refs->lock);
2786 		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2787 		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
2788 		spin_unlock(&delayed_refs->lock);
2789 	}
2790 	return 0;
2791 }
2792 
2793 #ifdef SCRAMBLE_DELAYED_REFS
2794 /*
2795  * Normally delayed refs get processed in ascending bytenr order. This
2796  * correlates in most cases with the order added. To expose dependencies on this
2797  * order, we start to process the tree in the middle instead of the beginning.
2798  */
2799 static u64 find_middle(struct rb_root *root)
2800 {
2801 	struct rb_node *n = root->rb_node;
2802 	struct btrfs_delayed_ref_node *entry;
2803 	int alt = 1;
2804 	u64 middle;
2805 	u64 first = 0, last = 0;
2806 
2807 	n = rb_first(root);
2808 	if (n) {
2809 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2810 		first = entry->bytenr;
2811 	}
2812 	n = rb_last(root);
2813 	if (n) {
2814 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2815 		last = entry->bytenr;
2816 	}
2817 	n = root->rb_node;
2818 
2819 	while (n) {
2820 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2821 		WARN_ON(!entry->in_tree);
2822 
2823 		middle = entry->bytenr;
2824 
2825 		if (alt)
2826 			n = n->rb_left;
2827 		else
2828 			n = n->rb_right;
2829 
2830 		alt = 1 - alt;
2831 	}
2832 	return middle;
2833 }
2834 #endif
2835 
2836 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2837 {
2838 	u64 num_bytes;
2839 
2840 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2841 			     sizeof(struct btrfs_extent_inline_ref));
2842 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2843 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2844 
2845 	/*
2846 	 * We don't ever fill up leaves all the way, so the caller doubles this
2847 	 * estimate to get closer to what we're really going to want to use.
2848 	 */
2849 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2850 }
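/*
 * Rough numbers only, assuming a 16KiB nodesize: each head costs a few tens
 * of bytes here (a 24-byte extent item plus one inline ref, plus a
 * btrfs_tree_block_info without skinny metadata), so a single leaf covers on
 * the order of a few hundred heads before the caller doubles the estimate.
 */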
2851 
2852 /*
2853  * Takes the number of bytes to be csummed and figures out how many leaves it
2854  * would require to store the csums for that many bytes.
2855  */
2856 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2857 {
2858 	u64 csum_size;
2859 	u64 num_csums_per_leaf;
2860 	u64 num_csums;
2861 
2862 	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2863 	num_csums_per_leaf = div64_u64(csum_size,
2864 			(u64)btrfs_super_csum_size(fs_info->super_copy));
2865 	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2866 	num_csums += num_csums_per_leaf - 1;
2867 	num_csums = div64_u64(num_csums, num_csums_per_leaf);
2868 	return num_csums;
2869 }
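/*
 * Illustrative arithmetic, assuming a 4KiB sectorsize, 4-byte crc32c csums
 * and a 16KiB nodesize: one leaf holds roughly 16KiB / 4 ~= 4000 csums, each
 * csum covers one 4KiB sector, so checksumming 1GiB of data (262144 sectors)
 * needs about 65 leaves.
 */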
2870 
2871 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2872 				       struct btrfs_fs_info *fs_info)
2873 {
2874 	struct btrfs_block_rsv *global_rsv;
2875 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2876 	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2877 	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2878 	u64 num_bytes, num_dirty_bgs_bytes;
2879 	int ret = 0;
2880 
2881 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2882 	num_heads = heads_to_leaves(fs_info, num_heads);
2883 	if (num_heads > 1)
2884 		num_bytes += (num_heads - 1) * fs_info->nodesize;
2885 	num_bytes <<= 1;
2886 	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2887 							fs_info->nodesize;
2888 	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2889 							     num_dirty_bgs);
2890 	global_rsv = &fs_info->global_block_rsv;
2891 
2892 	/*
2893 	 * If we can't allocate any more chunks, let's make sure we have _lots_
2894 	 * of wiggle room, since running delayed refs can create more delayed refs.
2895 	 */
2896 	if (global_rsv->space_info->full) {
2897 		num_dirty_bgs_bytes <<= 1;
2898 		num_bytes <<= 1;
2899 	}
2900 
2901 	spin_lock(&global_rsv->lock);
2902 	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2903 		ret = 1;
2904 	spin_unlock(&global_rsv->lock);
2905 	return ret;
2906 }
2907 
2908 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2909 				       struct btrfs_fs_info *fs_info)
2910 {
2911 	u64 num_entries =
2912 		atomic_read(&trans->transaction->delayed_refs.num_entries);
2913 	u64 avg_runtime;
2914 	u64 val;
2915 
2916 	smp_mb();
2917 	avg_runtime = fs_info->avg_delayed_ref_runtime;
2918 	val = num_entries * avg_runtime;
2919 	if (val >= NSEC_PER_SEC)
2920 		return 1;
2921 	if (val >= NSEC_PER_SEC / 2)
2922 		return 2;
2923 
2924 	return btrfs_check_space_for_delayed_refs(trans, fs_info);
2925 }
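/*
 * For scale (illustrative numbers only): with an average delayed ref runtime
 * of 1ms, roughly 1000 pending entries amount to a second of estimated work
 * (return 1) and roughly 500 to half a second (return 2); below that the
 * decision falls back to btrfs_check_space_for_delayed_refs().
 */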
2926 
2927 struct async_delayed_refs {
2928 	struct btrfs_root *root;
2929 	u64 transid;
2930 	int count;
2931 	int error;
2932 	int sync;
2933 	struct completion wait;
2934 	struct btrfs_work work;
2935 };
2936 
2937 static inline struct async_delayed_refs *
2938 to_async_delayed_refs(struct btrfs_work *work)
2939 {
2940 	return container_of(work, struct async_delayed_refs, work);
2941 }
2942 
2943 static void delayed_ref_async_start(struct btrfs_work *work)
2944 {
2945 	struct async_delayed_refs *async = to_async_delayed_refs(work);
2946 	struct btrfs_trans_handle *trans;
2947 	struct btrfs_fs_info *fs_info = async->root->fs_info;
2948 	int ret;
2949 
2950 	/* if the commit is already started, we don't need to wait here */
2951 	if (btrfs_transaction_blocked(fs_info))
2952 		goto done;
2953 
2954 	trans = btrfs_join_transaction(async->root);
2955 	if (IS_ERR(trans)) {
2956 		async->error = PTR_ERR(trans);
2957 		goto done;
2958 	}
2959 
2960 	/*
2961 	 * trans->sync means that when we call end_transaction, we won't
2962 	 * wait on delayed refs
2963 	 */
2964 	trans->sync = true;
2965 
2966 	/* Don't bother flushing if we got into a different transaction */
2967 	if (trans->transid > async->transid)
2968 		goto end;
2969 
2970 	ret = btrfs_run_delayed_refs(trans, async->count);
2971 	if (ret)
2972 		async->error = ret;
2973 end:
2974 	ret = btrfs_end_transaction(trans);
2975 	if (ret && !async->error)
2976 		async->error = ret;
2977 done:
2978 	if (async->sync)
2979 		complete(&async->wait);
2980 	else
2981 		kfree(async);
2982 }
2983 
2984 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2985 				 unsigned long count, u64 transid, int wait)
2986 {
2987 	struct async_delayed_refs *async;
2988 	int ret;
2989 
2990 	async = kmalloc(sizeof(*async), GFP_NOFS);
2991 	if (!async)
2992 		return -ENOMEM;
2993 
2994 	async->root = fs_info->tree_root;
2995 	async->count = count;
2996 	async->error = 0;
2997 	async->transid = transid;
2998 	if (wait)
2999 		async->sync = 1;
3000 	else
3001 		async->sync = 0;
3002 	init_completion(&async->wait);
3003 
3004 	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
3005 			delayed_ref_async_start, NULL, NULL);
3006 
3007 	btrfs_queue_work(fs_info->extent_workers, &async->work);
3008 
3009 	if (wait) {
3010 		wait_for_completion(&async->wait);
3011 		ret = async->error;
3012 		kfree(async);
3013 		return ret;
3014 	}
3015 	return 0;
3016 }
3017 
3018 /*
3019  * This starts processing the delayed reference count updates and
3020  * extent insertions we have queued up so far.  count can be
3021  * 0, which means to process everything in the tree at the start
3022  * of the run (but not newly added entries), or it can be some target
3023  * number you'd like to process.
3024  *
3025  * Returns 0 on success or if called with an aborted transaction.
3026  * Returns <0 on error and aborts the transaction.
3027  */
3028 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3029 			   unsigned long count)
3030 {
3031 	struct btrfs_fs_info *fs_info = trans->fs_info;
3032 	struct rb_node *node;
3033 	struct btrfs_delayed_ref_root *delayed_refs;
3034 	struct btrfs_delayed_ref_head *head;
3035 	int ret;
3036 	int run_all = count == (unsigned long)-1;
3037 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
3038 
3039 	/* We'll clean this up in btrfs_cleanup_transaction */
3040 	if (trans->aborted)
3041 		return 0;
3042 
3043 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
3044 		return 0;
3045 
3046 	delayed_refs = &trans->transaction->delayed_refs;
3047 	if (count == 0)
3048 		count = atomic_read(&delayed_refs->num_entries) * 2;
3049 
3050 again:
3051 #ifdef SCRAMBLE_DELAYED_REFS
3052 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3053 #endif
3054 	trans->can_flush_pending_bgs = false;
3055 	ret = __btrfs_run_delayed_refs(trans, count);
3056 	if (ret < 0) {
3057 		btrfs_abort_transaction(trans, ret);
3058 		return ret;
3059 	}
3060 
3061 	if (run_all) {
3062 		if (!list_empty(&trans->new_bgs))
3063 			btrfs_create_pending_block_groups(trans);
3064 
3065 		spin_lock(&delayed_refs->lock);
3066 		node = rb_first(&delayed_refs->href_root);
3067 		if (!node) {
3068 			spin_unlock(&delayed_refs->lock);
3069 			goto out;
3070 		}
3071 		head = rb_entry(node, struct btrfs_delayed_ref_head,
3072 				href_node);
3073 		refcount_inc(&head->refs);
3074 		spin_unlock(&delayed_refs->lock);
3075 
3076 		/* Mutex was contended, block until it's released and retry. */
3077 		mutex_lock(&head->mutex);
3078 		mutex_unlock(&head->mutex);
3079 
3080 		btrfs_put_delayed_ref_head(head);
3081 		cond_resched();
3082 		goto again;
3083 	}
3084 out:
3085 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
3086 	return 0;
3087 }
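/*
 * A sketch of the count conventions used by callers of
 * btrfs_run_delayed_refs(): 0 processes roughly everything queued at the
 * start of the call (count is expanded to twice the current number of
 * entries), a positive value is a best-effort target, and (unsigned long)-1
 * keeps looping, including refs added while it runs, until the delayed ref
 * tree is empty.
 */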
3088 
3089 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3090 				struct btrfs_fs_info *fs_info,
3091 				u64 bytenr, u64 num_bytes, u64 flags,
3092 				int level, int is_data)
3093 {
3094 	struct btrfs_delayed_extent_op *extent_op;
3095 	int ret;
3096 
3097 	extent_op = btrfs_alloc_delayed_extent_op();
3098 	if (!extent_op)
3099 		return -ENOMEM;
3100 
3101 	extent_op->flags_to_set = flags;
3102 	extent_op->update_flags = true;
3103 	extent_op->update_key = false;
3104 	extent_op->is_data = is_data ? true : false;
3105 	extent_op->level = level;
3106 
3107 	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3108 					  num_bytes, extent_op);
3109 	if (ret)
3110 		btrfs_free_delayed_extent_op(extent_op);
3111 	return ret;
3112 }
3113 
3114 static noinline int check_delayed_ref(struct btrfs_root *root,
3115 				      struct btrfs_path *path,
3116 				      u64 objectid, u64 offset, u64 bytenr)
3117 {
3118 	struct btrfs_delayed_ref_head *head;
3119 	struct btrfs_delayed_ref_node *ref;
3120 	struct btrfs_delayed_data_ref *data_ref;
3121 	struct btrfs_delayed_ref_root *delayed_refs;
3122 	struct btrfs_transaction *cur_trans;
3123 	struct rb_node *node;
3124 	int ret = 0;
3125 
3126 	spin_lock(&root->fs_info->trans_lock);
3127 	cur_trans = root->fs_info->running_transaction;
3128 	if (cur_trans)
3129 		refcount_inc(&cur_trans->use_count);
3130 	spin_unlock(&root->fs_info->trans_lock);
3131 	if (!cur_trans)
3132 		return 0;
3133 
3134 	delayed_refs = &cur_trans->delayed_refs;
3135 	spin_lock(&delayed_refs->lock);
3136 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3137 	if (!head) {
3138 		spin_unlock(&delayed_refs->lock);
3139 		btrfs_put_transaction(cur_trans);
3140 		return 0;
3141 	}
3142 
3143 	if (!mutex_trylock(&head->mutex)) {
3144 		refcount_inc(&head->refs);
3145 		spin_unlock(&delayed_refs->lock);
3146 
3147 		btrfs_release_path(path);
3148 
3149 		/*
3150 		 * Mutex was contended; block until it's released and let the
3151 		 * caller try again.
3152 		 */
3153 		mutex_lock(&head->mutex);
3154 		mutex_unlock(&head->mutex);
3155 		btrfs_put_delayed_ref_head(head);
3156 		btrfs_put_transaction(cur_trans);
3157 		return -EAGAIN;
3158 	}
3159 	spin_unlock(&delayed_refs->lock);
3160 
3161 	spin_lock(&head->lock);
3162 	/*
3163 	 * XXX: We should replace this with a proper search function in the
3164 	 * future.
3165 	 */
3166 	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
3167 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3168 		/* If it's a shared ref we know a cross reference exists */
3169 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3170 			ret = 1;
3171 			break;
3172 		}
3173 
3174 		data_ref = btrfs_delayed_node_to_data_ref(ref);
3175 
3176 		/*
3177 		 * If our ref doesn't match the one we're currently looking at
3178 		 * then we have a cross reference.
3179 		 */
3180 		if (data_ref->root != root->root_key.objectid ||
3181 		    data_ref->objectid != objectid ||
3182 		    data_ref->offset != offset) {
3183 			ret = 1;
3184 			break;
3185 		}
3186 	}
3187 	spin_unlock(&head->lock);
3188 	mutex_unlock(&head->mutex);
3189 	btrfs_put_transaction(cur_trans);
3190 	return ret;
3191 }
3192 
3193 static noinline int check_committed_ref(struct btrfs_root *root,
3194 					struct btrfs_path *path,
3195 					u64 objectid, u64 offset, u64 bytenr)
3196 {
3197 	struct btrfs_fs_info *fs_info = root->fs_info;
3198 	struct btrfs_root *extent_root = fs_info->extent_root;
3199 	struct extent_buffer *leaf;
3200 	struct btrfs_extent_data_ref *ref;
3201 	struct btrfs_extent_inline_ref *iref;
3202 	struct btrfs_extent_item *ei;
3203 	struct btrfs_key key;
3204 	u32 item_size;
3205 	int type;
3206 	int ret;
3207 
3208 	key.objectid = bytenr;
3209 	key.offset = (u64)-1;
3210 	key.type = BTRFS_EXTENT_ITEM_KEY;
3211 
3212 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3213 	if (ret < 0)
3214 		goto out;
3215 	BUG_ON(ret == 0); /* Corruption */
3216 
3217 	ret = -ENOENT;
3218 	if (path->slots[0] == 0)
3219 		goto out;
3220 
3221 	path->slots[0]--;
3222 	leaf = path->nodes[0];
3223 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3224 
3225 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3226 		goto out;
3227 
3228 	ret = 1;
3229 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3230 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3231 	if (item_size < sizeof(*ei)) {
3232 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3233 		goto out;
3234 	}
3235 #endif
3236 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3237 
3238 	if (item_size != sizeof(*ei) +
3239 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3240 		goto out;
3241 
3242 	if (btrfs_extent_generation(leaf, ei) <=
3243 	    btrfs_root_last_snapshot(&root->root_item))
3244 		goto out;
3245 
3246 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3247 
3248 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3249 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
3250 		goto out;
3251 
3252 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3253 	if (btrfs_extent_refs(leaf, ei) !=
3254 	    btrfs_extent_data_ref_count(leaf, ref) ||
3255 	    btrfs_extent_data_ref_root(leaf, ref) !=
3256 	    root->root_key.objectid ||
3257 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3258 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
3259 		goto out;
3260 
3261 	ret = 0;
3262 out:
3263 	return ret;
3264 }
3265 
3266 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3267 			  u64 bytenr)
3268 {
3269 	struct btrfs_path *path;
3270 	int ret;
3271 	int ret2;
3272 
3273 	path = btrfs_alloc_path();
3274 	if (!path)
3275 		return -ENOMEM;
3276 
3277 	do {
3278 		ret = check_committed_ref(root, path, objectid,
3279 					  offset, bytenr);
3280 		if (ret && ret != -ENOENT)
3281 			goto out;
3282 
3283 		ret2 = check_delayed_ref(root, path, objectid,
3284 					 offset, bytenr);
3285 	} while (ret2 == -EAGAIN);
3286 
3287 	if (ret2 && ret2 != -ENOENT) {
3288 		ret = ret2;
3289 		goto out;
3290 	}
3291 
3292 	if (ret != -ENOENT || ret2 != -ENOENT)
3293 		ret = 0;
3294 out:
3295 	btrfs_free_path(path);
3296 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3297 		WARN_ON(ret > 0);
3298 	return ret;
3299 }
3300 
3301 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3302 			   struct btrfs_root *root,
3303 			   struct extent_buffer *buf,
3304 			   int full_backref, int inc)
3305 {
3306 	struct btrfs_fs_info *fs_info = root->fs_info;
3307 	u64 bytenr;
3308 	u64 num_bytes;
3309 	u64 parent;
3310 	u64 ref_root;
3311 	u32 nritems;
3312 	struct btrfs_key key;
3313 	struct btrfs_file_extent_item *fi;
3314 	int i;
3315 	int level;
3316 	int ret = 0;
3317 	int (*process_func)(struct btrfs_trans_handle *,
3318 			    struct btrfs_root *,
3319 			    u64, u64, u64, u64, u64, u64);
3320 
3321 
3322 	if (btrfs_is_testing(fs_info))
3323 		return 0;
3324 
3325 	ref_root = btrfs_header_owner(buf);
3326 	nritems = btrfs_header_nritems(buf);
3327 	level = btrfs_header_level(buf);
3328 
3329 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3330 		return 0;
3331 
3332 	if (inc)
3333 		process_func = btrfs_inc_extent_ref;
3334 	else
3335 		process_func = btrfs_free_extent;
3336 
3337 	if (full_backref)
3338 		parent = buf->start;
3339 	else
3340 		parent = 0;
3341 
3342 	for (i = 0; i < nritems; i++) {
3343 		if (level == 0) {
3344 			btrfs_item_key_to_cpu(buf, &key, i);
3345 			if (key.type != BTRFS_EXTENT_DATA_KEY)
3346 				continue;
3347 			fi = btrfs_item_ptr(buf, i,
3348 					    struct btrfs_file_extent_item);
3349 			if (btrfs_file_extent_type(buf, fi) ==
3350 			    BTRFS_FILE_EXTENT_INLINE)
3351 				continue;
3352 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3353 			if (bytenr == 0)
3354 				continue;
3355 
3356 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3357 			key.offset -= btrfs_file_extent_offset(buf, fi);
3358 			ret = process_func(trans, root, bytenr, num_bytes,
3359 					   parent, ref_root, key.objectid,
3360 					   key.offset);
3361 			if (ret)
3362 				goto fail;
3363 		} else {
3364 			bytenr = btrfs_node_blockptr(buf, i);
3365 			num_bytes = fs_info->nodesize;
3366 			ret = process_func(trans, root, bytenr, num_bytes,
3367 					   parent, ref_root, level - 1, 0);
3368 			if (ret)
3369 				goto fail;
3370 		}
3371 	}
3372 	return 0;
3373 fail:
3374 	return ret;
3375 }
3376 
3377 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3378 		  struct extent_buffer *buf, int full_backref)
3379 {
3380 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3381 }
3382 
3383 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3384 		  struct extent_buffer *buf, int full_backref)
3385 {
3386 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3387 }
3388 
3389 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3390 				 struct btrfs_fs_info *fs_info,
3391 				 struct btrfs_path *path,
3392 				 struct btrfs_block_group_cache *cache)
3393 {
3394 	int ret;
3395 	struct btrfs_root *extent_root = fs_info->extent_root;
3396 	unsigned long bi;
3397 	struct extent_buffer *leaf;
3398 
3399 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3400 	if (ret) {
3401 		if (ret > 0)
3402 			ret = -ENOENT;
3403 		goto fail;
3404 	}
3405 
3406 	leaf = path->nodes[0];
3407 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3408 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3409 	btrfs_mark_buffer_dirty(leaf);
3410 fail:
3411 	btrfs_release_path(path);
3412 	return ret;
3413 
3414 }
3415 
3416 static struct btrfs_block_group_cache *
3417 next_block_group(struct btrfs_fs_info *fs_info,
3418 		 struct btrfs_block_group_cache *cache)
3419 {
3420 	struct rb_node *node;
3421 
3422 	spin_lock(&fs_info->block_group_cache_lock);
3423 
3424 	/* If our block group was removed, we need a full search. */
3425 	if (RB_EMPTY_NODE(&cache->cache_node)) {
3426 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3427 
3428 		spin_unlock(&fs_info->block_group_cache_lock);
3429 		btrfs_put_block_group(cache);
3430 		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
3431 	}
3432 	node = rb_next(&cache->cache_node);
3433 	btrfs_put_block_group(cache);
3434 	if (node) {
3435 		cache = rb_entry(node, struct btrfs_block_group_cache,
3436 				 cache_node);
3437 		btrfs_get_block_group(cache);
3438 	} else
3439 		cache = NULL;
3440 	spin_unlock(&fs_info->block_group_cache_lock);
3441 	return cache;
3442 }
3443 
3444 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3445 			    struct btrfs_trans_handle *trans,
3446 			    struct btrfs_path *path)
3447 {
3448 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3449 	struct btrfs_root *root = fs_info->tree_root;
3450 	struct inode *inode = NULL;
3451 	struct extent_changeset *data_reserved = NULL;
3452 	u64 alloc_hint = 0;
3453 	int dcs = BTRFS_DC_ERROR;
3454 	u64 num_pages = 0;
3455 	int retries = 0;
3456 	int ret = 0;
3457 
3458 	/*
3459 	 * If this block group is smaller than 100 MiB, don't bother caching
3460 	 * the block group.
3461 	 */
3462 	if (block_group->key.offset < (100 * SZ_1M)) {
3463 		spin_lock(&block_group->lock);
3464 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3465 		spin_unlock(&block_group->lock);
3466 		return 0;
3467 	}
3468 
3469 	if (trans->aborted)
3470 		return 0;
3471 again:
3472 	inode = lookup_free_space_inode(fs_info, block_group, path);
3473 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3474 		ret = PTR_ERR(inode);
3475 		btrfs_release_path(path);
3476 		goto out;
3477 	}
3478 
3479 	if (IS_ERR(inode)) {
3480 		BUG_ON(retries);
3481 		retries++;
3482 
3483 		if (block_group->ro)
3484 			goto out_free;
3485 
3486 		ret = create_free_space_inode(fs_info, trans, block_group,
3487 					      path);
3488 		if (ret)
3489 			goto out_free;
3490 		goto again;
3491 	}
3492 
3493 	/*
3494 	 * We want to set the generation to 0, so that if anything goes wrong
3495 	 * from here on out we know not to trust this cache when we load it up
3496 	 * next time.
3497 	 */
3498 	BTRFS_I(inode)->generation = 0;
3499 	ret = btrfs_update_inode(trans, root, inode);
3500 	if (ret) {
3501 		/*
3502 		 * Theoretically we could recover from this by simply setting the
3503 		 * super cache generation to 0 so we know to invalidate the
3504 		 * cache, but then we'd have to keep track of the block groups
3505 		 * that fail this way so we know we _have_ to reset this cache
3506 		 * before the next commit or risk reading a stale cache.  So, to
3507 		 * limit our exposure to horrible edge cases, let's just abort
3508 		 * the transaction; this only happens in really bad situations
3509 		 * anyway.
3510 		 */
3511 		btrfs_abort_transaction(trans, ret);
3512 		goto out_put;
3513 	}
3514 	WARN_ON(ret);
3515 
3516 	/* We've already set up this transaction, go ahead and exit */
3517 	if (block_group->cache_generation == trans->transid &&
3518 	    i_size_read(inode)) {
3519 		dcs = BTRFS_DC_SETUP;
3520 		goto out_put;
3521 	}
3522 
3523 	if (i_size_read(inode) > 0) {
3524 		ret = btrfs_check_trunc_cache_free_space(fs_info,
3525 					&fs_info->global_block_rsv);
3526 		if (ret)
3527 			goto out_put;
3528 
3529 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3530 		if (ret)
3531 			goto out_put;
3532 	}
3533 
3534 	spin_lock(&block_group->lock);
3535 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3536 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3537 		/*
3538 		 * Don't bother trying to write stuff out _if_
3539 		 * a) we're not cached,
3540 		 * b) we're mounted with the nospace_cache option,
3541 		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3542 		 */
3543 		dcs = BTRFS_DC_WRITTEN;
3544 		spin_unlock(&block_group->lock);
3545 		goto out_put;
3546 	}
3547 	spin_unlock(&block_group->lock);
3548 
3549 	/*
3550 	 * We hit an ENOSPC when setting up the cache in this transaction, so
3551 	 * skip doing the setup; we've already cleared the cache, so we're safe.
3552 	 */
3553 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3554 		ret = -ENOSPC;
3555 		goto out_put;
3556 	}
3557 
3558 	/*
3559 	 * Try to preallocate enough space based on how big the block group is.
3560 	 * Keep in mind this has to include any pinned space which could end up
3561 	 * taking up quite a bit since it's not folded into the other space
3562 	 * cache.
3563 	 */
3564 	num_pages = div_u64(block_group->key.offset, SZ_256M);
3565 	if (!num_pages)
3566 		num_pages = 1;
3567 
3568 	num_pages *= 16;
3569 	num_pages *= PAGE_SIZE;
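	/*
	 * Illustrative with 4KiB pages: a 1GiB block group yields 4 * 16 = 64
	 * pages, i.e. 256KiB of space cache file preallocated below.
	 */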
3570 
3571 	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3572 	if (ret)
3573 		goto out_put;
3574 
3575 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3576 					      num_pages, num_pages,
3577 					      &alloc_hint);
3578 	/*
3579 	 * Our cache requires contiguous chunks so that we don't modify a bunch
3580 	 * of metadata or split extents when writing the cache out, which means
3581 	 * we can enospc if we are heavily fragmented in addition to just normal
3582 	 * we can hit ENOSPC if we are heavily fragmented, in addition to normal
3583 	 * other block groups for this transaction, maybe we'll unpin enough
3584 	 * space the next time around.
3585 	 */
3586 	if (!ret)
3587 		dcs = BTRFS_DC_SETUP;
3588 	else if (ret == -ENOSPC)
3589 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3590 
3591 out_put:
3592 	iput(inode);
3593 out_free:
3594 	btrfs_release_path(path);
3595 out:
3596 	spin_lock(&block_group->lock);
3597 	if (!ret && dcs == BTRFS_DC_SETUP)
3598 		block_group->cache_generation = trans->transid;
3599 	block_group->disk_cache_state = dcs;
3600 	spin_unlock(&block_group->lock);
3601 
3602 	extent_changeset_free(data_reserved);
3603 	return ret;
3604 }
3605 
3606 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3607 			    struct btrfs_fs_info *fs_info)
3608 {
3609 	struct btrfs_block_group_cache *cache, *tmp;
3610 	struct btrfs_transaction *cur_trans = trans->transaction;
3611 	struct btrfs_path *path;
3612 
3613 	if (list_empty(&cur_trans->dirty_bgs) ||
3614 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
3615 		return 0;
3616 
3617 	path = btrfs_alloc_path();
3618 	if (!path)
3619 		return -ENOMEM;
3620 
3621 	/* Could add new block groups, use _safe just in case */
3622 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3623 				 dirty_list) {
3624 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3625 			cache_save_setup(cache, trans, path);
3626 	}
3627 
3628 	btrfs_free_path(path);
3629 	return 0;
3630 }
3631 
3632 /*
3633  * transaction commit does final block group cache writeback during a
3634  * critical section where nothing is allowed to change the FS.  This is
3635  * required in order for the cache to actually match the block group,
3636  * but can introduce a lot of latency into the commit.
3637  *
3638  * So, btrfs_start_dirty_block_groups is here to kick off block group
3639  * cache IO.  There's a chance we'll have to redo some of it if the
3640  * block group changes again during the commit, but it greatly reduces
3641  * the commit latency by getting rid of the easy block groups while
3642  * we're still allowing others to join the commit.
3643  */
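/*
 * Rough picture of the two-phase flow (illustrative sketch only; the actual
 * callers live in the transaction commit path, not in this file):
 *
 *	btrfs_start_dirty_block_groups(trans)      - early, while other tasks
 *						     may still join the commit
 *	btrfs_write_dirty_block_groups(trans, ..)  - later, inside the commit
 *						     critical section
 *
 * Anything that gets re-dirtied between the two calls is simply written
 * again by the second pass.
 */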
3644 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3645 {
3646 	struct btrfs_fs_info *fs_info = trans->fs_info;
3647 	struct btrfs_block_group_cache *cache;
3648 	struct btrfs_transaction *cur_trans = trans->transaction;
3649 	int ret = 0;
3650 	int should_put;
3651 	struct btrfs_path *path = NULL;
3652 	LIST_HEAD(dirty);
3653 	struct list_head *io = &cur_trans->io_bgs;
3654 	int num_started = 0;
3655 	int loops = 0;
3656 
3657 	spin_lock(&cur_trans->dirty_bgs_lock);
3658 	if (list_empty(&cur_trans->dirty_bgs)) {
3659 		spin_unlock(&cur_trans->dirty_bgs_lock);
3660 		return 0;
3661 	}
3662 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3663 	spin_unlock(&cur_trans->dirty_bgs_lock);
3664 
3665 again:
3666 	/*
3667 	 * make sure all the block groups on our dirty list actually
3668 	 * exist
3669 	 */
3670 	btrfs_create_pending_block_groups(trans);
3671 
3672 	if (!path) {
3673 		path = btrfs_alloc_path();
3674 		if (!path)
3675 			return -ENOMEM;
3676 	}
3677 
3678 	/*
3679 	 * cache_write_mutex is here only to save us from balance or automatic
3680 	 * removal of empty block groups deleting this block group while we are
3681 	 * writing out the cache
3682 	 */
3683 	mutex_lock(&trans->transaction->cache_write_mutex);
3684 	while (!list_empty(&dirty)) {
3685 		cache = list_first_entry(&dirty,
3686 					 struct btrfs_block_group_cache,
3687 					 dirty_list);
3688 		/*
3689 		 * this can happen if something re-dirties a block
3690 		 * group that is already under IO.  Just wait for it to
3691 		 * finish and then do it all again
3692 		 */
3693 		if (!list_empty(&cache->io_list)) {
3694 			list_del_init(&cache->io_list);
3695 			btrfs_wait_cache_io(trans, cache, path);
3696 			btrfs_put_block_group(cache);
3697 		}
3698 
3699 
3700 		/*
3701 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3702 		 * if it should update the cache_state.  Don't delete
3703 		 * until after we wait.
3704 		 *
3705 		 * Since we're not running in the commit critical section
3706 		 * we need the dirty_bgs_lock to protect from update_block_group
3707 		 */
3708 		spin_lock(&cur_trans->dirty_bgs_lock);
3709 		list_del_init(&cache->dirty_list);
3710 		spin_unlock(&cur_trans->dirty_bgs_lock);
3711 
3712 		should_put = 1;
3713 
3714 		cache_save_setup(cache, trans, path);
3715 
3716 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3717 			cache->io_ctl.inode = NULL;
3718 			ret = btrfs_write_out_cache(fs_info, trans,
3719 						    cache, path);
3720 			if (ret == 0 && cache->io_ctl.inode) {
3721 				num_started++;
3722 				should_put = 0;
3723 
3724 				/*
3725 				 * The cache_write_mutex is protecting the
3726 				 * io_list, also refer to the definition of
3727 				 * btrfs_transaction::io_bgs for more details
3728 				 */
3729 				list_add_tail(&cache->io_list, io);
3730 			} else {
3731 				/*
3732 				 * if we failed to write the cache, the
3733 				 * generation will be bad and life goes on
3734 				 */
3735 				ret = 0;
3736 			}
3737 		}
3738 		if (!ret) {
3739 			ret = write_one_cache_group(trans, fs_info,
3740 						    path, cache);
3741 			/*
3742 			 * Our block group might still be attached to the list
3743 			 * of new block groups in the transaction handle of some
3744 			 * other task (struct btrfs_trans_handle->new_bgs). This
3745 			 * means its block group item isn't yet in the extent
3746 			 * tree. If this happens ignore the error, as we will
3747 			 * try again later in the critical section of the
3748 			 * transaction commit.
3749 			 */
3750 			if (ret == -ENOENT) {
3751 				ret = 0;
3752 				spin_lock(&cur_trans->dirty_bgs_lock);
3753 				if (list_empty(&cache->dirty_list)) {
3754 					list_add_tail(&cache->dirty_list,
3755 						      &cur_trans->dirty_bgs);
3756 					btrfs_get_block_group(cache);
3757 				}
3758 				spin_unlock(&cur_trans->dirty_bgs_lock);
3759 			} else if (ret) {
3760 				btrfs_abort_transaction(trans, ret);
3761 			}
3762 		}
3763 
3764 		/* if it's not on the io list, we need to put the block group */
3765 		if (should_put)
3766 			btrfs_put_block_group(cache);
3767 
3768 		if (ret)
3769 			break;
3770 
3771 		/*
3772 		 * Avoid blocking other tasks for too long. It might even save
3773 		 * us from writing caches for block groups that are going to be
3774 		 * removed.
3775 		 */
3776 		mutex_unlock(&trans->transaction->cache_write_mutex);
3777 		mutex_lock(&trans->transaction->cache_write_mutex);
3778 	}
3779 	mutex_unlock(&trans->transaction->cache_write_mutex);
3780 
3781 	/*
3782 	 * go through delayed refs for all the stuff we've just kicked off
3783 	 * and then loop back (just once)
3784 	 */
3785 	ret = btrfs_run_delayed_refs(trans, 0);
3786 	if (!ret && loops == 0) {
3787 		loops++;
3788 		spin_lock(&cur_trans->dirty_bgs_lock);
3789 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3790 		/*
3791 		 * dirty_bgs_lock protects us from concurrent block group
3792 		 * deletes too (not just cache_write_mutex).
3793 		 */
3794 		if (!list_empty(&dirty)) {
3795 			spin_unlock(&cur_trans->dirty_bgs_lock);
3796 			goto again;
3797 		}
3798 		spin_unlock(&cur_trans->dirty_bgs_lock);
3799 	} else if (ret < 0) {
3800 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3801 	}
3802 
3803 	btrfs_free_path(path);
3804 	return ret;
3805 }
3806 
3807 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3808 				   struct btrfs_fs_info *fs_info)
3809 {
3810 	struct btrfs_block_group_cache *cache;
3811 	struct btrfs_transaction *cur_trans = trans->transaction;
3812 	int ret = 0;
3813 	int should_put;
3814 	struct btrfs_path *path;
3815 	struct list_head *io = &cur_trans->io_bgs;
3816 	int num_started = 0;
3817 
3818 	path = btrfs_alloc_path();
3819 	if (!path)
3820 		return -ENOMEM;
3821 
3822 	/*
3823 	 * Even though we are in the critical section of the transaction commit,
3824 	 * we can still have concurrent tasks adding elements to this
3825 	 * transaction's list of dirty block groups. These tasks correspond to
3826 	 * endio free space workers started when writeback finishes for a
3827 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3828 	 * allocate new block groups as a result of COWing nodes of the root
3829 	 * tree when updating the free space inode. The writeback for the space
3830 	 * caches is triggered by an earlier call to
3831 	 * btrfs_start_dirty_block_groups() and iterations of the following
3832 	 * loop.
3833 	 * Also we want to do the cache_save_setup first and then run the
3834 	 * delayed refs to make sure we have the best chance at doing this all
3835 	 * in one shot.
3836 	 */
3837 	spin_lock(&cur_trans->dirty_bgs_lock);
3838 	while (!list_empty(&cur_trans->dirty_bgs)) {
3839 		cache = list_first_entry(&cur_trans->dirty_bgs,
3840 					 struct btrfs_block_group_cache,
3841 					 dirty_list);
3842 
3843 		/*
3844 		 * this can happen if cache_save_setup re-dirties a block
3845 		 * group that is already under IO.  Just wait for it to
3846 		 * finish and then do it all again
3847 		 */
3848 		if (!list_empty(&cache->io_list)) {
3849 			spin_unlock(&cur_trans->dirty_bgs_lock);
3850 			list_del_init(&cache->io_list);
3851 			btrfs_wait_cache_io(trans, cache, path);
3852 			btrfs_put_block_group(cache);
3853 			spin_lock(&cur_trans->dirty_bgs_lock);
3854 		}
3855 
3856 		/*
3857 		 * don't remove from the dirty list until after we've waited
3858 		 * on any pending IO
3859 		 */
3860 		list_del_init(&cache->dirty_list);
3861 		spin_unlock(&cur_trans->dirty_bgs_lock);
3862 		should_put = 1;
3863 
3864 		cache_save_setup(cache, trans, path);
3865 
3866 		if (!ret)
3867 			ret = btrfs_run_delayed_refs(trans,
3868 						     (unsigned long) -1);
3869 
3870 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3871 			cache->io_ctl.inode = NULL;
3872 			ret = btrfs_write_out_cache(fs_info, trans,
3873 						    cache, path);
3874 			if (ret == 0 && cache->io_ctl.inode) {
3875 				num_started++;
3876 				should_put = 0;
3877 				list_add_tail(&cache->io_list, io);
3878 			} else {
3879 				/*
3880 				 * if we failed to write the cache, the
3881 				 * generation will be bad and life goes on
3882 				 */
3883 				ret = 0;
3884 			}
3885 		}
3886 		if (!ret) {
3887 			ret = write_one_cache_group(trans, fs_info,
3888 						    path, cache);
3889 			/*
3890 			 * One of the free space endio workers might have
3891 			 * created a new block group while updating a free space
3892 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3893 			 * and hasn't released its transaction handle yet, in
3894 			 * which case the new block group is still attached to
3895 			 * its transaction handle and its creation has not
3896 			 * finished yet (no block group item in the extent tree
3897 			 * yet, etc). If this is the case, wait for all free
3898 			 * space endio workers to finish and retry. This is a
3899 			 * very rare case, so no need for a more efficient and
3900 			 * complex approach.
3901 			 */
3902 			if (ret == -ENOENT) {
3903 				wait_event(cur_trans->writer_wait,
3904 				   atomic_read(&cur_trans->num_writers) == 1);
3905 				ret = write_one_cache_group(trans, fs_info,
3906 							    path, cache);
3907 			}
3908 			if (ret)
3909 				btrfs_abort_transaction(trans, ret);
3910 		}
3911 
3912 		/* if it's not on the io list, we need to put the block group */
3913 		if (should_put)
3914 			btrfs_put_block_group(cache);
3915 		spin_lock(&cur_trans->dirty_bgs_lock);
3916 	}
3917 	spin_unlock(&cur_trans->dirty_bgs_lock);
3918 
3919 	/*
3920 	 * Refer to the definition of io_bgs member for details why it's safe
3921 	 * to use it without any locking
3922 	 */
3923 	while (!list_empty(io)) {
3924 		cache = list_first_entry(io, struct btrfs_block_group_cache,
3925 					 io_list);
3926 		list_del_init(&cache->io_list);
3927 		btrfs_wait_cache_io(trans, cache, path);
3928 		btrfs_put_block_group(cache);
3929 	}
3930 
3931 	btrfs_free_path(path);
3932 	return ret;
3933 }
3934 
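/*
 * Return 1 if @bytenr falls into a read-only block group, or into no block
 * group at all, and 0 otherwise.
 */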
3935 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3936 {
3937 	struct btrfs_block_group_cache *block_group;
3938 	int readonly = 0;
3939 
3940 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
3941 	if (!block_group || block_group->ro)
3942 		readonly = 1;
3943 	if (block_group)
3944 		btrfs_put_block_group(block_group);
3945 	return readonly;
3946 }
3947 
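/*
 * Take a nocow writer reference on the block group containing @bytenr.
 * Returns false if the block group is missing or read-only.  On success the
 * block group reference from the lookup is kept and is dropped later by
 * btrfs_dec_nocow_writers().
 */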
3948 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3949 {
3950 	struct btrfs_block_group_cache *bg;
3951 	bool ret = true;
3952 
3953 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3954 	if (!bg)
3955 		return false;
3956 
3957 	spin_lock(&bg->lock);
3958 	if (bg->ro)
3959 		ret = false;
3960 	else
3961 		atomic_inc(&bg->nocow_writers);
3962 	spin_unlock(&bg->lock);
3963 
3964 	/* no put on block group, done by btrfs_dec_nocow_writers */
3965 	if (!ret)
3966 		btrfs_put_block_group(bg);
3967 
3968 	return ret;
3969 
3970 }
3971 
3972 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3973 {
3974 	struct btrfs_block_group_cache *bg;
3975 
3976 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3977 	ASSERT(bg);
3978 	if (atomic_dec_and_test(&bg->nocow_writers))
3979 		wake_up_var(&bg->nocow_writers);
3980 	/*
3981 	 * Once for our lookup and once for the lookup done by a previous call
3982 	 * to btrfs_inc_nocow_writers()
3983 	 */
3984 	btrfs_put_block_group(bg);
3985 	btrfs_put_block_group(bg);
3986 }
3987 
3988 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3989 {
3990 	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3991 }
3992 
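/*
 * Human readable name for a block group type, used to name the space_info's
 * sysfs directory.
 */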
3993 static const char *alloc_name(u64 flags)
3994 {
3995 	switch (flags) {
3996 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3997 		return "mixed";
3998 	case BTRFS_BLOCK_GROUP_METADATA:
3999 		return "metadata";
4000 	case BTRFS_BLOCK_GROUP_DATA:
4001 		return "data";
4002 	case BTRFS_BLOCK_GROUP_SYSTEM:
4003 		return "system";
4004 	default:
4005 		WARN_ON(1);
4006 		return "invalid-combination";
4007 	}
4008 }
4009 
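/*
 * Allocate and initialize a space_info for the block group type bits in
 * @flags, register it in sysfs and add it to fs_info->space_info.  The data
 * space_info is additionally cached in fs_info->data_sinfo.
 */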
4010 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
4011 {
4012 
4013 	struct btrfs_space_info *space_info;
4014 	int i;
4015 	int ret;
4016 
4017 	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
4018 	if (!space_info)
4019 		return -ENOMEM;
4020 
4021 	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
4022 				 GFP_KERNEL);
4023 	if (ret) {
4024 		kfree(space_info);
4025 		return ret;
4026 	}
4027 
4028 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
4029 		INIT_LIST_HEAD(&space_info->block_groups[i]);
4030 	init_rwsem(&space_info->groups_sem);
4031 	spin_lock_init(&space_info->lock);
4032 	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
4033 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4034 	init_waitqueue_head(&space_info->wait);
4035 	INIT_LIST_HEAD(&space_info->ro_bgs);
4036 	INIT_LIST_HEAD(&space_info->tickets);
4037 	INIT_LIST_HEAD(&space_info->priority_tickets);
4038 
4039 	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
4040 				    info->space_info_kobj, "%s",
4041 				    alloc_name(space_info->flags));
4042 	if (ret) {
4043 		percpu_counter_destroy(&space_info->total_bytes_pinned);
4044 		kfree(space_info);
4045 		return ret;
4046 	}
4047 
4048 	list_add_rcu(&space_info->list, &info->space_info);
4049 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4050 		info->data_sinfo = space_info;
4051 
4052 	return ret;
4053 }
4054 
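/*
 * Fold a new block group's sizes into its (already existing) space_info.
 * The on-disk numbers are doubled for the profiles that keep two copies
 * (DUP/RAID1/RAID10).
 */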
4055 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4056 			     u64 total_bytes, u64 bytes_used,
4057 			     u64 bytes_readonly,
4058 			     struct btrfs_space_info **space_info)
4059 {
4060 	struct btrfs_space_info *found;
4061 	int factor;
4062 
4063 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
4064 		     BTRFS_BLOCK_GROUP_RAID10))
4065 		factor = 2;
4066 	else
4067 		factor = 1;
4068 
4069 	found = __find_space_info(info, flags);
4070 	ASSERT(found);
4071 	spin_lock(&found->lock);
4072 	found->total_bytes += total_bytes;
4073 	found->disk_total += total_bytes * factor;
4074 	found->bytes_used += bytes_used;
4075 	found->disk_used += bytes_used * factor;
4076 	found->bytes_readonly += bytes_readonly;
4077 	if (total_bytes > 0)
4078 		found->full = 0;
4079 	space_info_add_new_bytes(info, found, total_bytes -
4080 				 bytes_used - bytes_readonly);
4081 	spin_unlock(&found->lock);
4082 	*space_info = found;
4083 }
4084 
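/*
 * Record that a chunk with this profile now exists, so later allocations of
 * the same block group type can consider it an available profile.
 */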
4085 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4086 {
4087 	u64 extra_flags = chunk_to_extended(flags) &
4088 				BTRFS_EXTENDED_PROFILE_MASK;
4089 
4090 	write_seqlock(&fs_info->profiles_lock);
4091 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4092 		fs_info->avail_data_alloc_bits |= extra_flags;
4093 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
4094 		fs_info->avail_metadata_alloc_bits |= extra_flags;
4095 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4096 		fs_info->avail_system_alloc_bits |= extra_flags;
4097 	write_sequnlock(&fs_info->profiles_lock);
4098 }
4099 
4100 /*
4101  * returns target flags in extended format or 0 if restripe for this
4102  * chunk_type is not in progress
4103  *
4104  * should be called with balance_lock held
4105  */
4106 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4107 {
4108 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4109 	u64 target = 0;
4110 
4111 	if (!bctl)
4112 		return 0;
4113 
4114 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
4115 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4116 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4117 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4118 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4119 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4120 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4121 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4122 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4123 	}
4124 
4125 	return target;
4126 }
4127 
4128 /*
4129  * @flags: available profiles in extended format (see ctree.h)
4130  *
4131  * Returns reduced profile in chunk format.  If profile changing is in
4132  * progress (either running or paused) picks the target profile (if it's
4133  * already available), otherwise falls back to plain reducing.
4134  */
4135 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4136 {
4137 	u64 num_devices = fs_info->fs_devices->rw_devices;
4138 	u64 target;
4139 	u64 raid_type;
4140 	u64 allowed = 0;
4141 
4142 	/*
4143 	 * see if restripe for this chunk_type is in progress, if so
4144 	 * try to reduce to the target profile
4145 	 */
4146 	spin_lock(&fs_info->balance_lock);
4147 	target = get_restripe_target(fs_info, flags);
4148 	if (target) {
4149 		/* pick target profile only if it's already available */
4150 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4151 			spin_unlock(&fs_info->balance_lock);
4152 			return extended_to_chunk(target);
4153 		}
4154 	}
4155 	spin_unlock(&fs_info->balance_lock);
4156 
4157 	/* First, mask out the RAID levels which aren't possible */
4158 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4159 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4160 			allowed |= btrfs_raid_array[raid_type].bg_flag;
4161 	}
4162 	allowed &= flags;
4163 
4164 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4165 		allowed = BTRFS_BLOCK_GROUP_RAID6;
4166 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4167 		allowed = BTRFS_BLOCK_GROUP_RAID5;
4168 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4169 		allowed = BTRFS_BLOCK_GROUP_RAID10;
4170 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4171 		allowed = BTRFS_BLOCK_GROUP_RAID1;
4172 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4173 		allowed = BTRFS_BLOCK_GROUP_RAID0;
4174 
4175 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4176 
4177 	return extended_to_chunk(flags | allowed);
4178 }
4179 
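/*
 * Start from the requested block group type, add every profile currently
 * available for that type (sampled consistently via the profiles_lock
 * seqlock) and reduce the result to a single allocatable profile.
 */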
4180 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4181 {
4182 	unsigned seq;
4183 	u64 flags;
4184 
4185 	do {
4186 		flags = orig_flags;
4187 		seq = read_seqbegin(&fs_info->profiles_lock);
4188 
4189 		if (flags & BTRFS_BLOCK_GROUP_DATA)
4190 			flags |= fs_info->avail_data_alloc_bits;
4191 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4192 			flags |= fs_info->avail_system_alloc_bits;
4193 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4194 			flags |= fs_info->avail_metadata_alloc_bits;
4195 	} while (read_seqretry(&fs_info->profiles_lock, seq));
4196 
4197 	return btrfs_reduce_alloc_profile(fs_info, flags);
4198 }
4199 
4200 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4201 {
4202 	struct btrfs_fs_info *fs_info = root->fs_info;
4203 	u64 flags;
4204 	u64 ret;
4205 
4206 	if (data)
4207 		flags = BTRFS_BLOCK_GROUP_DATA;
4208 	else if (root == fs_info->chunk_root)
4209 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
4210 	else
4211 		flags = BTRFS_BLOCK_GROUP_METADATA;
4212 
4213 	ret = get_alloc_profile(fs_info, flags);
4214 	return ret;
4215 }
4216 
4217 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4218 {
4219 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4220 }
4221 
4222 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4223 {
4224 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4225 }
4226 
4227 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4228 {
4229 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4230 }
4231 
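/*
 * Total space in @s_info that is accounted as used in some form;
 * bytes_may_use is only included when @may_use_included is true.
 */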
4232 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4233 				 bool may_use_included)
4234 {
4235 	ASSERT(s_info);
4236 	return s_info->bytes_used + s_info->bytes_reserved +
4237 		s_info->bytes_pinned + s_info->bytes_readonly +
4238 		(may_use_included ? s_info->bytes_may_use : 0);
4239 }
4240 
4241 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4242 {
4243 	struct btrfs_root *root = inode->root;
4244 	struct btrfs_fs_info *fs_info = root->fs_info;
4245 	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4246 	u64 used;
4247 	int ret = 0;
4248 	int need_commit = 2;
4249 	int have_pinned_space;
4250 
4251 	/* make sure bytes are sectorsize aligned */
4252 	bytes = ALIGN(bytes, fs_info->sectorsize);
4253 
4254 	if (btrfs_is_free_space_inode(inode)) {
4255 		need_commit = 0;
4256 		ASSERT(current->journal_info);
4257 	}
4258 
4259 again:
4260 	/* make sure we have enough space to handle the data first */
4261 	spin_lock(&data_sinfo->lock);
4262 	used = btrfs_space_info_used(data_sinfo, true);
4263 
4264 	if (used + bytes > data_sinfo->total_bytes) {
4265 		struct btrfs_trans_handle *trans;
4266 
4267 		/*
4268 		 * if we don't have enough free bytes in this space then we need
4269 		 * to alloc a new chunk.
4270 		 */
4271 		if (!data_sinfo->full) {
4272 			u64 alloc_target;
4273 
4274 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4275 			spin_unlock(&data_sinfo->lock);
4276 
4277 			alloc_target = btrfs_data_alloc_profile(fs_info);
4278 			/*
4279 			 * It is ugly that we don't call nolock join
4280 			 * transaction for the free space inode case here.
4281 			 * But it is safe because we only do the data space
4282 			 * reservation for the free space cache in the
4283 			 * transaction context, the common join transaction
4284 			 * transaction context; the common join transaction
4285 			 * just increases the counter of the current transaction
4286 			 * handle and doesn't try to acquire the trans_lock of
4287 			 */
4288 			trans = btrfs_join_transaction(root);
4289 			if (IS_ERR(trans))
4290 				return PTR_ERR(trans);
4291 
4292 			ret = do_chunk_alloc(trans, fs_info, alloc_target,
4293 					     CHUNK_ALLOC_NO_FORCE);
4294 			btrfs_end_transaction(trans);
4295 			if (ret < 0) {
4296 				if (ret != -ENOSPC)
4297 					return ret;
4298 				else {
4299 					have_pinned_space = 1;
4300 					goto commit_trans;
4301 				}
4302 			}
4303 
4304 			goto again;
4305 		}
4306 
4307 		/*
4308 		 * If we don't have enough pinned space to deal with this
4309 		 * allocation, and no chunk was removed in the current
4310 		 * transaction, don't bother committing the transaction.
4311 		 */
4312 		have_pinned_space = percpu_counter_compare(
4313 			&data_sinfo->total_bytes_pinned,
4314 			used + bytes - data_sinfo->total_bytes);
4315 		spin_unlock(&data_sinfo->lock);
4316 
4317 		/* commit the current transaction and try again */
4318 commit_trans:
4319 		if (need_commit) {
4320 			need_commit--;
4321 
4322 			if (need_commit > 0) {
4323 				btrfs_start_delalloc_roots(fs_info, -1);
4324 				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4325 							 (u64)-1);
4326 			}
4327 
4328 			trans = btrfs_join_transaction(root);
4329 			if (IS_ERR(trans))
4330 				return PTR_ERR(trans);
4331 			if (have_pinned_space >= 0 ||
4332 			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4333 				     &trans->transaction->flags) ||
4334 			    need_commit > 0) {
4335 				ret = btrfs_commit_transaction(trans);
4336 				if (ret)
4337 					return ret;
4338 				/*
4339 				 * The cleaner kthread might still be doing iput
4340 				 * operations. Wait for it to finish so that
4341 				 * more space is released.
4342 				 */
4343 				mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4344 				mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4345 				goto again;
4346 			} else {
4347 				btrfs_end_transaction(trans);
4348 			}
4349 		}
4350 
4351 		trace_btrfs_space_reservation(fs_info,
4352 					      "space_info:enospc",
4353 					      data_sinfo->flags, bytes, 1);
4354 		return -ENOSPC;
4355 	}
4356 	data_sinfo->bytes_may_use += bytes;
4357 	trace_btrfs_space_reservation(fs_info, "space_info",
4358 				      data_sinfo->flags, bytes, 1);
4359 	spin_unlock(&data_sinfo->lock);
4360 
4361 	return ret;
4362 }
4363 
4364 int btrfs_check_data_free_space(struct inode *inode,
4365 			struct extent_changeset **reserved, u64 start, u64 len)
4366 {
4367 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4368 	int ret;
4369 
4370 	/* align the range */
4371 	len = round_up(start + len, fs_info->sectorsize) -
4372 	      round_down(start, fs_info->sectorsize);
4373 	start = round_down(start, fs_info->sectorsize);
4374 
4375 	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4376 	if (ret < 0)
4377 		return ret;
4378 
4379 	/* Use the new btrfs_qgroup_reserve_data() to reserve precious data space. */
4380 	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4381 	if (ret < 0)
4382 		btrfs_free_reserved_data_space_noquota(inode, start, len);
4383 	else
4384 		ret = 0;
4385 	return ret;
4386 }
4387 
4388 /*
4389  * Called if we need to clear a data reservation for this inode
4390  * Normally in a error case.
4391  * Normally in an error case.
4392  * This one will *NOT* use the accurate qgroup reserved space API, just for the
4393  * case where we can't sleep and are sure it won't affect qgroup reserved space.
4394  * Like clear_bit_hook().
4395  */
4396 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4397 					    u64 len)
4398 {
4399 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4400 	struct btrfs_space_info *data_sinfo;
4401 
4402 	/* Make sure the range is aligned to sectorsize */
4403 	len = round_up(start + len, fs_info->sectorsize) -
4404 	      round_down(start, fs_info->sectorsize);
4405 	start = round_down(start, fs_info->sectorsize);
4406 
4407 	data_sinfo = fs_info->data_sinfo;
4408 	spin_lock(&data_sinfo->lock);
4409 	if (WARN_ON(data_sinfo->bytes_may_use < len))
4410 		data_sinfo->bytes_may_use = 0;
4411 	else
4412 		data_sinfo->bytes_may_use -= len;
4413 	trace_btrfs_space_reservation(fs_info, "space_info",
4414 				      data_sinfo->flags, len, 0);
4415 	spin_unlock(&data_sinfo->lock);
4416 }
4417 
4418 /*
4419  * Called if we need to clear a data reservation for this inode
4420  * Normally in an error case.
4421  *
4422  * This one will handle the per-inode data rsv map for accurate reserved
4423  * space framework.
4424  */
4425 void btrfs_free_reserved_data_space(struct inode *inode,
4426 			struct extent_changeset *reserved, u64 start, u64 len)
4427 {
4428 	struct btrfs_root *root = BTRFS_I(inode)->root;
4429 
4430 	/* Make sure the range is aligned to sectorsize */
4431 	len = round_up(start + len, root->fs_info->sectorsize) -
4432 	      round_down(start, root->fs_info->sectorsize);
4433 	start = round_down(start, root->fs_info->sectorsize);
4434 
4435 	btrfs_free_reserved_data_space_noquota(inode, start, len);
4436 	btrfs_qgroup_free_data(inode, reserved, start, len);
4437 }
4438 
4439 static void force_metadata_allocation(struct btrfs_fs_info *info)
4440 {
4441 	struct list_head *head = &info->space_info;
4442 	struct btrfs_space_info *found;
4443 
4444 	rcu_read_lock();
4445 	list_for_each_entry_rcu(found, head, list) {
4446 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4447 			found->force_alloc = CHUNK_ALLOC_FORCE;
4448 	}
4449 	rcu_read_unlock();
4450 }
4451 
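/*
 * Amount of space we conservatively treat as needed by the global reserve:
 * twice its current size.
 */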
4452 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4453 {
4454 	return (global->size << 1);
4455 }
4456 
4457 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4458 			      struct btrfs_space_info *sinfo, int force)
4459 {
4460 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4461 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
4462 	u64 thresh;
4463 
4464 	if (force == CHUNK_ALLOC_FORCE)
4465 		return 1;
4466 
4467 	/*
4468 	 * We need to take into account the global rsv because for all intents
4469 	 * and purposes it's used space.  Don't worry about locking the
4470 	 * global_rsv, it doesn't change except when the transaction commits.
4471 	 */
4472 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4473 		bytes_used += calc_global_rsv_need_space(global_rsv);
4474 
4475 	/*
4476 	 * in limited mode, we want to have some free space up to
4477 	 * about 1% of the FS size.
4478 	 */
4479 	if (force == CHUNK_ALLOC_LIMITED) {
4480 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
4481 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4482 
4483 		if (sinfo->total_bytes - bytes_used < thresh)
4484 			return 1;
4485 	}
4486 
4487 	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4488 		return 0;
4489 	return 1;
4490 }
4491 
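/*
 * Number of devices whose device items need updating when a chunk of the
 * given @type is allocated or removed.
 */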
4492 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4493 {
4494 	u64 num_dev;
4495 
4496 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4497 		    BTRFS_BLOCK_GROUP_RAID0 |
4498 		    BTRFS_BLOCK_GROUP_RAID5 |
4499 		    BTRFS_BLOCK_GROUP_RAID6))
4500 		num_dev = fs_info->fs_devices->rw_devices;
4501 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
4502 		num_dev = 2;
4503 	else
4504 		num_dev = 1;	/* DUP or single */
4505 
4506 	return num_dev;
4507 }
4508 
4509 /*
4510  * Reserve space in the system space_info as needed for allocating or
4511  * removing a chunk of the given @type (device items to update plus one
4512  * chunk item to add or remove).
4513  */
4514 void check_system_chunk(struct btrfs_trans_handle *trans,
4515 			struct btrfs_fs_info *fs_info, u64 type)
4516 {
4517 	struct btrfs_space_info *info;
4518 	u64 left;
4519 	u64 thresh;
4520 	int ret = 0;
4521 	u64 num_devs;
4522 
4523 	/*
4524 	 * Needed because we can end up allocating a system chunk and need an
4525 	 * atomic and race-free space reservation in the chunk block reserve.
4526 	 */
4527 	lockdep_assert_held(&fs_info->chunk_mutex);
4528 
4529 	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4530 	spin_lock(&info->lock);
4531 	left = info->total_bytes - btrfs_space_info_used(info, true);
4532 	spin_unlock(&info->lock);
4533 
4534 	num_devs = get_profile_num_devs(fs_info, type);
4535 
4536 	/* num_devs device items to update and 1 chunk item to add or remove */
4537 	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4538 		btrfs_calc_trans_metadata_size(fs_info, 1);
4539 
4540 	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4541 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4542 			   left, thresh, type);
4543 		dump_space_info(fs_info, info, 0, 0);
4544 	}
4545 
4546 	if (left < thresh) {
4547 		u64 flags = btrfs_system_alloc_profile(fs_info);
4548 
4549 		/*
4550 		 * Ignore failure to create system chunk. We might end up not
4551 		 * needing it, as we might not need to COW all nodes/leafs from
4552 		 * the paths we visit in the chunk tree (they were already COWed
4553 		 * or created in the current transaction for example).
4554 		 */
4555 		ret = btrfs_alloc_chunk(trans, fs_info, flags);
4556 	}
4557 
4558 	if (!ret) {
4559 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
4560 					  &fs_info->chunk_block_rsv,
4561 					  thresh, BTRFS_RESERVE_NO_FLUSH);
4562 		if (!ret)
4563 			trans->chunk_bytes_reserved += thresh;
4564 	}
4565 }
4566 
4567 /*
4568  * If force is CHUNK_ALLOC_FORCE:
4569  *    - return 1 if it successfully allocates a chunk,
4570  *    - return errors including -ENOSPC otherwise.
4571  * If force is NOT CHUNK_ALLOC_FORCE:
4572  *    - return 0 if it doesn't need to allocate a new chunk,
4573  *    - return 1 if it successfully allocates a chunk,
4574  *    - return errors including -ENOSPC otherwise.
4575  */
4576 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4577 			  struct btrfs_fs_info *fs_info, u64 flags, int force)
4578 {
4579 	struct btrfs_space_info *space_info;
4580 	int wait_for_alloc = 0;
4581 	int ret = 0;
4582 
4583 	/* Don't re-enter if we're already allocating a chunk */
4584 	if (trans->allocating_chunk)
4585 		return -ENOSPC;
4586 
4587 	space_info = __find_space_info(fs_info, flags);
4588 	ASSERT(space_info);
4589 
4590 again:
4591 	spin_lock(&space_info->lock);
4592 	if (force < space_info->force_alloc)
4593 		force = space_info->force_alloc;
4594 	if (space_info->full) {
4595 		if (should_alloc_chunk(fs_info, space_info, force))
4596 			ret = -ENOSPC;
4597 		else
4598 			ret = 0;
4599 		spin_unlock(&space_info->lock);
4600 		return ret;
4601 	}
4602 
4603 	if (!should_alloc_chunk(fs_info, space_info, force)) {
4604 		spin_unlock(&space_info->lock);
4605 		return 0;
4606 	} else if (space_info->chunk_alloc) {
4607 		wait_for_alloc = 1;
4608 	} else {
4609 		space_info->chunk_alloc = 1;
4610 	}
4611 
4612 	spin_unlock(&space_info->lock);
4613 
4614 	mutex_lock(&fs_info->chunk_mutex);
4615 
4616 	/*
4617 	 * The chunk_mutex is held throughout the entirety of a chunk
4618 	 * allocation, so once we've acquired the chunk_mutex we know that the
4619 	 * other guy is done and we need to recheck and see if we should
4620 	 * allocate.
4621 	 */
4622 	if (wait_for_alloc) {
4623 		mutex_unlock(&fs_info->chunk_mutex);
4624 		wait_for_alloc = 0;
4625 		cond_resched();
4626 		goto again;
4627 	}
4628 
4629 	trans->allocating_chunk = true;
4630 
4631 	/*
4632 	 * If we have mixed data/metadata chunks we want to make sure we keep
4633 	 * allocating mixed chunks instead of individual chunks.
4634 	 */
4635 	if (btrfs_mixed_space_info(space_info))
4636 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4637 
4638 	/*
4639 	 * if we're doing a data chunk, go ahead and make sure that
4640 	 * we keep a reasonable number of metadata chunks allocated in the
4641 	 * FS as well.
4642 	 */
4643 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4644 		fs_info->data_chunk_allocations++;
4645 		if (!(fs_info->data_chunk_allocations %
4646 		      fs_info->metadata_ratio))
4647 			force_metadata_allocation(fs_info);
4648 	}
4649 
4650 	/*
4651 	 * Check if we have enough space in SYSTEM chunk because we may need
4652 	 * to update devices.
4653 	 */
4654 	check_system_chunk(trans, fs_info, flags);
4655 
4656 	ret = btrfs_alloc_chunk(trans, fs_info, flags);
4657 	trans->allocating_chunk = false;
4658 
4659 	spin_lock(&space_info->lock);
4660 	if (ret < 0) {
4661 		if (ret == -ENOSPC)
4662 			space_info->full = 1;
4663 		else
4664 			goto out;
4665 	} else {
4666 		ret = 1;
4667 	}
4668 
4669 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4670 out:
4671 	space_info->chunk_alloc = 0;
4672 	spin_unlock(&space_info->lock);
4673 	mutex_unlock(&fs_info->chunk_mutex);
4674 	/*
4675 	 * When we allocate a new chunk we reserve space in the chunk block
4676 	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4677 	 * add new nodes/leafs to it if we end up needing to do it when
4678 	 * inserting the chunk item and updating device items as part of the
4679 	 * second phase of chunk allocation, performed by
4680 	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4681 	 * large number of new block groups to create in our transaction
4682 	 * handle's new_bgs list to avoid exhausting the chunk block reserve
4683 	 * in extreme cases - like having a single transaction create many new
4684 	 * block groups when starting to write out the free space caches of all
4685 	 * the block groups that were made dirty during the lifetime of the
4686 	 * transaction.
4687 	 */
4688 	if (trans->can_flush_pending_bgs &&
4689 	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4690 		btrfs_create_pending_block_groups(trans);
4691 		btrfs_trans_release_chunk_metadata(trans);
4692 	}
4693 	return ret;
4694 }
4695 
4696 static int can_overcommit(struct btrfs_fs_info *fs_info,
4697 			  struct btrfs_space_info *space_info, u64 bytes,
4698 			  enum btrfs_reserve_flush_enum flush,
4699 			  bool system_chunk)
4700 {
4701 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4702 	u64 profile;
4703 	u64 space_size;
4704 	u64 avail;
4705 	u64 used;
4706 
4707 	/* Don't overcommit when in mixed mode. */
4708 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4709 		return 0;
4710 
4711 	if (system_chunk)
4712 		profile = btrfs_system_alloc_profile(fs_info);
4713 	else
4714 		profile = btrfs_metadata_alloc_profile(fs_info);
4715 
4716 	used = btrfs_space_info_used(space_info, false);
4717 
4718 	/*
4719 	 * We only want to allow over committing if we have lots of actual space
4720 	 * free, but if we don't have enough space to handle the global reserve
4721 	 * space then we could end up having a real enospc problem when trying
4722 	 * to allocate a chunk or some other such important allocation.
4723 	 */
4724 	spin_lock(&global_rsv->lock);
4725 	space_size = calc_global_rsv_need_space(global_rsv);
4726 	spin_unlock(&global_rsv->lock);
4727 	if (used + space_size >= space_info->total_bytes)
4728 		return 0;
4729 
4730 	used += space_info->bytes_may_use;
4731 
4732 	avail = atomic64_read(&fs_info->free_chunk_space);
4733 
4734 	/*
4735 	 * If we have dup, raid1 or raid10 then only half of the free
4736 	 * space is actually usable.  For raid56, the space info used
4737 	 * doesn't include the parity drive, so we don't have to
4738 	 * change the math
4739 	 */
4740 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
4741 		       BTRFS_BLOCK_GROUP_RAID1 |
4742 		       BTRFS_BLOCK_GROUP_RAID10))
4743 		avail >>= 1;
4744 
4745 	/*
4746 	 * If we aren't flushing all things, let us overcommit up to
4747 	 * half of the space. If we can flush, don't let us overcommit
4748 	 * too much, let it overcommit up to 1/8 of the space.
4749 	 */
4750 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
4751 		avail >>= 3;
4752 	else
4753 		avail >>= 1;
4754 
4755 	if (used + bytes < space_info->total_bytes + avail)
4756 		return 1;
4757 	return 0;
4758 }
4759 
4760 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4761 					 unsigned long nr_pages, int nr_items)
4762 {
4763 	struct super_block *sb = fs_info->sb;
4764 
4765 	if (down_read_trylock(&sb->s_umount)) {
4766 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4767 		up_read(&sb->s_umount);
4768 	} else {
4769 		/*
4770 		 * We needn't worry about the filesystem going from r/w to r/o
4771 		 * even though we don't acquire the ->s_umount mutex, because
4772 		 * the filesystem should guarantee the delalloc inode list is
4773 		 * empty once the filesystem is read-only (all dirty pages have
4774 		 * been written to disk).
4775 		 */
4776 		btrfs_start_delalloc_roots(fs_info, nr_items);
4777 		if (!current->journal_info)
4778 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4779 	}
4780 }
4781 
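/*
 * Convert a byte amount into a number of metadata items to reclaim, at one
 * transaction metadata unit per item (at least one item).
 */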
4782 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4783 					u64 to_reclaim)
4784 {
4785 	u64 bytes;
4786 	u64 nr;
4787 
4788 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4789 	nr = div64_u64(to_reclaim, bytes);
4790 	if (!nr)
4791 		nr = 1;
4792 	return nr;
4793 }
4794 
4795 #define EXTENT_SIZE_PER_ITEM	SZ_256K
4796 
4797 /*
4798  * shrink metadata reservation for delalloc
4799  */
4800 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4801 			    u64 orig, bool wait_ordered)
4802 {
4803 	struct btrfs_space_info *space_info;
4804 	struct btrfs_trans_handle *trans;
4805 	u64 delalloc_bytes;
4806 	u64 max_reclaim;
4807 	u64 items;
4808 	long time_left;
4809 	unsigned long nr_pages;
4810 	int loops;
4811 
4812 	/* Calc the number of pages we need to flush for the space reservation */
4813 	items = calc_reclaim_items_nr(fs_info, to_reclaim);
4814 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4815 
4816 	trans = (struct btrfs_trans_handle *)current->journal_info;
4817 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4818 
4819 	delalloc_bytes = percpu_counter_sum_positive(
4820 						&fs_info->delalloc_bytes);
4821 	if (delalloc_bytes == 0) {
4822 		if (trans)
4823 			return;
4824 		if (wait_ordered)
4825 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4826 		return;
4827 	}
4828 
4829 	loops = 0;
4830 	while (delalloc_bytes && loops < 3) {
4831 		max_reclaim = min(delalloc_bytes, to_reclaim);
4832 		nr_pages = max_reclaim >> PAGE_SHIFT;
4833 		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4834 		/*
4835 		 * We need to wait for the async pages to actually start before
4836 		 * we do anything.
4837 		 */
4838 		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4839 		if (!max_reclaim)
4840 			goto skip_async;
4841 
4842 		if (max_reclaim <= nr_pages)
4843 			max_reclaim = 0;
4844 		else
4845 			max_reclaim -= nr_pages;
4846 
4847 		wait_event(fs_info->async_submit_wait,
4848 			   atomic_read(&fs_info->async_delalloc_pages) <=
4849 			   (int)max_reclaim);
4850 skip_async:
4851 		spin_lock(&space_info->lock);
4852 		if (list_empty(&space_info->tickets) &&
4853 		    list_empty(&space_info->priority_tickets)) {
4854 			spin_unlock(&space_info->lock);
4855 			break;
4856 		}
4857 		spin_unlock(&space_info->lock);
4858 
4859 		loops++;
4860 		if (wait_ordered && !trans) {
4861 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4862 		} else {
4863 			time_left = schedule_timeout_killable(1);
4864 			if (time_left)
4865 				break;
4866 		}
4867 		delalloc_bytes = percpu_counter_sum_positive(
4868 						&fs_info->delalloc_bytes);
4869 	}
4870 }
4871 
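/*
 * A waiter for metadata space.  @bytes is how much space the waiter is still
 * missing; the reservation/flushing code is expected to shrink it and wake
 * @wait as space becomes available, or to set @error and wake the waiter if
 * the reservation cannot be satisfied.
 */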
4872 struct reserve_ticket {
4873 	u64 bytes;
4874 	int error;
4875 	struct list_head list;
4876 	wait_queue_head_t wait;
4877 };
4878 
4879 /**
4880  * may_commit_transaction - possibly commit the transaction if it's ok to
4881  * @fs_info - the fs_info for our filesystem
4882  * @space_info - the space_info we're trying to reserve from, used to find
4883  *               the first pending reservation ticket
4884  *
4885  * This will check to make sure that committing the transaction will actually
4886  * get us somewhere and then commit the transaction if it does.  Otherwise it
4887  * will return -ENOSPC.
4888  */
4889 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4890 				  struct btrfs_space_info *space_info)
4891 {
4892 	struct reserve_ticket *ticket = NULL;
4893 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4894 	struct btrfs_trans_handle *trans;
4895 	u64 bytes;
4896 
4897 	trans = (struct btrfs_trans_handle *)current->journal_info;
4898 	if (trans)
4899 		return -EAGAIN;
4900 
4901 	spin_lock(&space_info->lock);
4902 	if (!list_empty(&space_info->priority_tickets))
4903 		ticket = list_first_entry(&space_info->priority_tickets,
4904 					  struct reserve_ticket, list);
4905 	else if (!list_empty(&space_info->tickets))
4906 		ticket = list_first_entry(&space_info->tickets,
4907 					  struct reserve_ticket, list);
4908 	bytes = (ticket) ? ticket->bytes : 0;
4909 	spin_unlock(&space_info->lock);
4910 
4911 	if (!bytes)
4912 		return 0;
4913 
4914 	/* See if there is enough pinned space to make this reservation */
4915 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4916 				   bytes) >= 0)
4917 		goto commit;
4918 
4919 	/*
4920 	 * See if there is some space in the delayed insertion reservation for
4921 	 * this reservation.
4922 	 */
4923 	if (space_info != delayed_rsv->space_info)
4924 		return -ENOSPC;
4925 
4926 	spin_lock(&delayed_rsv->lock);
4927 	if (delayed_rsv->size > bytes)
4928 		bytes = 0;
4929 	else
4930 		bytes -= delayed_rsv->size;
4931 	spin_unlock(&delayed_rsv->lock);
4932 
4933 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4934 				   bytes) < 0) {
4935 		return -ENOSPC;
4936 	}
4937 
4938 commit:
4939 	trans = btrfs_join_transaction(fs_info->extent_root);
4940 	if (IS_ERR(trans))
4941 		return -ENOSPC;
4942 
4943 	return btrfs_commit_transaction(trans);
4944 }
4945 
4946 /*
4947  * Try to flush some data based on policy set by @state. This is only advisory
4948  * and may fail for various reasons. The caller is supposed to examine the
4949  * state of @space_info to detect the outcome.
4950  */
4951 static void flush_space(struct btrfs_fs_info *fs_info,
4952 		       struct btrfs_space_info *space_info, u64 num_bytes,
4953 		       int state)
4954 {
4955 	struct btrfs_root *root = fs_info->extent_root;
4956 	struct btrfs_trans_handle *trans;
4957 	int nr;
4958 	int ret = 0;
4959 
4960 	switch (state) {
4961 	case FLUSH_DELAYED_ITEMS_NR:
4962 	case FLUSH_DELAYED_ITEMS:
4963 		if (state == FLUSH_DELAYED_ITEMS_NR)
4964 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4965 		else
4966 			nr = -1;
4967 
4968 		trans = btrfs_join_transaction(root);
4969 		if (IS_ERR(trans)) {
4970 			ret = PTR_ERR(trans);
4971 			break;
4972 		}
4973 		ret = btrfs_run_delayed_items_nr(trans, nr);
4974 		btrfs_end_transaction(trans);
4975 		break;
4976 	case FLUSH_DELALLOC:
4977 	case FLUSH_DELALLOC_WAIT:
4978 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4979 				state == FLUSH_DELALLOC_WAIT);
4980 		break;
4981 	case ALLOC_CHUNK:
4982 		trans = btrfs_join_transaction(root);
4983 		if (IS_ERR(trans)) {
4984 			ret = PTR_ERR(trans);
4985 			break;
4986 		}
4987 		ret = do_chunk_alloc(trans, fs_info,
4988 				     btrfs_metadata_alloc_profile(fs_info),
4989 				     CHUNK_ALLOC_NO_FORCE);
4990 		btrfs_end_transaction(trans);
4991 		if (ret > 0 || ret == -ENOSPC)
4992 			ret = 0;
4993 		break;
4994 	case COMMIT_TRANS:
4995 		ret = may_commit_transaction(fs_info, space_info);
4996 		break;
4997 	default:
4998 		ret = -ENOSPC;
4999 		break;
5000 	}
5001 
5002 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
5003 				ret);
5004 	return;
5005 }
5006 
5007 static inline u64
5008 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
5009 				 struct btrfs_space_info *space_info,
5010 				 bool system_chunk)
5011 {
5012 	struct reserve_ticket *ticket;
5013 	u64 used;
5014 	u64 expected;
5015 	u64 to_reclaim = 0;
5016 
5017 	list_for_each_entry(ticket, &space_info->tickets, list)
5018 		to_reclaim += ticket->bytes;
5019 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
5020 		to_reclaim += ticket->bytes;
5021 	if (to_reclaim)
5022 		return to_reclaim;
5023 
5024 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
5025 	if (can_overcommit(fs_info, space_info, to_reclaim,
5026 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5027 		return 0;
5028 
5029 	used = btrfs_space_info_used(space_info, true);
5030 
5031 	if (can_overcommit(fs_info, space_info, SZ_1M,
5032 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5033 		expected = div_factor_fine(space_info->total_bytes, 95);
5034 	else
5035 		expected = div_factor_fine(space_info->total_bytes, 90);
5036 
5037 	if (used > expected)
5038 		to_reclaim = used - expected;
5039 	else
5040 		to_reclaim = 0;
5041 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5042 				     space_info->bytes_reserved);
5043 	return to_reclaim;
5044 }
5045 
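/*
 * Whether it's worth kicking the preemptive async reclaim worker: not if the
 * space_info is essentially full already (flushing won't help), not if there
 * is nothing to reclaim, and not while the filesystem is closing or
 * remounting.
 */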
5046 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5047 					struct btrfs_space_info *space_info,
5048 					u64 used, bool system_chunk)
5049 {
5050 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5051 
5052 	/* If we're just plain full then async reclaim just slows us down. */
5053 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5054 		return 0;
5055 
5056 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5057 					      system_chunk))
5058 		return 0;
5059 
5060 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5061 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5062 }
5063 
5064 static void wake_all_tickets(struct list_head *head)
5065 {
5066 	struct reserve_ticket *ticket;
5067 
5068 	while (!list_empty(head)) {
5069 		ticket = list_first_entry(head, struct reserve_ticket, list);
5070 		list_del_init(&ticket->list);
5071 		ticket->error = -ENOSPC;
5072 		wake_up(&ticket->wait);
5073 	}
5074 }
5075 
5076 /*
5077  * This is for normal flushers; we can wait all goddamned day if we want to.  We
5078  * will loop and continuously try to flush as long as we are making progress.
5079  * We count progress as clearing off tickets each time we have to loop.
5080  */
5081 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5082 {
5083 	struct btrfs_fs_info *fs_info;
5084 	struct btrfs_space_info *space_info;
5085 	u64 to_reclaim;
5086 	int flush_state;
5087 	int commit_cycles = 0;
5088 	u64 last_tickets_id;
5089 
5090 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5091 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5092 
5093 	spin_lock(&space_info->lock);
5094 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5095 						      false);
5096 	if (!to_reclaim) {
5097 		space_info->flush = 0;
5098 		spin_unlock(&space_info->lock);
5099 		return;
5100 	}
5101 	last_tickets_id = space_info->tickets_id;
5102 	spin_unlock(&space_info->lock);
5103 
5104 	flush_state = FLUSH_DELAYED_ITEMS_NR;
5105 	do {
5106 		flush_space(fs_info, space_info, to_reclaim, flush_state);
5107 		spin_lock(&space_info->lock);
5108 		if (list_empty(&space_info->tickets)) {
5109 			space_info->flush = 0;
5110 			spin_unlock(&space_info->lock);
5111 			return;
5112 		}
5113 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5114 							      space_info,
5115 							      false);
5116 		if (last_tickets_id == space_info->tickets_id) {
5117 			flush_state++;
5118 		} else {
5119 			last_tickets_id = space_info->tickets_id;
5120 			flush_state = FLUSH_DELAYED_ITEMS_NR;
5121 			if (commit_cycles)
5122 				commit_cycles--;
5123 		}
5124 
5125 		if (flush_state > COMMIT_TRANS) {
5126 			commit_cycles++;
5127 			if (commit_cycles > 2) {
5128 				wake_all_tickets(&space_info->tickets);
5129 				space_info->flush = 0;
5130 			} else {
5131 				flush_state = FLUSH_DELAYED_ITEMS_NR;
5132 			}
5133 		}
5134 		spin_unlock(&space_info->lock);
5135 	} while (flush_state <= COMMIT_TRANS);
5136 }
5137 
5138 void btrfs_init_async_reclaim_work(struct work_struct *work)
5139 {
5140 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5141 }
5142 
5143 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5144 					    struct btrfs_space_info *space_info,
5145 					    struct reserve_ticket *ticket)
5146 {
5147 	u64 to_reclaim;
5148 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
5149 
5150 	spin_lock(&space_info->lock);
5151 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5152 						      false);
5153 	if (!to_reclaim) {
5154 		spin_unlock(&space_info->lock);
5155 		return;
5156 	}
5157 	spin_unlock(&space_info->lock);
5158 
5159 	do {
5160 		flush_space(fs_info, space_info, to_reclaim, flush_state);
5161 		flush_state++;
5162 		spin_lock(&space_info->lock);
5163 		if (ticket->bytes == 0) {
5164 			spin_unlock(&space_info->lock);
5165 			return;
5166 		}
5167 		spin_unlock(&space_info->lock);
5168 
5169 		/*
5170 		 * Priority flushers can't wait on delalloc without
5171 		 * deadlocking.
5172 		 */
5173 		if (flush_state == FLUSH_DELALLOC ||
5174 		    flush_state == FLUSH_DELALLOC_WAIT)
5175 			flush_state = ALLOC_CHUNK;
5176 	} while (flush_state < COMMIT_TRANS);
5177 }
5178 
5179 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5180 			       struct btrfs_space_info *space_info,
5181 			       struct reserve_ticket *ticket, u64 orig_bytes)
5183 {
5184 	DEFINE_WAIT(wait);
5185 	int ret = 0;
5186 
5187 	spin_lock(&space_info->lock);
5188 	while (ticket->bytes > 0 && ticket->error == 0) {
5189 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5190 		if (ret) {
5191 			ret = -EINTR;
5192 			break;
5193 		}
5194 		spin_unlock(&space_info->lock);
5195 
5196 		schedule();
5197 
5198 		finish_wait(&ticket->wait, &wait);
5199 		spin_lock(&space_info->lock);
5200 	}
5201 	if (!ret)
5202 		ret = ticket->error;
5203 	if (!list_empty(&ticket->list))
5204 		list_del_init(&ticket->list);
5205 	if (ticket->bytes && ticket->bytes < orig_bytes) {
5206 		u64 num_bytes = orig_bytes - ticket->bytes;
5207 		space_info->bytes_may_use -= num_bytes;
5208 		trace_btrfs_space_reservation(fs_info, "space_info",
5209 					      space_info->flags, num_bytes, 0);
5210 	}
5211 	spin_unlock(&space_info->lock);
5212 
5213 	return ret;
5214 }
5215 
5216 /**
5217  * __reserve_metadata_bytes - try to reserve bytes from a space_info
5218  * @fs_info - the fs_info for our filesystem
5219  * @space_info - the space info we want to allocate from
5220  * @orig_bytes - the number of bytes we want
5221  * @flush - whether or not we can flush to make our reservation
5222  *
5223  * This will reserve orig_bytes number of bytes from the space info associated
5224  * with the block_rsv.  If there is not enough space it will make an attempt to
5225  * flush out space to make room.  It will do this by flushing delalloc if
5226  * possible or committing the transaction.  If flush is 0 then no attempts to
5227  * regain reservations will be made and this will fail if there is not enough
5228  * space already.
5229  */
5230 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5231 				    struct btrfs_space_info *space_info,
5232 				    u64 orig_bytes,
5233 				    enum btrfs_reserve_flush_enum flush,
5234 				    bool system_chunk)
5235 {
5236 	struct reserve_ticket ticket;
5237 	u64 used;
5238 	int ret = 0;
5239 
5240 	ASSERT(orig_bytes);
5241 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5242 
5243 	spin_lock(&space_info->lock);
5244 	ret = -ENOSPC;
5245 	used = btrfs_space_info_used(space_info, true);
5246 
5247 	/*
5248 	 * If we have enough space then hooray, make our reservation and carry
5249 	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
5250 	 * If not things get more complicated.
5251 	 */
5252 	if (used + orig_bytes <= space_info->total_bytes) {
5253 		space_info->bytes_may_use += orig_bytes;
5254 		trace_btrfs_space_reservation(fs_info, "space_info",
5255 					      space_info->flags, orig_bytes, 1);
5256 		ret = 0;
5257 	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5258 				  system_chunk)) {
5259 		space_info->bytes_may_use += orig_bytes;
5260 		trace_btrfs_space_reservation(fs_info, "space_info",
5261 					      space_info->flags, orig_bytes, 1);
5262 		ret = 0;
5263 	}
5264 
5265 	/*
5266 	 * If we couldn't make a reservation then setup our reservation ticket
5267 	 * and kick the async worker if it's not already running.
5268 	 *
5269 	 * If we are a priority flusher then we just need to add our ticket to
5270 	 * the list and we will do our own flushing further down.
5271 	 */
5272 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5273 		ticket.bytes = orig_bytes;
5274 		ticket.error = 0;
5275 		init_waitqueue_head(&ticket.wait);
5276 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5277 			list_add_tail(&ticket.list, &space_info->tickets);
5278 			if (!space_info->flush) {
5279 				space_info->flush = 1;
5280 				trace_btrfs_trigger_flush(fs_info,
5281 							  space_info->flags,
5282 							  orig_bytes, flush,
5283 							  "enospc");
5284 				queue_work(system_unbound_wq,
5285 					   &fs_info->async_reclaim_work);
5286 			}
5287 		} else {
5288 			list_add_tail(&ticket.list,
5289 				      &space_info->priority_tickets);
5290 		}
5291 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5292 		used += orig_bytes;
5293 		/*
5294 		 * We will do the space reservation dance during log replay,
5295 		 * which means we won't have fs_info->fs_root set, so don't do
5296 		 * the async reclaim as we will panic.
5297 		 */
5298 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5299 		    need_do_async_reclaim(fs_info, space_info,
5300 					  used, system_chunk) &&
5301 		    !work_busy(&fs_info->async_reclaim_work)) {
5302 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
5303 						  orig_bytes, flush, "preempt");
5304 			queue_work(system_unbound_wq,
5305 				   &fs_info->async_reclaim_work);
5306 		}
5307 	}
5308 	spin_unlock(&space_info->lock);
5309 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5310 		return ret;
5311 
5312 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
5313 		return wait_reserve_ticket(fs_info, space_info, &ticket,
5314 					   orig_bytes);
5315 
5316 	ret = 0;
5317 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5318 	spin_lock(&space_info->lock);
5319 	if (ticket.bytes) {
5320 		if (ticket.bytes < orig_bytes) {
5321 			u64 num_bytes = orig_bytes - ticket.bytes;
5322 			space_info->bytes_may_use -= num_bytes;
5323 			trace_btrfs_space_reservation(fs_info, "space_info",
5324 						      space_info->flags,
5325 						      num_bytes, 0);
5326 
5327 		}
5328 		list_del_init(&ticket.list);
5329 		ret = -ENOSPC;
5330 	}
5331 	spin_unlock(&space_info->lock);
5332 	ASSERT(list_empty(&ticket.list));
5333 	return ret;
5334 }
5335 
5336 /**
5337  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5338  * @root - the root we're allocating for
5339  * @block_rsv - the block_rsv we're allocating for
5340  * @orig_bytes - the number of bytes we want
5341  * @flush - whether or not we can flush to make our reservation
5342  *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
5349  */
5350 static int reserve_metadata_bytes(struct btrfs_root *root,
5351 				  struct btrfs_block_rsv *block_rsv,
5352 				  u64 orig_bytes,
5353 				  enum btrfs_reserve_flush_enum flush)
5354 {
5355 	struct btrfs_fs_info *fs_info = root->fs_info;
5356 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5357 	int ret;
5358 	bool system_chunk = (root == fs_info->chunk_root);
5359 
5360 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5361 				       orig_bytes, flush, system_chunk);
5362 	if (ret == -ENOSPC &&
5363 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5364 		if (block_rsv != global_rsv &&
5365 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
5366 			ret = 0;
5367 	}
5368 	if (ret == -ENOSPC) {
5369 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5370 					      block_rsv->space_info->flags,
5371 					      orig_bytes, 1);
5372 
5373 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5374 			dump_space_info(fs_info, block_rsv->space_info,
5375 					orig_bytes, 0);
5376 	}
5377 	return ret;
5378 }
5379 
5380 static struct btrfs_block_rsv *get_block_rsv(
5381 					const struct btrfs_trans_handle *trans,
5382 					const struct btrfs_root *root)
5383 {
5384 	struct btrfs_fs_info *fs_info = root->fs_info;
5385 	struct btrfs_block_rsv *block_rsv = NULL;
5386 
5387 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5388 	    (root == fs_info->csum_root && trans->adding_csums) ||
5389 	    (root == fs_info->uuid_root))
5390 		block_rsv = trans->block_rsv;
5391 
5392 	if (!block_rsv)
5393 		block_rsv = root->block_rsv;
5394 
5395 	if (!block_rsv)
5396 		block_rsv = &fs_info->empty_block_rsv;
5397 
5398 	return block_rsv;
5399 }
5400 
5401 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5402 			       u64 num_bytes)
5403 {
5404 	int ret = -ENOSPC;
5405 	spin_lock(&block_rsv->lock);
5406 	if (block_rsv->reserved >= num_bytes) {
5407 		block_rsv->reserved -= num_bytes;
5408 		if (block_rsv->reserved < block_rsv->size)
5409 			block_rsv->full = 0;
5410 		ret = 0;
5411 	}
5412 	spin_unlock(&block_rsv->lock);
5413 	return ret;
5414 }
5415 
5416 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5417 				u64 num_bytes, int update_size)
5418 {
5419 	spin_lock(&block_rsv->lock);
5420 	block_rsv->reserved += num_bytes;
5421 	if (update_size)
5422 		block_rsv->size += num_bytes;
5423 	else if (block_rsv->reserved >= block_rsv->size)
5424 		block_rsv->full = 1;
5425 	spin_unlock(&block_rsv->lock);
5426 }
5427 
5428 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5429 			     struct btrfs_block_rsv *dest, u64 num_bytes,
5430 			     int min_factor)
5431 {
5432 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5433 	u64 min_bytes;
5434 
5435 	if (global_rsv->space_info != dest->space_info)
5436 		return -ENOSPC;
5437 
5438 	spin_lock(&global_rsv->lock);
5439 	min_bytes = div_factor(global_rsv->size, min_factor);
5440 	if (global_rsv->reserved < min_bytes + num_bytes) {
5441 		spin_unlock(&global_rsv->lock);
5442 		return -ENOSPC;
5443 	}
5444 	global_rsv->reserved -= num_bytes;
5445 	if (global_rsv->reserved < global_rsv->size)
5446 		global_rsv->full = 0;
5447 	spin_unlock(&global_rsv->lock);
5448 
5449 	block_rsv_add_bytes(dest, num_bytes, 1);
5450 	return 0;
5451 }
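
/*
 * Sketch of how a caller might use btrfs_cond_migrate_bytes(): only steal
 * from the global rsv when it would stay above min_factor tenths of its size
 * (min_factor is scaled via div_factor(), i.e. in units of 1/10).  The
 * target rsv and byte count below are made up for illustration:
 *
 *	// keep the global rsv at least 50% populated (min_factor = 5)
 *	ret = btrfs_cond_migrate_bytes(fs_info, &my_rsv, SZ_1M, 5);
 *	if (ret == -ENOSPC)
 *		// fall back to a normal reservation
 *		ret = btrfs_block_rsv_add(root, &my_rsv, SZ_1M,
 *					  BTRFS_RESERVE_FLUSH_ALL);
 */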
5452 
5453 /*
5454  * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsvs.
5456  */
5457 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5458 				     struct btrfs_space_info *space_info,
5459 				     u64 num_bytes)
5460 {
5461 	struct reserve_ticket *ticket;
5462 	struct list_head *head;
5463 	u64 used;
5464 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5465 	bool check_overcommit = false;
5466 
5467 	spin_lock(&space_info->lock);
5468 	head = &space_info->priority_tickets;
5469 
5470 	/*
5471 	 * If we are over our limit then we need to check and see if we can
5472 	 * overcommit, and if we can't then we just need to free up our space
5473 	 * and not satisfy any requests.
5474 	 */
5475 	used = btrfs_space_info_used(space_info, true);
5476 	if (used - num_bytes >= space_info->total_bytes)
5477 		check_overcommit = true;
5478 again:
5479 	while (!list_empty(head) && num_bytes) {
5480 		ticket = list_first_entry(head, struct reserve_ticket,
5481 					  list);
5482 		/*
5483 		 * We use 0 bytes because this space is already reserved, so
5484 		 * adding the ticket space would be a double count.
5485 		 */
5486 		if (check_overcommit &&
5487 		    !can_overcommit(fs_info, space_info, 0, flush, false))
5488 			break;
5489 		if (num_bytes >= ticket->bytes) {
5490 			list_del_init(&ticket->list);
5491 			num_bytes -= ticket->bytes;
5492 			ticket->bytes = 0;
5493 			space_info->tickets_id++;
5494 			wake_up(&ticket->wait);
5495 		} else {
5496 			ticket->bytes -= num_bytes;
5497 			num_bytes = 0;
5498 		}
5499 	}
5500 
5501 	if (num_bytes && head == &space_info->priority_tickets) {
5502 		head = &space_info->tickets;
5503 		flush = BTRFS_RESERVE_FLUSH_ALL;
5504 		goto again;
5505 	}
5506 	space_info->bytes_may_use -= num_bytes;
5507 	trace_btrfs_space_reservation(fs_info, "space_info",
5508 				      space_info->flags, num_bytes, 0);
5509 	spin_unlock(&space_info->lock);
5510 }
5511 
5512 /*
5513  * This is for newly allocated space that isn't accounted in
5514  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5515  * we use this helper.
5516  */
5517 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5518 				     struct btrfs_space_info *space_info,
5519 				     u64 num_bytes)
5520 {
5521 	struct reserve_ticket *ticket;
5522 	struct list_head *head = &space_info->priority_tickets;
5523 
5524 again:
5525 	while (!list_empty(head) && num_bytes) {
5526 		ticket = list_first_entry(head, struct reserve_ticket,
5527 					  list);
5528 		if (num_bytes >= ticket->bytes) {
5529 			trace_btrfs_space_reservation(fs_info, "space_info",
5530 						      space_info->flags,
5531 						      ticket->bytes, 1);
5532 			list_del_init(&ticket->list);
5533 			num_bytes -= ticket->bytes;
5534 			space_info->bytes_may_use += ticket->bytes;
5535 			ticket->bytes = 0;
5536 			space_info->tickets_id++;
5537 			wake_up(&ticket->wait);
5538 		} else {
5539 			trace_btrfs_space_reservation(fs_info, "space_info",
5540 						      space_info->flags,
5541 						      num_bytes, 1);
5542 			space_info->bytes_may_use += num_bytes;
5543 			ticket->bytes -= num_bytes;
5544 			num_bytes = 0;
5545 		}
5546 	}
5547 
5548 	if (num_bytes && head == &space_info->priority_tickets) {
5549 		head = &space_info->tickets;
5550 		goto again;
5551 	}
5552 }
5553 
5554 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5555 				    struct btrfs_block_rsv *block_rsv,
5556 				    struct btrfs_block_rsv *dest, u64 num_bytes,
5557 				    u64 *qgroup_to_release_ret)
5558 {
5559 	struct btrfs_space_info *space_info = block_rsv->space_info;
5560 	u64 qgroup_to_release = 0;
5561 	u64 ret;
5562 
5563 	spin_lock(&block_rsv->lock);
5564 	if (num_bytes == (u64)-1) {
5565 		num_bytes = block_rsv->size;
5566 		qgroup_to_release = block_rsv->qgroup_rsv_size;
5567 	}
5568 	block_rsv->size -= num_bytes;
5569 	if (block_rsv->reserved >= block_rsv->size) {
5570 		num_bytes = block_rsv->reserved - block_rsv->size;
5571 		block_rsv->reserved = block_rsv->size;
5572 		block_rsv->full = 1;
5573 	} else {
5574 		num_bytes = 0;
5575 	}
5576 	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5577 		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5578 				    block_rsv->qgroup_rsv_size;
5579 		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5580 	} else {
5581 		qgroup_to_release = 0;
5582 	}
5583 	spin_unlock(&block_rsv->lock);
5584 
5585 	ret = num_bytes;
5586 	if (num_bytes > 0) {
5587 		if (dest) {
5588 			spin_lock(&dest->lock);
5589 			if (!dest->full) {
5590 				u64 bytes_to_add;
5591 
5592 				bytes_to_add = dest->size - dest->reserved;
5593 				bytes_to_add = min(num_bytes, bytes_to_add);
5594 				dest->reserved += bytes_to_add;
5595 				if (dest->reserved >= dest->size)
5596 					dest->full = 1;
5597 				num_bytes -= bytes_to_add;
5598 			}
5599 			spin_unlock(&dest->lock);
5600 		}
5601 		if (num_bytes)
5602 			space_info_add_old_bytes(fs_info, space_info,
5603 						 num_bytes);
5604 	}
5605 	if (qgroup_to_release_ret)
5606 		*qgroup_to_release_ret = qgroup_to_release;
5607 	return ret;
5608 }
5609 
5610 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5611 			    struct btrfs_block_rsv *dst, u64 num_bytes,
5612 			    int update_size)
5613 {
5614 	int ret;
5615 
5616 	ret = block_rsv_use_bytes(src, num_bytes);
5617 	if (ret)
5618 		return ret;
5619 
5620 	block_rsv_add_bytes(dst, num_bytes, update_size);
5621 	return 0;
5622 }
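
/*
 * Example flow for btrfs_block_rsv_migrate(): move already-reserved bytes
 * from one rsv to another without touching the space_info counters.  The
 * destination rsv and byte count below are illustrative:
 *
 *	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, dst_rsv,
 *				      num_bytes, 1);
 *	if (ret)
 *		// the source rsv did not hold num_bytes; nothing was moved
 *		return ret;
 */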
5623 
5624 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5625 {
5626 	memset(rsv, 0, sizeof(*rsv));
5627 	spin_lock_init(&rsv->lock);
5628 	rsv->type = type;
5629 }
5630 
5631 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5632 				   struct btrfs_block_rsv *rsv,
5633 				   unsigned short type)
5634 {
5635 	btrfs_init_block_rsv(rsv, type);
5636 	rsv->space_info = __find_space_info(fs_info,
5637 					    BTRFS_BLOCK_GROUP_METADATA);
5638 }
5639 
5640 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5641 					      unsigned short type)
5642 {
5643 	struct btrfs_block_rsv *block_rsv;
5644 
5645 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5646 	if (!block_rsv)
5647 		return NULL;
5648 
5649 	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5650 	return block_rsv;
5651 }
5652 
5653 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5654 			  struct btrfs_block_rsv *rsv)
5655 {
5656 	if (!rsv)
5657 		return;
5658 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5659 	kfree(rsv);
5660 }
5661 
5662 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5663 {
5664 	kfree(rsv);
5665 }
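
/*
 * Typical lifetime of a short-lived block reservation, using the helpers
 * above together with btrfs_block_rsv_add() and btrfs_block_rsv_release()
 * below (a sketch with error handling trimmed; BTRFS_BLOCK_RSV_TEMP is the
 * type normally used for temporary reservations):
 *
 *	struct btrfs_block_rsv *rsv;
 *
 *	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	// ... use the space, e.g. as trans->block_rsv ...
 *	btrfs_free_block_rsv(fs_info, rsv);	// releases whatever is left
 */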
5666 
5667 int btrfs_block_rsv_add(struct btrfs_root *root,
5668 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5669 			enum btrfs_reserve_flush_enum flush)
5670 {
5671 	int ret;
5672 
5673 	if (num_bytes == 0)
5674 		return 0;
5675 
5676 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5677 	if (!ret) {
5678 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
5679 		return 0;
5680 	}
5681 
5682 	return ret;
5683 }
5684 
5685 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5686 {
5687 	u64 num_bytes = 0;
5688 	int ret = -ENOSPC;
5689 
5690 	if (!block_rsv)
5691 		return 0;
5692 
5693 	spin_lock(&block_rsv->lock);
5694 	num_bytes = div_factor(block_rsv->size, min_factor);
5695 	if (block_rsv->reserved >= num_bytes)
5696 		ret = 0;
5697 	spin_unlock(&block_rsv->lock);
5698 
5699 	return ret;
5700 }
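
/*
 * btrfs_block_rsv_check() answers "is this rsv at least min_factor tenths
 * full?".  For instance, checking that the global rsv still holds half of
 * its target size (purely illustrative):
 *
 *	if (btrfs_block_rsv_check(&fs_info->global_block_rsv, 5))
 *		// less than 50% of global_rsv->size is currently reserved
 *		return -ENOSPC;
 */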
5701 
5702 int btrfs_block_rsv_refill(struct btrfs_root *root,
5703 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5704 			   enum btrfs_reserve_flush_enum flush)
5705 {
5706 	u64 num_bytes = 0;
5707 	int ret = -ENOSPC;
5708 
5709 	if (!block_rsv)
5710 		return 0;
5711 
5712 	spin_lock(&block_rsv->lock);
5713 	num_bytes = min_reserved;
5714 	if (block_rsv->reserved >= num_bytes)
5715 		ret = 0;
5716 	else
5717 		num_bytes -= block_rsv->reserved;
5718 	spin_unlock(&block_rsv->lock);
5719 
5720 	if (!ret)
5721 		return 0;
5722 
5723 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5724 	if (!ret) {
5725 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
5726 		return 0;
5727 	}
5728 
5729 	return ret;
5730 }
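
/*
 * Unlike btrfs_block_rsv_add(), btrfs_block_rsv_refill() only tops the rsv
 * up to min_reserved and does not grow rsv->size.  A sketch of topping up
 * before a truncate-style operation (rsv and min_reserved are illustrative):
 *
 *	ret = btrfs_block_rsv_refill(root, rsv, min_reserved,
 *				     BTRFS_RESERVE_FLUSH_LIMIT);
 *	if (ret && ret != -ENOSPC)
 *		return ret;
 */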
5731 
5732 /**
5733  * btrfs_inode_rsv_refill - refill the inode block rsv.
5734  * @inode - the inode we are refilling.
 * @flush - the flushing restriction.
5736  *
5737  * Essentially the same as btrfs_block_rsv_refill, except it uses the
5738  * block_rsv->size as the minimum size.  We'll either refill the missing amount
 * or return if we already have enough space.  This will also handle the reserve
5740  * tracepoint for the reserved amount.
5741  */
5742 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5743 				  enum btrfs_reserve_flush_enum flush)
5744 {
5745 	struct btrfs_root *root = inode->root;
5746 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5747 	u64 num_bytes = 0;
5748 	u64 qgroup_num_bytes = 0;
5749 	int ret = -ENOSPC;
5750 
5751 	spin_lock(&block_rsv->lock);
5752 	if (block_rsv->reserved < block_rsv->size)
5753 		num_bytes = block_rsv->size - block_rsv->reserved;
5754 	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5755 		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5756 				   block_rsv->qgroup_rsv_reserved;
5757 	spin_unlock(&block_rsv->lock);
5758 
5759 	if (num_bytes == 0)
5760 		return 0;
5761 
5762 	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5763 	if (ret)
5764 		return ret;
5765 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5766 	if (!ret) {
5767 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
5768 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
5769 					      btrfs_ino(inode), num_bytes, 1);
5770 
5771 		/* Don't forget to increase qgroup_rsv_reserved */
5772 		spin_lock(&block_rsv->lock);
5773 		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5774 		spin_unlock(&block_rsv->lock);
5775 	} else
5776 		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5777 	return ret;
5778 }
5779 
5780 /**
5781  * btrfs_inode_rsv_release - release any excessive reservation.
5782  * @inode - the inode we need to release from.
5783  * @qgroup_free - free or convert qgroup meta.
5784  *   Unlike normal operation, qgroup meta reservation needs to know if we are
5785  *   freeing qgroup reservation or just converting it into per-trans.  Normally
5786  *   @qgroup_free is true for error handling, and false for normal release.
5787  *
5788  * This is the same as btrfs_block_rsv_release, except that it handles the
5789  * tracepoint for the reservation.
5790  */
5791 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5792 {
5793 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5794 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5795 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5796 	u64 released = 0;
5797 	u64 qgroup_to_release = 0;
5798 
5799 	/*
5800 	 * Since we statically set the block_rsv->size we just want to say we
5801 	 * are releasing 0 bytes, and then we'll just get the reservation over
	 * the size freed.
5803 	 */
5804 	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5805 					   &qgroup_to_release);
5806 	if (released > 0)
5807 		trace_btrfs_space_reservation(fs_info, "delalloc",
5808 					      btrfs_ino(inode), released, 0);
5809 	if (qgroup_free)
5810 		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5811 	else
5812 		btrfs_qgroup_convert_reserved_meta(inode->root,
5813 						   qgroup_to_release);
5814 }
5815 
5816 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5817 			     struct btrfs_block_rsv *block_rsv,
5818 			     u64 num_bytes)
5819 {
5820 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5821 
5822 	if (global_rsv == block_rsv ||
5823 	    block_rsv->space_info != global_rsv->space_info)
5824 		global_rsv = NULL;
5825 	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5826 }
5827 
5828 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5829 {
5830 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5831 	struct btrfs_space_info *sinfo = block_rsv->space_info;
5832 	u64 num_bytes;
5833 
5834 	/*
5835 	 * The global block rsv is based on the size of the extent tree, the
5836 	 * checksum tree and the root tree.  If the fs is empty we want to set
5837 	 * it to a minimal amount for safety.
5838 	 */
5839 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5840 		btrfs_root_used(&fs_info->csum_root->root_item) +
5841 		btrfs_root_used(&fs_info->tree_root->root_item);
5842 	num_bytes = max_t(u64, num_bytes, SZ_16M);
5843 
5844 	spin_lock(&sinfo->lock);
5845 	spin_lock(&block_rsv->lock);
5846 
5847 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5848 
5849 	if (block_rsv->reserved < block_rsv->size) {
5850 		num_bytes = btrfs_space_info_used(sinfo, true);
5851 		if (sinfo->total_bytes > num_bytes) {
5852 			num_bytes = sinfo->total_bytes - num_bytes;
5853 			num_bytes = min(num_bytes,
5854 					block_rsv->size - block_rsv->reserved);
5855 			block_rsv->reserved += num_bytes;
5856 			sinfo->bytes_may_use += num_bytes;
5857 			trace_btrfs_space_reservation(fs_info, "space_info",
5858 						      sinfo->flags, num_bytes,
5859 						      1);
5860 		}
5861 	} else if (block_rsv->reserved > block_rsv->size) {
5862 		num_bytes = block_rsv->reserved - block_rsv->size;
5863 		sinfo->bytes_may_use -= num_bytes;
5864 		trace_btrfs_space_reservation(fs_info, "space_info",
5865 				      sinfo->flags, num_bytes, 0);
5866 		block_rsv->reserved = block_rsv->size;
5867 	}
5868 
5869 	if (block_rsv->reserved == block_rsv->size)
5870 		block_rsv->full = 1;
5871 	else
5872 		block_rsv->full = 0;
5873 
5874 	spin_unlock(&block_rsv->lock);
5875 	spin_unlock(&sinfo->lock);
5876 }
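
/*
 * Sizing example for the clamping above: if the extent, csum and root trees
 * together use 3G of metadata, the global rsv size is capped at SZ_512M; on
 * a nearly empty fs where they use only 1M combined, the SZ_16M floor
 * applies instead.
 */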
5877 
5878 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5879 {
5880 	struct btrfs_space_info *space_info;
5881 
5882 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5883 	fs_info->chunk_block_rsv.space_info = space_info;
5884 
5885 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5886 	fs_info->global_block_rsv.space_info = space_info;
5887 	fs_info->trans_block_rsv.space_info = space_info;
5888 	fs_info->empty_block_rsv.space_info = space_info;
5889 	fs_info->delayed_block_rsv.space_info = space_info;
5890 
5891 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5892 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5893 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5894 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5895 	if (fs_info->quota_root)
5896 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5897 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5898 
5899 	update_global_block_rsv(fs_info);
5900 }
5901 
5902 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5903 {
5904 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5905 				(u64)-1, NULL);
5906 	WARN_ON(fs_info->trans_block_rsv.size > 0);
5907 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5908 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
5909 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5910 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
5911 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5912 }
5913 
5914 
5915 /*
5916  * To be called after all the new block groups attached to the transaction
5917  * handle have been created (btrfs_create_pending_block_groups()).
5918  */
5919 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5920 {
5921 	struct btrfs_fs_info *fs_info = trans->fs_info;
5922 
5923 	if (!trans->chunk_bytes_reserved)
5924 		return;
5925 
5926 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5927 
5928 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5929 				trans->chunk_bytes_reserved, NULL);
5930 	trans->chunk_bytes_reserved = 0;
5931 }
5932 
5933 /*
5934  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5935  * root: the root of the parent directory
5936  * rsv: block reservation
 * items: the number of items that we need to reserve
 * use_global_rsv: allow falling back to the global block reservation
 *
 * This function is used to reserve the space for snapshot/subvolume
 * creation and deletion.  Those operations differ from the common
 * file/directory operations: they change two fs/file trees and the root
 * tree, and the number of items that the qgroup reserves differs from the
 * free space reservation.  So we can not use the space reservation
 * mechanism in start_transaction().
5946  */
5947 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5948 				     struct btrfs_block_rsv *rsv,
5949 				     int items,
5950 				     bool use_global_rsv)
5951 {
5952 	u64 num_bytes;
5953 	int ret;
5954 	struct btrfs_fs_info *fs_info = root->fs_info;
5955 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5956 
5957 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5958 		/* One for parent inode, two for dir entries */
5959 		num_bytes = 3 * fs_info->nodesize;
5960 		ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
5961 		if (ret)
5962 			return ret;
5963 	} else {
5964 		num_bytes = 0;
5965 	}
5966 
5967 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5968 	rsv->space_info = __find_space_info(fs_info,
5969 					    BTRFS_BLOCK_GROUP_METADATA);
5970 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5971 				  BTRFS_RESERVE_FLUSH_ALL);
5972 
5973 	if (ret == -ENOSPC && use_global_rsv)
5974 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5975 
5976 	if (ret && num_bytes)
5977 		btrfs_qgroup_free_meta_prealloc(root, num_bytes);
5978 
5979 	return ret;
5980 }
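
/*
 * Snapshot creation is the typical caller: reserve enough metadata for the
 * new root items up front, then drop the leftovers when the operation is
 * done.  Rough shape (the item count and parent_root name are illustrative):
 *
 *	struct btrfs_block_rsv rsv;
 *
 *	btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(parent_root, &rsv, 8, true);
 *	if (ret)
 *		return ret;
 *	// ... create the snapshot/subvolume items ...
 *	btrfs_subvolume_release_metadata(fs_info, &rsv);
 */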
5981 
5982 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5983 				      struct btrfs_block_rsv *rsv)
5984 {
5985 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5986 }
5987 
5988 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5989 						 struct btrfs_inode *inode)
5990 {
5991 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5992 	u64 reserve_size = 0;
5993 	u64 qgroup_rsv_size = 0;
5994 	u64 csum_leaves;
5995 	unsigned outstanding_extents;
5996 
5997 	lockdep_assert_held(&inode->lock);
5998 	outstanding_extents = inode->outstanding_extents;
5999 	if (outstanding_extents)
6000 		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6001 						outstanding_extents + 1);
6002 	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6003 						 inode->csum_bytes);
6004 	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6005 						       csum_leaves);
6006 	/*
6007 	 * For qgroup rsv, the calculation is very simple:
6008 	 * account one nodesize for each outstanding extent
6009 	 *
	 * This overestimates in most cases.
6011 	 */
6012 	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
6013 
6014 	spin_lock(&block_rsv->lock);
6015 	block_rsv->size = reserve_size;
6016 	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6017 	spin_unlock(&block_rsv->lock);
6018 }
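
/*
 * Worked example of the qgroup side: with a 16K nodesize and 4 outstanding
 * extents, qgroup_rsv_size becomes 4 * 16K = 64K, regardless of how many
 * csum leaves the data will need; only the metadata reserve_size above
 * scales with csum_bytes.
 */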
6019 
6020 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6021 {
6022 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6023 	unsigned nr_extents;
6024 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6025 	int ret = 0;
6026 	bool delalloc_lock = true;
6027 
	/*
	 * If we are a free space inode we need to not flush since we will be in
6029 	 * the middle of a transaction commit.  We also don't need the delalloc
6030 	 * mutex since we won't race with anybody.  We need this mostly to make
6031 	 * lockdep shut its filthy mouth.
6032 	 *
6033 	 * If we have a transaction open (can happen if we call truncate_block
6034 	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6035 	 */
6036 	if (btrfs_is_free_space_inode(inode)) {
6037 		flush = BTRFS_RESERVE_NO_FLUSH;
6038 		delalloc_lock = false;
6039 	} else {
6040 		if (current->journal_info)
6041 			flush = BTRFS_RESERVE_FLUSH_LIMIT;
6042 
6043 		if (btrfs_transaction_in_commit(fs_info))
6044 			schedule_timeout(1);
6045 	}
6046 
6047 	if (delalloc_lock)
6048 		mutex_lock(&inode->delalloc_mutex);
6049 
6050 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6051 
6052 	/* Add our new extents and calculate the new rsv size. */
6053 	spin_lock(&inode->lock);
6054 	nr_extents = count_max_extents(num_bytes);
6055 	btrfs_mod_outstanding_extents(inode, nr_extents);
6056 	inode->csum_bytes += num_bytes;
6057 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6058 	spin_unlock(&inode->lock);
6059 
6060 	ret = btrfs_inode_rsv_refill(inode, flush);
6061 	if (unlikely(ret))
6062 		goto out_fail;
6063 
6064 	if (delalloc_lock)
6065 		mutex_unlock(&inode->delalloc_mutex);
6066 	return 0;
6067 
6068 out_fail:
6069 	spin_lock(&inode->lock);
6070 	nr_extents = count_max_extents(num_bytes);
6071 	btrfs_mod_outstanding_extents(inode, -nr_extents);
6072 	inode->csum_bytes -= num_bytes;
6073 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6074 	spin_unlock(&inode->lock);
6075 
6076 	btrfs_inode_rsv_release(inode, true);
6077 	if (delalloc_lock)
6078 		mutex_unlock(&inode->delalloc_mutex);
6079 	return ret;
6080 }
6081 
6082 /**
6083  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6084  * @inode: the inode to release the reservation for.
6085  * @num_bytes: the number of bytes we are releasing.
6086  * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6087  *
6088  * This will release the metadata reservation for an inode.  This can be called
6089  * once we complete IO for a given set of bytes to release their metadata
6090  * reservations, or on error for the same reason.
6091  */
6092 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6093 				     bool qgroup_free)
6094 {
6095 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6096 
6097 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6098 	spin_lock(&inode->lock);
6099 	inode->csum_bytes -= num_bytes;
6100 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6101 	spin_unlock(&inode->lock);
6102 
6103 	if (btrfs_is_testing(fs_info))
6104 		return;
6105 
6106 	btrfs_inode_rsv_release(inode, qgroup_free);
6107 }
6108 
6109 /**
6110  * btrfs_delalloc_release_extents - release our outstanding_extents
6111  * @inode: the inode to balance the reservation for.
 * @num_bytes: the number of bytes we originally reserved
 * @qgroup_free: whether to free the qgroup meta reservation or convert it to
 *		 a per-trans reservation.
6114  *
6115  * When we reserve space we increase outstanding_extents for the extents we may
6116  * add.  Once we've set the range as delalloc or created our ordered extents we
6117  * have outstanding_extents to track the real usage, so we use this to free our
6118  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6119  * with btrfs_delalloc_reserve_metadata.
6120  */
6121 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6122 				    bool qgroup_free)
6123 {
6124 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6125 	unsigned num_extents;
6126 
6127 	spin_lock(&inode->lock);
6128 	num_extents = count_max_extents(num_bytes);
6129 	btrfs_mod_outstanding_extents(inode, -num_extents);
6130 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6131 	spin_unlock(&inode->lock);
6132 
6133 	if (btrfs_is_testing(fs_info))
6134 		return;
6135 
6136 	btrfs_inode_rsv_release(inode, qgroup_free);
6137 }
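
/*
 * The pairing described above, as a write path usually shapes it (a sketch;
 * the qgroup_free value depends on whether the reservation needs to be freed
 * outright or converted, see btrfs_inode_rsv_release()):
 *
 *	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), num_bytes);
 *	if (ret)
 *		return ret;
 *	// ... set the range delalloc / create the ordered extents ...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), num_bytes, qgroup_free);
 *	// on failure, also drop the byte reservation itself:
 *	//	btrfs_delalloc_release_metadata(BTRFS_I(inode), num_bytes, true);
 */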
6138 
6139 /**
6140  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6141  * delalloc
6142  * @inode: inode we're writing to
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 * @reserved: mandatory parameter, records the actually reserved qgroup ranges
 * 	      of the current reservation.
6147  *
 * This will do the following things:
 *
 * o reserve space in the data space info for num bytes and reserve the
 *   corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * o reserve space for metadata, based on the number of outstanding extents
 *   and how many csums will be needed; also reserve metadata space in a per
 *   root over-reserve method.
 * o add to the inode's delalloc_bytes
 * o add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
6160  *
6161  * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
6163  */
6164 int btrfs_delalloc_reserve_space(struct inode *inode,
6165 			struct extent_changeset **reserved, u64 start, u64 len)
6166 {
6167 	int ret;
6168 
6169 	ret = btrfs_check_data_free_space(inode, reserved, start, len);
6170 	if (ret < 0)
6171 		return ret;
6172 	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6173 	if (ret < 0)
6174 		btrfs_free_reserved_data_space(inode, *reserved, start, len);
6175 	return ret;
6176 }
6177 
6178 /**
6179  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6180  * @inode: inode we're releasing space for
 * @reserved: the extent_changeset that recorded the actually reserved qgroup
 *	      ranges
 * @start: start position of the space already reserved
 * @len: the length of the space already reserved
 * @qgroup_free: whether to free or convert the qgroup reserved space
6184  *
6185  * This function will release the metadata space that was not used and will
6186  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6187  * list if there are no delalloc bytes left.
 * It will also handle the qgroup reserved space.
6189  */
6190 void btrfs_delalloc_release_space(struct inode *inode,
6191 				  struct extent_changeset *reserved,
6192 				  u64 start, u64 len, bool qgroup_free)
6193 {
6194 	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6195 	btrfs_free_reserved_data_space(inode, reserved, start, len);
6196 }
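
/*
 * Putting the two helpers above together, in the shape a buffered-write
 * style caller usually takes (a sketch; copy_and_mark_delalloc() is a
 * hypothetical stand-in for the actual copy/delalloc work):
 *
 *	struct extent_changeset *data_reserved = NULL;
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, count);
 *	if (ret)
 *		return ret;
 *	ret = copy_and_mark_delalloc(...);	// hypothetical helper
 *	if (ret)
 *		// undo both the data and metadata reservations
 *		btrfs_delalloc_release_space(inode, data_reserved, pos,
 *					     count, true);
 *	extent_changeset_free(data_reserved);
 */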
6197 
6198 static int update_block_group(struct btrfs_trans_handle *trans,
6199 			      struct btrfs_fs_info *info, u64 bytenr,
6200 			      u64 num_bytes, int alloc)
6201 {
6202 	struct btrfs_block_group_cache *cache = NULL;
6203 	u64 total = num_bytes;
6204 	u64 old_val;
6205 	u64 byte_in_group;
6206 	int factor;
6207 
6208 	/* block accounting for super block */
6209 	spin_lock(&info->delalloc_root_lock);
6210 	old_val = btrfs_super_bytes_used(info->super_copy);
6211 	if (alloc)
6212 		old_val += num_bytes;
6213 	else
6214 		old_val -= num_bytes;
6215 	btrfs_set_super_bytes_used(info->super_copy, old_val);
6216 	spin_unlock(&info->delalloc_root_lock);
6217 
6218 	while (total) {
6219 		cache = btrfs_lookup_block_group(info, bytenr);
6220 		if (!cache)
6221 			return -ENOENT;
6222 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6223 				    BTRFS_BLOCK_GROUP_RAID1 |
6224 				    BTRFS_BLOCK_GROUP_RAID10))
6225 			factor = 2;
6226 		else
6227 			factor = 1;
6228 		/*
6229 		 * If this block group has free space cache written out, we
6230 		 * need to make sure to load it if we are removing space.  This
6231 		 * is because we need the unpinning stage to actually add the
6232 		 * space back to the block group, otherwise we will leak space.
6233 		 */
6234 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
6235 			cache_block_group(cache, 1);
6236 
6237 		byte_in_group = bytenr - cache->key.objectid;
6238 		WARN_ON(byte_in_group > cache->key.offset);
6239 
6240 		spin_lock(&cache->space_info->lock);
6241 		spin_lock(&cache->lock);
6242 
6243 		if (btrfs_test_opt(info, SPACE_CACHE) &&
6244 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
6245 			cache->disk_cache_state = BTRFS_DC_CLEAR;
6246 
6247 		old_val = btrfs_block_group_used(&cache->item);
6248 		num_bytes = min(total, cache->key.offset - byte_in_group);
6249 		if (alloc) {
6250 			old_val += num_bytes;
6251 			btrfs_set_block_group_used(&cache->item, old_val);
6252 			cache->reserved -= num_bytes;
6253 			cache->space_info->bytes_reserved -= num_bytes;
6254 			cache->space_info->bytes_used += num_bytes;
6255 			cache->space_info->disk_used += num_bytes * factor;
6256 			spin_unlock(&cache->lock);
6257 			spin_unlock(&cache->space_info->lock);
6258 		} else {
6259 			old_val -= num_bytes;
6260 			btrfs_set_block_group_used(&cache->item, old_val);
6261 			cache->pinned += num_bytes;
6262 			cache->space_info->bytes_pinned += num_bytes;
6263 			cache->space_info->bytes_used -= num_bytes;
6264 			cache->space_info->disk_used -= num_bytes * factor;
6265 			spin_unlock(&cache->lock);
6266 			spin_unlock(&cache->space_info->lock);
6267 
6268 			trace_btrfs_space_reservation(info, "pinned",
6269 						      cache->space_info->flags,
6270 						      num_bytes, 1);
6271 			percpu_counter_add(&cache->space_info->total_bytes_pinned,
6272 					   num_bytes);
6273 			set_extent_dirty(info->pinned_extents,
6274 					 bytenr, bytenr + num_bytes - 1,
6275 					 GFP_NOFS | __GFP_NOFAIL);
6276 		}
6277 
6278 		spin_lock(&trans->transaction->dirty_bgs_lock);
6279 		if (list_empty(&cache->dirty_list)) {
6280 			list_add_tail(&cache->dirty_list,
6281 				      &trans->transaction->dirty_bgs);
			trans->transaction->num_dirty_bgs++;
6283 			btrfs_get_block_group(cache);
6284 		}
6285 		spin_unlock(&trans->transaction->dirty_bgs_lock);
6286 
6287 		/*
6288 		 * No longer have used bytes in this block group, queue it for
6289 		 * deletion. We do this after adding the block group to the
6290 		 * dirty list to avoid races between cleaner kthread and space
6291 		 * cache writeout.
6292 		 */
6293 		if (!alloc && old_val == 0) {
6294 			spin_lock(&info->unused_bgs_lock);
6295 			if (list_empty(&cache->bg_list)) {
6296 				btrfs_get_block_group(cache);
6297 				trace_btrfs_add_unused_block_group(cache);
6298 				list_add_tail(&cache->bg_list,
6299 					      &info->unused_bgs);
6300 			}
6301 			spin_unlock(&info->unused_bgs_lock);
6302 		}
6303 
6304 		btrfs_put_block_group(cache);
6305 		total -= num_bytes;
6306 		bytenr += num_bytes;
6307 	}
6308 	return 0;
6309 }
6310 
6311 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6312 {
6313 	struct btrfs_block_group_cache *cache;
6314 	u64 bytenr;
6315 
6316 	spin_lock(&fs_info->block_group_cache_lock);
6317 	bytenr = fs_info->first_logical_byte;
6318 	spin_unlock(&fs_info->block_group_cache_lock);
6319 
6320 	if (bytenr < (u64)-1)
6321 		return bytenr;
6322 
6323 	cache = btrfs_lookup_first_block_group(fs_info, search_start);
6324 	if (!cache)
6325 		return 0;
6326 
6327 	bytenr = cache->key.objectid;
6328 	btrfs_put_block_group(cache);
6329 
6330 	return bytenr;
6331 }
6332 
6333 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6334 			   struct btrfs_block_group_cache *cache,
6335 			   u64 bytenr, u64 num_bytes, int reserved)
6336 {
6337 	spin_lock(&cache->space_info->lock);
6338 	spin_lock(&cache->lock);
6339 	cache->pinned += num_bytes;
6340 	cache->space_info->bytes_pinned += num_bytes;
6341 	if (reserved) {
6342 		cache->reserved -= num_bytes;
6343 		cache->space_info->bytes_reserved -= num_bytes;
6344 	}
6345 	spin_unlock(&cache->lock);
6346 	spin_unlock(&cache->space_info->lock);
6347 
6348 	trace_btrfs_space_reservation(fs_info, "pinned",
6349 				      cache->space_info->flags, num_bytes, 1);
6350 	percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
6351 	set_extent_dirty(fs_info->pinned_extents, bytenr,
6352 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6353 	return 0;
6354 }
6355 
6356 /*
 * this function must be called within a transaction
6358  */
6359 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6360 		     u64 bytenr, u64 num_bytes, int reserved)
6361 {
6362 	struct btrfs_block_group_cache *cache;
6363 
6364 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6365 	BUG_ON(!cache); /* Logic error */
6366 
6367 	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6368 
6369 	btrfs_put_block_group(cache);
6370 	return 0;
6371 }
6372 
6373 /*
 * this function must be called within a transaction
6375  */
6376 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6377 				    u64 bytenr, u64 num_bytes)
6378 {
6379 	struct btrfs_block_group_cache *cache;
6380 	int ret;
6381 
6382 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6383 	if (!cache)
6384 		return -EINVAL;
6385 
6386 	/*
6387 	 * pull in the free space cache (if any) so that our pin
6388 	 * removes the free space from the cache.  We have load_only set
6389 	 * to one because the slow code to read in the free extents does check
6390 	 * the pinned extents.
6391 	 */
6392 	cache_block_group(cache, 1);
6393 
6394 	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6395 
6396 	/* remove us from the free space cache (if we're there at all) */
6397 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6398 	btrfs_put_block_group(cache);
6399 	return ret;
6400 }
6401 
6402 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6403 				   u64 start, u64 num_bytes)
6404 {
6405 	int ret;
6406 	struct btrfs_block_group_cache *block_group;
6407 	struct btrfs_caching_control *caching_ctl;
6408 
6409 	block_group = btrfs_lookup_block_group(fs_info, start);
6410 	if (!block_group)
6411 		return -EINVAL;
6412 
6413 	cache_block_group(block_group, 0);
6414 	caching_ctl = get_caching_control(block_group);
6415 
6416 	if (!caching_ctl) {
6417 		/* Logic error */
6418 		BUG_ON(!block_group_cache_done(block_group));
6419 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6420 	} else {
6421 		mutex_lock(&caching_ctl->mutex);
6422 
6423 		if (start >= caching_ctl->progress) {
6424 			ret = add_excluded_extent(fs_info, start, num_bytes);
6425 		} else if (start + num_bytes <= caching_ctl->progress) {
6426 			ret = btrfs_remove_free_space(block_group,
6427 						      start, num_bytes);
6428 		} else {
6429 			num_bytes = caching_ctl->progress - start;
6430 			ret = btrfs_remove_free_space(block_group,
6431 						      start, num_bytes);
6432 			if (ret)
6433 				goto out_lock;
6434 
6435 			num_bytes = (start + num_bytes) -
6436 				caching_ctl->progress;
6437 			start = caching_ctl->progress;
6438 			ret = add_excluded_extent(fs_info, start, num_bytes);
6439 		}
6440 out_lock:
6441 		mutex_unlock(&caching_ctl->mutex);
6442 		put_caching_control(caching_ctl);
6443 	}
6444 	btrfs_put_block_group(block_group);
6445 	return ret;
6446 }
6447 
6448 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6449 				 struct extent_buffer *eb)
6450 {
6451 	struct btrfs_file_extent_item *item;
6452 	struct btrfs_key key;
6453 	int found_type;
6454 	int i;
6455 	int ret = 0;
6456 
6457 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6458 		return 0;
6459 
6460 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
6461 		btrfs_item_key_to_cpu(eb, &key, i);
6462 		if (key.type != BTRFS_EXTENT_DATA_KEY)
6463 			continue;
6464 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6465 		found_type = btrfs_file_extent_type(eb, item);
6466 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
6467 			continue;
6468 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6469 			continue;
6470 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6471 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6472 		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6473 		if (ret)
6474 			break;
6475 	}
6476 
6477 	return ret;
6478 }
6479 
6480 static void
6481 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6482 {
6483 	atomic_inc(&bg->reservations);
6484 }
6485 
6486 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6487 					const u64 start)
6488 {
6489 	struct btrfs_block_group_cache *bg;
6490 
6491 	bg = btrfs_lookup_block_group(fs_info, start);
6492 	ASSERT(bg);
6493 	if (atomic_dec_and_test(&bg->reservations))
6494 		wake_up_var(&bg->reservations);
6495 	btrfs_put_block_group(bg);
6496 }
6497 
6498 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6499 {
6500 	struct btrfs_space_info *space_info = bg->space_info;
6501 
6502 	ASSERT(bg->ro);
6503 
6504 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6505 		return;
6506 
6507 	/*
6508 	 * Our block group is read only but before we set it to read only,
	 * some task might have allocated an extent from it already, but it
6510 	 * has not yet created a respective ordered extent (and added it to a
6511 	 * root's list of ordered extents).
6512 	 * Therefore wait for any task currently allocating extents, since the
6513 	 * block group's reservations counter is incremented while a read lock
6514 	 * on the groups' semaphore is held and decremented after releasing
6515 	 * the read access on that semaphore and creating the ordered extent.
6516 	 */
6517 	down_write(&space_info->groups_sem);
6518 	up_write(&space_info->groups_sem);
6519 
6520 	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6521 }
6522 
6523 /**
6524  * btrfs_add_reserved_bytes - update the block_group and space info counters
6525  * @cache:	The cache we are manipulating
 * @ram_bytes:  The number of bytes of file content; this will be the same as
 *              @num_bytes except for the compression path.
6528  * @num_bytes:	The number of bytes in question
6529  * @delalloc:   The blocks are allocated for the delalloc write
6530  *
 * This is called by the allocator when it reserves space.  If the block group
 * has become read only we cannot make the reservation and return -EAGAIN;
 * otherwise this function always succeeds.
6534  */
6535 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6536 				    u64 ram_bytes, u64 num_bytes, int delalloc)
6537 {
6538 	struct btrfs_space_info *space_info = cache->space_info;
6539 	int ret = 0;
6540 
6541 	spin_lock(&space_info->lock);
6542 	spin_lock(&cache->lock);
6543 	if (cache->ro) {
6544 		ret = -EAGAIN;
6545 	} else {
6546 		cache->reserved += num_bytes;
6547 		space_info->bytes_reserved += num_bytes;
6548 
6549 		trace_btrfs_space_reservation(cache->fs_info,
6550 				"space_info", space_info->flags,
6551 				ram_bytes, 0);
6552 		space_info->bytes_may_use -= ram_bytes;
6553 		if (delalloc)
6554 			cache->delalloc_bytes += num_bytes;
6555 	}
6556 	spin_unlock(&cache->lock);
6557 	spin_unlock(&space_info->lock);
6558 	return ret;
6559 }
6560 
6561 /**
6562  * btrfs_free_reserved_bytes - update the block_group and space info counters
6563  * @cache:      The cache we are manipulating
6564  * @num_bytes:  The number of bytes in question
6565  * @delalloc:   The blocks are allocated for the delalloc write
6566  *
6567  * This is called by somebody who is freeing space that was never actually used
 * on disk.  For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this to
 * clear the reservation.
 */
6573 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6574 				     u64 num_bytes, int delalloc)
6575 {
6576 	struct btrfs_space_info *space_info = cache->space_info;
6577 	int ret = 0;
6578 
6579 	spin_lock(&space_info->lock);
6580 	spin_lock(&cache->lock);
6581 	if (cache->ro)
6582 		space_info->bytes_readonly += num_bytes;
6583 	cache->reserved -= num_bytes;
6584 	space_info->bytes_reserved -= num_bytes;
6585 
6586 	if (delalloc)
6587 		cache->delalloc_bytes -= num_bytes;
6588 	spin_unlock(&cache->lock);
6589 	spin_unlock(&space_info->lock);
6590 	return ret;
6591 }
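
/*
 * The two counters above pair up around an allocation attempt: the allocator
 * calls btrfs_add_reserved_bytes() once it has picked a block group, and if
 * the space ends up never being used on disk btrfs_free_reserved_bytes()
 * gives the bytes back (a sketch; the variables come from the caller):
 *
 *	ret = btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes, delalloc);
 *	if (ret == -EAGAIN) {
 *		// the block group went read only, pick another one
 *	}
 *	// ... and later, if the reserved space is never written:
 *	btrfs_free_reserved_bytes(cache, num_bytes, delalloc);
 */
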
6592 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6593 {
6594 	struct btrfs_caching_control *next;
6595 	struct btrfs_caching_control *caching_ctl;
6596 	struct btrfs_block_group_cache *cache;
6597 
6598 	down_write(&fs_info->commit_root_sem);
6599 
6600 	list_for_each_entry_safe(caching_ctl, next,
6601 				 &fs_info->caching_block_groups, list) {
6602 		cache = caching_ctl->block_group;
6603 		if (block_group_cache_done(cache)) {
6604 			cache->last_byte_to_unpin = (u64)-1;
6605 			list_del_init(&caching_ctl->list);
6606 			put_caching_control(caching_ctl);
6607 		} else {
6608 			cache->last_byte_to_unpin = caching_ctl->progress;
6609 		}
6610 	}
6611 
6612 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6613 		fs_info->pinned_extents = &fs_info->freed_extents[1];
6614 	else
6615 		fs_info->pinned_extents = &fs_info->freed_extents[0];
6616 
6617 	up_write(&fs_info->commit_root_sem);
6618 
6619 	update_global_block_rsv(fs_info);
6620 }
6621 
6622 /*
6623  * Returns the free cluster for the given space info and sets empty_cluster to
6624  * what it should be based on the mount options.
6625  */
6626 static struct btrfs_free_cluster *
6627 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6628 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
6629 {
6630 	struct btrfs_free_cluster *ret = NULL;
6631 
6632 	*empty_cluster = 0;
6633 	if (btrfs_mixed_space_info(space_info))
6634 		return ret;
6635 
6636 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6637 		ret = &fs_info->meta_alloc_cluster;
6638 		if (btrfs_test_opt(fs_info, SSD))
6639 			*empty_cluster = SZ_2M;
6640 		else
6641 			*empty_cluster = SZ_64K;
6642 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6643 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
6644 		*empty_cluster = SZ_2M;
6645 		ret = &fs_info->data_alloc_cluster;
6646 	}
6647 
6648 	return ret;
6649 }
6650 
6651 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6652 			      u64 start, u64 end,
6653 			      const bool return_free_space)
6654 {
6655 	struct btrfs_block_group_cache *cache = NULL;
6656 	struct btrfs_space_info *space_info;
6657 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6658 	struct btrfs_free_cluster *cluster = NULL;
6659 	u64 len;
6660 	u64 total_unpinned = 0;
6661 	u64 empty_cluster = 0;
6662 	bool readonly;
6663 
6664 	while (start <= end) {
6665 		readonly = false;
6666 		if (!cache ||
6667 		    start >= cache->key.objectid + cache->key.offset) {
6668 			if (cache)
6669 				btrfs_put_block_group(cache);
6670 			total_unpinned = 0;
6671 			cache = btrfs_lookup_block_group(fs_info, start);
6672 			BUG_ON(!cache); /* Logic error */
6673 
6674 			cluster = fetch_cluster_info(fs_info,
6675 						     cache->space_info,
6676 						     &empty_cluster);
6677 			empty_cluster <<= 1;
6678 		}
6679 
6680 		len = cache->key.objectid + cache->key.offset - start;
6681 		len = min(len, end + 1 - start);
6682 
6683 		if (start < cache->last_byte_to_unpin) {
6684 			len = min(len, cache->last_byte_to_unpin - start);
6685 			if (return_free_space)
6686 				btrfs_add_free_space(cache, start, len);
6687 		}
6688 
6689 		start += len;
6690 		total_unpinned += len;
6691 		space_info = cache->space_info;
6692 
6693 		/*
6694 		 * If this space cluster has been marked as fragmented and we've
6695 		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it, go ahead and clear the
		 * fragmented check.
6698 		 */
6699 		if (cluster && cluster->fragmented &&
6700 		    total_unpinned > empty_cluster) {
6701 			spin_lock(&cluster->lock);
6702 			cluster->fragmented = 0;
6703 			spin_unlock(&cluster->lock);
6704 		}
6705 
6706 		spin_lock(&space_info->lock);
6707 		spin_lock(&cache->lock);
6708 		cache->pinned -= len;
6709 		space_info->bytes_pinned -= len;
6710 
6711 		trace_btrfs_space_reservation(fs_info, "pinned",
6712 					      space_info->flags, len, 0);
6713 		space_info->max_extent_size = 0;
6714 		percpu_counter_add(&space_info->total_bytes_pinned, -len);
6715 		if (cache->ro) {
6716 			space_info->bytes_readonly += len;
6717 			readonly = true;
6718 		}
6719 		spin_unlock(&cache->lock);
6720 		if (!readonly && return_free_space &&
6721 		    global_rsv->space_info == space_info) {
6722 			u64 to_add = len;
6723 
6724 			spin_lock(&global_rsv->lock);
6725 			if (!global_rsv->full) {
6726 				to_add = min(len, global_rsv->size -
6727 					     global_rsv->reserved);
6728 				global_rsv->reserved += to_add;
6729 				space_info->bytes_may_use += to_add;
6730 				if (global_rsv->reserved >= global_rsv->size)
6731 					global_rsv->full = 1;
6732 				trace_btrfs_space_reservation(fs_info,
6733 							      "space_info",
6734 							      space_info->flags,
6735 							      to_add, 1);
6736 				len -= to_add;
6737 			}
6738 			spin_unlock(&global_rsv->lock);
6739 			/* Add to any tickets we may have */
6740 			if (len)
6741 				space_info_add_new_bytes(fs_info, space_info,
6742 							 len);
6743 		}
6744 		spin_unlock(&space_info->lock);
6745 	}
6746 
6747 	if (cache)
6748 		btrfs_put_block_group(cache);
6749 	return 0;
6750 }
6751 
6752 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6753 {
6754 	struct btrfs_fs_info *fs_info = trans->fs_info;
6755 	struct btrfs_block_group_cache *block_group, *tmp;
6756 	struct list_head *deleted_bgs;
6757 	struct extent_io_tree *unpin;
6758 	u64 start;
6759 	u64 end;
6760 	int ret;
6761 
6762 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6763 		unpin = &fs_info->freed_extents[1];
6764 	else
6765 		unpin = &fs_info->freed_extents[0];
6766 
6767 	while (!trans->aborted) {
6768 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
6769 		ret = find_first_extent_bit(unpin, 0, &start, &end,
6770 					    EXTENT_DIRTY, NULL);
6771 		if (ret) {
6772 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6773 			break;
6774 		}
6775 
6776 		if (btrfs_test_opt(fs_info, DISCARD))
6777 			ret = btrfs_discard_extent(fs_info, start,
6778 						   end + 1 - start, NULL);
6779 
6780 		clear_extent_dirty(unpin, start, end);
6781 		unpin_extent_range(fs_info, start, end, true);
6782 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6783 		cond_resched();
6784 	}
6785 
6786 	/*
6787 	 * Transaction is finished.  We don't need the lock anymore.  We
6788 	 * do need to clean up the block groups in case of a transaction
6789 	 * abort.
6790 	 */
6791 	deleted_bgs = &trans->transaction->deleted_bgs;
6792 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6793 		u64 trimmed = 0;
6794 
6795 		ret = -EROFS;
6796 		if (!trans->aborted)
6797 			ret = btrfs_discard_extent(fs_info,
6798 						   block_group->key.objectid,
6799 						   block_group->key.offset,
6800 						   &trimmed);
6801 
6802 		list_del_init(&block_group->bg_list);
6803 		btrfs_put_block_group_trimming(block_group);
6804 		btrfs_put_block_group(block_group);
6805 
6806 		if (ret) {
6807 			const char *errstr = btrfs_decode_error(ret);
6808 			btrfs_warn(fs_info,
6809 			   "discard failed while removing blockgroup: errno=%d %s",
6810 				   ret, errstr);
6811 		}
6812 	}
6813 
6814 	return 0;
6815 }
6816 
6817 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6818 				struct btrfs_fs_info *info,
6819 				struct btrfs_delayed_ref_node *node, u64 parent,
6820 				u64 root_objectid, u64 owner_objectid,
6821 				u64 owner_offset, int refs_to_drop,
6822 				struct btrfs_delayed_extent_op *extent_op)
6823 {
6824 	struct btrfs_key key;
6825 	struct btrfs_path *path;
6826 	struct btrfs_root *extent_root = info->extent_root;
6827 	struct extent_buffer *leaf;
6828 	struct btrfs_extent_item *ei;
6829 	struct btrfs_extent_inline_ref *iref;
6830 	int ret;
6831 	int is_data;
6832 	int extent_slot = 0;
6833 	int found_extent = 0;
6834 	int num_to_del = 1;
6835 	u32 item_size;
6836 	u64 refs;
6837 	u64 bytenr = node->bytenr;
6838 	u64 num_bytes = node->num_bytes;
6839 	int last_ref = 0;
6840 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6841 
6842 	path = btrfs_alloc_path();
6843 	if (!path)
6844 		return -ENOMEM;
6845 
6846 	path->reada = READA_FORWARD;
6847 	path->leave_spinning = 1;
6848 
6849 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6850 	BUG_ON(!is_data && refs_to_drop != 1);
6851 
6852 	if (is_data)
6853 		skinny_metadata = false;
6854 
6855 	ret = lookup_extent_backref(trans, info, path, &iref,
6856 				    bytenr, num_bytes, parent,
6857 				    root_objectid, owner_objectid,
6858 				    owner_offset);
6859 	if (ret == 0) {
6860 		extent_slot = path->slots[0];
6861 		while (extent_slot >= 0) {
6862 			btrfs_item_key_to_cpu(path->nodes[0], &key,
6863 					      extent_slot);
6864 			if (key.objectid != bytenr)
6865 				break;
6866 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6867 			    key.offset == num_bytes) {
6868 				found_extent = 1;
6869 				break;
6870 			}
6871 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
6872 			    key.offset == owner_objectid) {
6873 				found_extent = 1;
6874 				break;
6875 			}
6876 			if (path->slots[0] - extent_slot > 5)
6877 				break;
6878 			extent_slot--;
6879 		}
6880 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6881 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6882 		if (found_extent && item_size < sizeof(*ei))
6883 			found_extent = 0;
6884 #endif
6885 		if (!found_extent) {
6886 			BUG_ON(iref);
6887 			ret = remove_extent_backref(trans, info, path, NULL,
6888 						    refs_to_drop,
6889 						    is_data, &last_ref);
6890 			if (ret) {
6891 				btrfs_abort_transaction(trans, ret);
6892 				goto out;
6893 			}
6894 			btrfs_release_path(path);
6895 			path->leave_spinning = 1;
6896 
6897 			key.objectid = bytenr;
6898 			key.type = BTRFS_EXTENT_ITEM_KEY;
6899 			key.offset = num_bytes;
6900 
6901 			if (!is_data && skinny_metadata) {
6902 				key.type = BTRFS_METADATA_ITEM_KEY;
6903 				key.offset = owner_objectid;
6904 			}
6905 
6906 			ret = btrfs_search_slot(trans, extent_root,
6907 						&key, path, -1, 1);
6908 			if (ret > 0 && skinny_metadata && path->slots[0]) {
6909 				/*
6910 				 * Couldn't find our skinny metadata item,
6911 				 * see if we have ye olde extent item.
6912 				 */
6913 				path->slots[0]--;
6914 				btrfs_item_key_to_cpu(path->nodes[0], &key,
6915 						      path->slots[0]);
6916 				if (key.objectid == bytenr &&
6917 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
6918 				    key.offset == num_bytes)
6919 					ret = 0;
6920 			}
6921 
6922 			if (ret > 0 && skinny_metadata) {
6923 				skinny_metadata = false;
6924 				key.objectid = bytenr;
6925 				key.type = BTRFS_EXTENT_ITEM_KEY;
6926 				key.offset = num_bytes;
6927 				btrfs_release_path(path);
6928 				ret = btrfs_search_slot(trans, extent_root,
6929 							&key, path, -1, 1);
6930 			}
6931 
6932 			if (ret) {
6933 				btrfs_err(info,
6934 					  "umm, got %d back from search, was looking for %llu",
6935 					  ret, bytenr);
6936 				if (ret > 0)
6937 					btrfs_print_leaf(path->nodes[0]);
6938 			}
6939 			if (ret < 0) {
6940 				btrfs_abort_transaction(trans, ret);
6941 				goto out;
6942 			}
6943 			extent_slot = path->slots[0];
6944 		}
6945 	} else if (WARN_ON(ret == -ENOENT)) {
6946 		btrfs_print_leaf(path->nodes[0]);
6947 		btrfs_err(info,
6948 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6949 			bytenr, parent, root_objectid, owner_objectid,
6950 			owner_offset);
6951 		btrfs_abort_transaction(trans, ret);
6952 		goto out;
6953 	} else {
6954 		btrfs_abort_transaction(trans, ret);
6955 		goto out;
6956 	}
6957 
6958 	leaf = path->nodes[0];
6959 	item_size = btrfs_item_size_nr(leaf, extent_slot);
6960 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6961 	if (item_size < sizeof(*ei)) {
6962 		BUG_ON(found_extent || extent_slot != path->slots[0]);
6963 		ret = convert_extent_item_v0(trans, info, path, owner_objectid,
6964 					     0);
6965 		if (ret < 0) {
6966 			btrfs_abort_transaction(trans, ret);
6967 			goto out;
6968 		}
6969 
6970 		btrfs_release_path(path);
6971 		path->leave_spinning = 1;
6972 
6973 		key.objectid = bytenr;
6974 		key.type = BTRFS_EXTENT_ITEM_KEY;
6975 		key.offset = num_bytes;
6976 
6977 		ret = btrfs_search_slot(trans, extent_root, &key, path,
6978 					-1, 1);
6979 		if (ret) {
6980 			btrfs_err(info,
6981 				  "umm, got %d back from search, was looking for %llu",
6982 				ret, bytenr);
6983 			btrfs_print_leaf(path->nodes[0]);
6984 		}
6985 		if (ret < 0) {
6986 			btrfs_abort_transaction(trans, ret);
6987 			goto out;
6988 		}
6989 
6990 		extent_slot = path->slots[0];
6991 		leaf = path->nodes[0];
6992 		item_size = btrfs_item_size_nr(leaf, extent_slot);
6993 	}
6994 #endif
6995 	BUG_ON(item_size < sizeof(*ei));
6996 	ei = btrfs_item_ptr(leaf, extent_slot,
6997 			    struct btrfs_extent_item);
6998 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6999 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
7000 		struct btrfs_tree_block_info *bi;
7001 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7002 		bi = (struct btrfs_tree_block_info *)(ei + 1);
7003 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7004 	}
7005 
7006 	refs = btrfs_extent_refs(leaf, ei);
7007 	if (refs < refs_to_drop) {
7008 		btrfs_err(info,
7009 			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7010 			  refs_to_drop, refs, bytenr);
7011 		ret = -EINVAL;
7012 		btrfs_abort_transaction(trans, ret);
7013 		goto out;
7014 	}
7015 	refs -= refs_to_drop;
7016 
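	/*
	 * At this point the extent item has been located.  If references
	 * remain after this drop we only update the ref count (and run any
	 * pending extent op); otherwise the extent item itself is deleted
	 * along with any separate backref item next to it, data checksums
	 * are dropped, the range is returned to the free space tree and the
	 * block group usage is updated.
	 */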
7017 	if (refs > 0) {
7018 		if (extent_op)
7019 			__run_delayed_extent_op(extent_op, leaf, ei);
7020 		/*
7021 		 * In the case of an inline back ref, the reference count will
7022 		 * be updated by remove_extent_backref
7023 		 */
7024 		if (iref) {
7025 			BUG_ON(!found_extent);
7026 		} else {
7027 			btrfs_set_extent_refs(leaf, ei, refs);
7028 			btrfs_mark_buffer_dirty(leaf);
7029 		}
7030 		if (found_extent) {
7031 			ret = remove_extent_backref(trans, info, path,
7032 						    iref, refs_to_drop,
7033 						    is_data, &last_ref);
7034 			if (ret) {
7035 				btrfs_abort_transaction(trans, ret);
7036 				goto out;
7037 			}
7038 		}
7039 	} else {
7040 		if (found_extent) {
7041 			BUG_ON(is_data && refs_to_drop !=
7042 			       extent_data_ref_count(path, iref));
7043 			if (iref) {
7044 				BUG_ON(path->slots[0] != extent_slot);
7045 			} else {
7046 				BUG_ON(path->slots[0] != extent_slot + 1);
7047 				path->slots[0] = extent_slot;
7048 				num_to_del = 2;
7049 			}
7050 		}
7051 
7052 		last_ref = 1;
7053 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7054 				      num_to_del);
7055 		if (ret) {
7056 			btrfs_abort_transaction(trans, ret);
7057 			goto out;
7058 		}
7059 		btrfs_release_path(path);
7060 
7061 		if (is_data) {
7062 			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7063 			if (ret) {
7064 				btrfs_abort_transaction(trans, ret);
7065 				goto out;
7066 			}
7067 		}
7068 
7069 		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
7070 		if (ret) {
7071 			btrfs_abort_transaction(trans, ret);
7072 			goto out;
7073 		}
7074 
7075 		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7076 		if (ret) {
7077 			btrfs_abort_transaction(trans, ret);
7078 			goto out;
7079 		}
7080 	}
7081 	btrfs_release_path(path);
7082 
7083 out:
7084 	btrfs_free_path(path);
7085 	return ret;
7086 }
7087 
7088 /*
7089  * when we free a block, it is possible (and likely) that we free the last
7090  * delayed ref for that extent as well.  This searches the delayed ref tree for
7091  * a given extent, and if there are no other delayed refs to be processed, it
7092  * removes it from the tree.
7093  */
7094 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7095 				      u64 bytenr)
7096 {
7097 	struct btrfs_delayed_ref_head *head;
7098 	struct btrfs_delayed_ref_root *delayed_refs;
7099 	int ret = 0;
7100 
7101 	delayed_refs = &trans->transaction->delayed_refs;
7102 	spin_lock(&delayed_refs->lock);
7103 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7104 	if (!head)
7105 		goto out_delayed_unlock;
7106 
7107 	spin_lock(&head->lock);
7108 	if (!RB_EMPTY_ROOT(&head->ref_tree))
7109 		goto out;
7110 
7111 	if (head->extent_op) {
7112 		if (!head->must_insert_reserved)
7113 			goto out;
7114 		btrfs_free_delayed_extent_op(head->extent_op);
7115 		head->extent_op = NULL;
7116 	}
7117 
7118 	/*
7119 	 * waiting for the lock here would deadlock.  If someone else has it
7120 	 * locked, they are already in the process of dropping it anyway
7121 	 */
7122 	if (!mutex_trylock(&head->mutex))
7123 		goto out;
7124 
7125 	/*
7126 	 * at this point we have a head with no other entries.  Go
7127 	 * ahead and process it.
7128 	 */
7129 	rb_erase(&head->href_node, &delayed_refs->href_root);
7130 	RB_CLEAR_NODE(&head->href_node);
7131 	atomic_dec(&delayed_refs->num_entries);
7132 
7133 	/*
7134 	 * we don't take a ref on the node because we're removing it from the
7135 	 * tree, so we just steal the ref the tree was holding.
7136 	 */
7137 	delayed_refs->num_heads--;
7138 	if (head->processing == 0)
7139 		delayed_refs->num_heads_ready--;
7140 	head->processing = 0;
7141 	spin_unlock(&head->lock);
7142 	spin_unlock(&delayed_refs->lock);
7143 
7144 	BUG_ON(head->extent_op);
7145 	if (head->must_insert_reserved)
7146 		ret = 1;
7147 
7148 	mutex_unlock(&head->mutex);
7149 	btrfs_put_delayed_ref_head(head);
7150 	return ret;
7151 out:
7152 	spin_unlock(&head->lock);
7153 
7154 out_delayed_unlock:
7155 	spin_unlock(&delayed_refs->lock);
7156 	return 0;
7157 }
7158 
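/*
 * Free a tree block.  For non-log trees a delayed ref drop is queued first.
 * If this was the last reference and the block was allocated in the running
 * transaction, the space can be reclaimed right away: blocks that were never
 * written go straight back to the free space cache, while blocks already
 * written out are pinned instead.
 */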
7159 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7160 			   struct btrfs_root *root,
7161 			   struct extent_buffer *buf,
7162 			   u64 parent, int last_ref)
7163 {
7164 	struct btrfs_fs_info *fs_info = root->fs_info;
7165 	int pin = 1;
7166 	int ret;
7167 
7168 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7169 		int old_ref_mod, new_ref_mod;
7170 
7171 		btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7172 				   root->root_key.objectid,
7173 				   btrfs_header_level(buf), 0,
7174 				   BTRFS_DROP_DELAYED_REF);
7175 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7176 						 buf->len, parent,
7177 						 root->root_key.objectid,
7178 						 btrfs_header_level(buf),
7179 						 BTRFS_DROP_DELAYED_REF, NULL,
7180 						 &old_ref_mod, &new_ref_mod);
7181 		BUG_ON(ret); /* -ENOMEM */
7182 		pin = old_ref_mod >= 0 && new_ref_mod < 0;
7183 	}
7184 
7185 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7186 		struct btrfs_block_group_cache *cache;
7187 
7188 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7189 			ret = check_ref_cleanup(trans, buf->start);
7190 			if (!ret)
7191 				goto out;
7192 		}
7193 
7194 		pin = 0;
7195 		cache = btrfs_lookup_block_group(fs_info, buf->start);
7196 
7197 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7198 			pin_down_extent(fs_info, cache, buf->start,
7199 					buf->len, 1);
7200 			btrfs_put_block_group(cache);
7201 			goto out;
7202 		}
7203 
7204 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7205 
7206 		btrfs_add_free_space(cache, buf->start, buf->len);
7207 		btrfs_free_reserved_bytes(cache, buf->len, 0);
7208 		btrfs_put_block_group(cache);
7209 		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7210 	}
7211 out:
7212 	if (pin)
7213 		add_pinned_bytes(fs_info, buf->len, true,
7214 				 root->root_key.objectid);
7215 
7216 	if (last_ref) {
7217 		/*
7218 		 * Deleting the buffer, clear the corrupt flag since it doesn't
7219 		 * matter anymore.
7220 		 */
7221 		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7222 	}
7223 }
7224 
7225 /* Can return -ENOMEM */
7226 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7227 		      struct btrfs_root *root,
7228 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7229 		      u64 owner, u64 offset)
7230 {
7231 	struct btrfs_fs_info *fs_info = root->fs_info;
7232 	int old_ref_mod, new_ref_mod;
7233 	int ret;
7234 
7235 	if (btrfs_is_testing(fs_info))
7236 		return 0;
7237 
7238 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7239 		btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7240 				   root_objectid, owner, offset,
7241 				   BTRFS_DROP_DELAYED_REF);
7242 
7243 	/*
7244 	 * tree log blocks never actually go into the extent allocation
7245 	 * tree, just update pinning info and exit early.
7246 	 */
7247 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7248 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7249 		/* unlocks the pinned mutex */
7250 		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7251 		old_ref_mod = new_ref_mod = 0;
7252 		ret = 0;
7253 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7254 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7255 						 num_bytes, parent,
7256 						 root_objectid, (int)owner,
7257 						 BTRFS_DROP_DELAYED_REF, NULL,
7258 						 &old_ref_mod, &new_ref_mod);
7259 	} else {
7260 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7261 						 num_bytes, parent,
7262 						 root_objectid, owner, offset,
7263 						 0, BTRFS_DROP_DELAYED_REF,
7264 						 &old_ref_mod, &new_ref_mod);
7265 	}
7266 
7267 	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7268 		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
7269 
7270 		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7271 	}
7272 
7273 	return ret;
7274 }
7275 
7276 /*
7277  * when we wait for progress in the block group caching, it's because
7278  * our allocation attempt failed at least once.  So, we must sleep
7279  * and let some progress happen before we try again.
7280  *
7281  * This function will sleep at least once waiting for new free space to
7282  * show up, and then it will check the block group free space numbers
7283  * for our min num_bytes.  Another option is to have it go ahead
7284  * and look in the rbtree for a free extent of a given size, but this
7285  * is a good start.
7286  *
7287  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7288  * any of the information in this block group.
7289  */
7290 static noinline void
7291 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7292 				u64 num_bytes)
7293 {
7294 	struct btrfs_caching_control *caching_ctl;
7295 
7296 	caching_ctl = get_caching_control(cache);
7297 	if (!caching_ctl)
7298 		return;
7299 
7300 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7301 		   (cache->free_space_ctl->free_space >= num_bytes));
7302 
7303 	put_caching_control(caching_ctl);
7304 }
7305 
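/*
 * Wait for the caching of a block group to finish completely.  Returns -EIO
 * if the caching thread failed, 0 otherwise.
 */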
7306 static noinline int
7307 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7308 {
7309 	struct btrfs_caching_control *caching_ctl;
7310 	int ret = 0;
7311 
7312 	caching_ctl = get_caching_control(cache);
7313 	if (!caching_ctl)
7314 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7315 
7316 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
7317 	if (cache->cached == BTRFS_CACHE_ERROR)
7318 		ret = -EIO;
7319 	put_caching_control(caching_ctl);
7320 	return ret;
7321 }
7322 
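/*
 * Escalation stages for find_free_extent().  Each pass through the search
 * loop relaxes the constraints a bit: first only cached block groups are
 * tried without waiting, then we wait for caching to make progress, then a
 * new chunk is force allocated, and finally the empty_size/empty_cluster
 * padding is dropped before giving up with -ENOSPC.
 */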
7323 enum btrfs_loop_type {
7324 	LOOP_CACHING_NOWAIT = 0,
7325 	LOOP_CACHING_WAIT = 1,
7326 	LOOP_ALLOC_CHUNK = 2,
7327 	LOOP_NO_EMPTY_SIZE = 3,
7328 };
7329 
7330 static inline void
7331 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7332 		       int delalloc)
7333 {
7334 	if (delalloc)
7335 		down_read(&cache->data_rwsem);
7336 }
7337 
7338 static inline void
7339 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7340 		       int delalloc)
7341 {
7342 	btrfs_get_block_group(cache);
7343 	if (delalloc)
7344 		down_read(&cache->data_rwsem);
7345 }
7346 
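/*
 * Return the block group currently backing @cluster (NULL if there is none),
 * with the refill_lock held.  If it is not @block_group itself, a reference
 * is taken on it and, for delalloc allocations, its data_rwsem is acquired as
 * well; the loop handles the case where the cluster switches to another block
 * group while the refill_lock is dropped to wait on the rwsem.
 */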
7347 static struct btrfs_block_group_cache *
7348 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7349 		   struct btrfs_free_cluster *cluster,
7350 		   int delalloc)
7351 {
7352 	struct btrfs_block_group_cache *used_bg = NULL;
7353 
7354 	spin_lock(&cluster->refill_lock);
7355 	while (1) {
7356 		used_bg = cluster->block_group;
7357 		if (!used_bg)
7358 			return NULL;
7359 
7360 		if (used_bg == block_group)
7361 			return used_bg;
7362 
7363 		btrfs_get_block_group(used_bg);
7364 
7365 		if (!delalloc)
7366 			return used_bg;
7367 
7368 		if (down_read_trylock(&used_bg->data_rwsem))
7369 			return used_bg;
7370 
7371 		spin_unlock(&cluster->refill_lock);
7372 
7373 		/* We should only have one-level nested. */
7374 		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7375 
7376 		spin_lock(&cluster->refill_lock);
7377 		if (used_bg == cluster->block_group)
7378 			return used_bg;
7379 
7380 		up_read(&used_bg->data_rwsem);
7381 		btrfs_put_block_group(used_bg);
7382 	}
7383 }
7384 
7385 static inline void
7386 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7387 			 int delalloc)
7388 {
7389 	if (delalloc)
7390 		up_read(&cache->data_rwsem);
7391 	btrfs_put_block_group(cache);
7392 }
7393 
7394 /*
7395  * walks the btree of allocated extents and finds a hole of a given size.
7396  * The key ins is changed to record the hole:
7397  * ins->objectid == start position
7398  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7399  * ins->offset == the size of the hole.
7400  * Any available blocks before search_start are skipped.
7401  *
7402  * If there is no suitable free space, we record the max size of the
7403  * largest free space extent we found, so the caller can retry smaller.
7404  */
7405 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7406 				u64 ram_bytes, u64 num_bytes, u64 empty_size,
7407 				u64 hint_byte, struct btrfs_key *ins,
7408 				u64 flags, int delalloc)
7409 {
7410 	int ret = 0;
7411 	struct btrfs_root *root = fs_info->extent_root;
7412 	struct btrfs_free_cluster *last_ptr = NULL;
7413 	struct btrfs_block_group_cache *block_group = NULL;
7414 	u64 search_start = 0;
7415 	u64 max_extent_size = 0;
7416 	u64 empty_cluster = 0;
7417 	struct btrfs_space_info *space_info;
7418 	int loop = 0;
7419 	int index = btrfs_bg_flags_to_raid_index(flags);
7420 	bool failed_cluster_refill = false;
7421 	bool failed_alloc = false;
7422 	bool use_cluster = true;
7423 	bool have_caching_bg = false;
7424 	bool orig_have_caching_bg = false;
7425 	bool full_search = false;
7426 
7427 	WARN_ON(num_bytes < fs_info->sectorsize);
7428 	ins->type = BTRFS_EXTENT_ITEM_KEY;
7429 	ins->objectid = 0;
7430 	ins->offset = 0;
7431 
7432 	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7433 
7434 	space_info = __find_space_info(fs_info, flags);
7435 	if (!space_info) {
7436 		btrfs_err(fs_info, "No space info for %llu", flags);
7437 		return -ENOSPC;
7438 	}
7439 
7440 	/*
7441 	 * If our free space is heavily fragmented we may not be able to make
7442 	 * big contiguous allocations, so instead of doing the expensive search
7443 	 * for free space, simply return ENOSPC with our max_extent_size so we
7444 	 * can go ahead and search for a more manageable chunk.
7445 	 *
7446 	 * If our max_extent_size is large enough for our allocation simply
7447 	 * disable clustering since we will likely not be able to find enough
7448 	 * space to create a cluster and induce latency trying.
7449 	 */
7450 	if (unlikely(space_info->max_extent_size)) {
7451 		spin_lock(&space_info->lock);
7452 		if (space_info->max_extent_size &&
7453 		    num_bytes > space_info->max_extent_size) {
7454 			ins->offset = space_info->max_extent_size;
7455 			spin_unlock(&space_info->lock);
7456 			return -ENOSPC;
7457 		} else if (space_info->max_extent_size) {
7458 			use_cluster = false;
7459 		}
7460 		spin_unlock(&space_info->lock);
7461 	}
7462 
7463 	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7464 	if (last_ptr) {
7465 		spin_lock(&last_ptr->lock);
7466 		if (last_ptr->block_group)
7467 			hint_byte = last_ptr->window_start;
7468 		if (last_ptr->fragmented) {
7469 			/*
7470 			 * We still set window_start so we can keep track of the
7471 			 * last place we found an allocation to try and save
7472 			 * some time.
7473 			 */
7474 			hint_byte = last_ptr->window_start;
7475 			use_cluster = false;
7476 		}
7477 		spin_unlock(&last_ptr->lock);
7478 	}
7479 
7480 	search_start = max(search_start, first_logical_byte(fs_info, 0));
7481 	search_start = max(search_start, hint_byte);
7482 	if (search_start == hint_byte) {
7483 		block_group = btrfs_lookup_block_group(fs_info, search_start);
7484 		/*
7485 		 * we don't want to use the block group if it doesn't match our
7486 		 * allocation bits, or if it's not cached.
7487 		 *
7488 		 * However if we are re-searching with an ideal block group
7489 		 * picked out then we don't care that the block group is cached.
7490 		 */
7491 		if (block_group && block_group_bits(block_group, flags) &&
7492 		    block_group->cached != BTRFS_CACHE_NO) {
7493 			down_read(&space_info->groups_sem);
7494 			if (list_empty(&block_group->list) ||
7495 			    block_group->ro) {
7496 				/*
7497 				 * someone is removing this block group,
7498 				 * we can't jump into the have_block_group
7499 				 * target because our list pointers are not
7500 				 * valid
7501 				 */
7502 				btrfs_put_block_group(block_group);
7503 				up_read(&space_info->groups_sem);
7504 			} else {
7505 				index = btrfs_bg_flags_to_raid_index(
7506 						block_group->flags);
7507 				btrfs_lock_block_group(block_group, delalloc);
7508 				goto have_block_group;
7509 			}
7510 		} else if (block_group) {
7511 			btrfs_put_block_group(block_group);
7512 		}
7513 	}
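	/*
	 * Main search loop: walk every block group of the current raid index,
	 * trying the cluster allocator first (when enabled) and falling back
	 * to an unclustered allocation from the free space cache.  We come
	 * back here with progressively weaker constraints (see enum
	 * btrfs_loop_type) until an extent is found or we run out of options.
	 */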
7514 search:
7515 	have_caching_bg = false;
7516 	if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
7517 		full_search = true;
7518 	down_read(&space_info->groups_sem);
7519 	list_for_each_entry(block_group, &space_info->block_groups[index],
7520 			    list) {
7521 		u64 offset;
7522 		int cached;
7523 
7524 		/* If the block group is read-only, we can skip it entirely. */
7525 		if (unlikely(block_group->ro))
7526 			continue;
7527 
7528 		btrfs_grab_block_group(block_group, delalloc);
7529 		search_start = block_group->key.objectid;
7530 
7531 		/*
7532 		 * this can happen if we end up cycling through all the
7533 		 * raid types, but we want to make sure we only allocate
7534 		 * for the proper type.
7535 		 */
7536 		if (!block_group_bits(block_group, flags)) {
7537 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
7538 				    BTRFS_BLOCK_GROUP_RAID1 |
7539 				    BTRFS_BLOCK_GROUP_RAID5 |
7540 				    BTRFS_BLOCK_GROUP_RAID6 |
7541 				    BTRFS_BLOCK_GROUP_RAID10;
7542 
7543 			/*
7544 			 * if they asked for extra copies and this block group
7545 			 * doesn't provide them, bail.  This does allow us to
7546 			 * fill raid0 from raid1.
7547 			 */
7548 			if ((flags & extra) && !(block_group->flags & extra))
7549 				goto loop;
7550 		}
7551 
7552 have_block_group:
7553 		cached = block_group_cache_done(block_group);
7554 		if (unlikely(!cached)) {
7555 			have_caching_bg = true;
7556 			ret = cache_block_group(block_group, 0);
7557 			BUG_ON(ret < 0);
7558 			ret = 0;
7559 		}
7560 
7561 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7562 			goto loop;
7563 
7564 		/*
7565 		 * OK, we want to try and use the cluster allocator, so
7566 		 * let's look there
7567 		 */
7568 		if (last_ptr && use_cluster) {
7569 			struct btrfs_block_group_cache *used_block_group;
7570 			unsigned long aligned_cluster;
7571 			/*
7572 			 * the refill lock keeps out other
7573 			 * people trying to start a new cluster
7574 			 */
7575 			used_block_group = btrfs_lock_cluster(block_group,
7576 							      last_ptr,
7577 							      delalloc);
7578 			if (!used_block_group)
7579 				goto refill_cluster;
7580 
7581 			if (used_block_group != block_group &&
7582 			    (used_block_group->ro ||
7583 			     !block_group_bits(used_block_group, flags)))
7584 				goto release_cluster;
7585 
7586 			offset = btrfs_alloc_from_cluster(used_block_group,
7587 						last_ptr,
7588 						num_bytes,
7589 						used_block_group->key.objectid,
7590 						&max_extent_size);
7591 			if (offset) {
7592 				/* we have a block, we're done */
7593 				spin_unlock(&last_ptr->refill_lock);
7594 				trace_btrfs_reserve_extent_cluster(
7595 						used_block_group,
7596 						search_start, num_bytes);
7597 				if (used_block_group != block_group) {
7598 					btrfs_release_block_group(block_group,
7599 								  delalloc);
7600 					block_group = used_block_group;
7601 				}
7602 				goto checks;
7603 			}
7604 
7605 			WARN_ON(last_ptr->block_group != used_block_group);
7606 release_cluster:
7607 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
7608 			 * set up a new cluster, so let's just skip it
7609 			 * and let the allocator find whatever block
7610 			 * it can find.  If we reach this point, we
7611 			 * will have tried the cluster allocator
7612 			 * plenty of times and not have found
7613 			 * anything, so we are likely way too
7614 			 * fragmented for the clustering stuff to find
7615 			 * anything.
7616 			 *
7617 			 * However, if the cluster is taken from the
7618 			 * current block group, release the cluster
7619 			 * first, so that we stand a better chance of
7620 			 * succeeding in the unclustered
7621 			 * allocation.  */
7622 			if (loop >= LOOP_NO_EMPTY_SIZE &&
7623 			    used_block_group != block_group) {
7624 				spin_unlock(&last_ptr->refill_lock);
7625 				btrfs_release_block_group(used_block_group,
7626 							  delalloc);
7627 				goto unclustered_alloc;
7628 			}
7629 
7630 			/*
7631 			 * this cluster didn't work out, free it and
7632 			 * start over
7633 			 */
7634 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7635 
7636 			if (used_block_group != block_group)
7637 				btrfs_release_block_group(used_block_group,
7638 							  delalloc);
7639 refill_cluster:
7640 			if (loop >= LOOP_NO_EMPTY_SIZE) {
7641 				spin_unlock(&last_ptr->refill_lock);
7642 				goto unclustered_alloc;
7643 			}
7644 
7645 			aligned_cluster = max_t(unsigned long,
7646 						empty_cluster + empty_size,
7647 					      block_group->full_stripe_len);
7648 
7649 			/* allocate a cluster in this block group */
7650 			ret = btrfs_find_space_cluster(fs_info, block_group,
7651 						       last_ptr, search_start,
7652 						       num_bytes,
7653 						       aligned_cluster);
7654 			if (ret == 0) {
7655 				/*
7656 				 * now pull our allocation out of this
7657 				 * cluster
7658 				 */
7659 				offset = btrfs_alloc_from_cluster(block_group,
7660 							last_ptr,
7661 							num_bytes,
7662 							search_start,
7663 							&max_extent_size);
7664 				if (offset) {
7665 					/* we found one, proceed */
7666 					spin_unlock(&last_ptr->refill_lock);
7667 					trace_btrfs_reserve_extent_cluster(
7668 						block_group, search_start,
7669 						num_bytes);
7670 					goto checks;
7671 				}
7672 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
7673 				   && !failed_cluster_refill) {
7674 				spin_unlock(&last_ptr->refill_lock);
7675 
7676 				failed_cluster_refill = true;
7677 				wait_block_group_cache_progress(block_group,
7678 				       num_bytes + empty_cluster + empty_size);
7679 				goto have_block_group;
7680 			}
7681 
7682 			/*
7683 			 * at this point we either didn't find a cluster
7684 			 * or we weren't able to allocate a block from our
7685 			 * cluster.  Free the cluster we've been trying
7686 			 * to use, and go to the next block group
7687 			 */
7688 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7689 			spin_unlock(&last_ptr->refill_lock);
7690 			goto loop;
7691 		}
7692 
7693 unclustered_alloc:
7694 		/*
7695 		 * We are doing an unclustered alloc, set the fragmented flag so
7696 		 * we don't bother trying to set up a cluster again until we get
7697 		 * more space.
7698 		 */
7699 		if (unlikely(last_ptr)) {
7700 			spin_lock(&last_ptr->lock);
7701 			last_ptr->fragmented = 1;
7702 			spin_unlock(&last_ptr->lock);
7703 		}
7704 		if (cached) {
7705 			struct btrfs_free_space_ctl *ctl =
7706 				block_group->free_space_ctl;
7707 
7708 			spin_lock(&ctl->tree_lock);
7709 			if (ctl->free_space <
7710 			    num_bytes + empty_cluster + empty_size) {
7711 				if (ctl->free_space > max_extent_size)
7712 					max_extent_size = ctl->free_space;
7713 				spin_unlock(&ctl->tree_lock);
7714 				goto loop;
7715 			}
7716 			spin_unlock(&ctl->tree_lock);
7717 		}
7718 
7719 		offset = btrfs_find_space_for_alloc(block_group, search_start,
7720 						    num_bytes, empty_size,
7721 						    &max_extent_size);
7722 		/*
7723 		 * If we didn't find a chunk, and we haven't failed on this
7724 		 * block group before, and this block group is in the middle of
7725 		 * caching and we are ok with waiting, then go ahead and wait
7726 		 * for progress to be made, and set failed_alloc to true.
7727 		 *
7728 		 * If failed_alloc is true then we've already waited on this
7729 		 * block group once and should move on to the next block group.
7730 		 */
7731 		if (!offset && !failed_alloc && !cached &&
7732 		    loop > LOOP_CACHING_NOWAIT) {
7733 			wait_block_group_cache_progress(block_group,
7734 						num_bytes + empty_size);
7735 			failed_alloc = true;
7736 			goto have_block_group;
7737 		} else if (!offset) {
7738 			goto loop;
7739 		}
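		/*
		 * We have a candidate offset inside this block group: align
		 * it, make sure the aligned extent still fits inside the
		 * block group and reserve the bytes before returning it in
		 * @ins.
		 */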
7740 checks:
7741 		search_start = ALIGN(offset, fs_info->stripesize);
7742 
7743 		/* move on to the next group */
7744 		if (search_start + num_bytes >
7745 		    block_group->key.objectid + block_group->key.offset) {
7746 			btrfs_add_free_space(block_group, offset, num_bytes);
7747 			goto loop;
7748 		}
7749 
7750 		if (offset < search_start)
7751 			btrfs_add_free_space(block_group, offset,
7752 					     search_start - offset);
7753 		BUG_ON(offset > search_start);
7754 
7755 		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7756 				num_bytes, delalloc);
7757 		if (ret == -EAGAIN) {
7758 			btrfs_add_free_space(block_group, offset, num_bytes);
7759 			goto loop;
7760 		}
7761 		btrfs_inc_block_group_reservations(block_group);
7762 
7763 		/* we are all good, let's return */
7764 		ins->objectid = search_start;
7765 		ins->offset = num_bytes;
7766 
7767 		trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
7768 		btrfs_release_block_group(block_group, delalloc);
7769 		break;
7770 loop:
7771 		failed_cluster_refill = false;
7772 		failed_alloc = false;
7773 		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7774 		       index);
7775 		btrfs_release_block_group(block_group, delalloc);
7776 		cond_resched();
7777 	}
7778 	up_read(&space_info->groups_sem);
7779 
7780 	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7781 		&& !orig_have_caching_bg)
7782 		orig_have_caching_bg = true;
7783 
7784 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7785 		goto search;
7786 
7787 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7788 		goto search;
7789 
7790 	/*
7791 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7792 	 *			caching kthreads as we move along
7793 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7794 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7795 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7796 	 *			again
7797 	 */
7798 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7799 		index = 0;
7800 		if (loop == LOOP_CACHING_NOWAIT) {
7801 			/*
7802 			 * We want to skip the LOOP_CACHING_WAIT step if we
7803 			 * don't have any uncached bgs and we've already done a
7804 			 * full search through.
7805 			 */
7806 			if (orig_have_caching_bg || !full_search)
7807 				loop = LOOP_CACHING_WAIT;
7808 			else
7809 				loop = LOOP_ALLOC_CHUNK;
7810 		} else {
7811 			loop++;
7812 		}
7813 
7814 		if (loop == LOOP_ALLOC_CHUNK) {
7815 			struct btrfs_trans_handle *trans;
7816 			int exist = 0;
7817 
7818 			trans = current->journal_info;
7819 			if (trans)
7820 				exist = 1;
7821 			else
7822 				trans = btrfs_join_transaction(root);
7823 
7824 			if (IS_ERR(trans)) {
7825 				ret = PTR_ERR(trans);
7826 				goto out;
7827 			}
7828 
7829 			ret = do_chunk_alloc(trans, fs_info, flags,
7830 					     CHUNK_ALLOC_FORCE);
7831 
7832 			/*
7833 			 * If we can't allocate a new chunk, we've already looped
7834 			 * through at least once, so move on to the NO_EMPTY_SIZE
7835 			 * case.
7836 			 */
7837 			if (ret == -ENOSPC)
7838 				loop = LOOP_NO_EMPTY_SIZE;
7839 
7840 			/*
7841 			 * Do not bail out on ENOSPC since we
7842 			 * can do more things.
7843 			 */
7844 			if (ret < 0 && ret != -ENOSPC)
7845 				btrfs_abort_transaction(trans, ret);
7846 			else
7847 				ret = 0;
7848 			if (!exist)
7849 				btrfs_end_transaction(trans);
7850 			if (ret)
7851 				goto out;
7852 		}
7853 
7854 		if (loop == LOOP_NO_EMPTY_SIZE) {
7855 			/*
7856 			 * Don't loop again if we already have no empty_size and
7857 			 * no empty_cluster.
7858 			 */
7859 			if (empty_size == 0 &&
7860 			    empty_cluster == 0) {
7861 				ret = -ENOSPC;
7862 				goto out;
7863 			}
7864 			empty_size = 0;
7865 			empty_cluster = 0;
7866 		}
7867 
7868 		goto search;
7869 	} else if (!ins->objectid) {
7870 		ret = -ENOSPC;
7871 	} else if (ins->objectid) {
7872 		if (!use_cluster && last_ptr) {
7873 			spin_lock(&last_ptr->lock);
7874 			last_ptr->window_start = ins->objectid;
7875 			spin_unlock(&last_ptr->lock);
7876 		}
7877 		ret = 0;
7878 	}
7879 out:
7880 	if (ret == -ENOSPC) {
7881 		spin_lock(&space_info->lock);
7882 		space_info->max_extent_size = max_extent_size;
7883 		spin_unlock(&space_info->lock);
7884 		ins->offset = max_extent_size;
7885 	}
7886 	return ret;
7887 }
7888 
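/*
 * Dump the counters of a space_info and, optionally, every block group in it
 * together with its free space entries.  Used when debugging ENOSPC
 * conditions.
 */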
7889 static void dump_space_info(struct btrfs_fs_info *fs_info,
7890 			    struct btrfs_space_info *info, u64 bytes,
7891 			    int dump_block_groups)
7892 {
7893 	struct btrfs_block_group_cache *cache;
7894 	int index = 0;
7895 
7896 	spin_lock(&info->lock);
7897 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7898 		   info->flags,
7899 		   info->total_bytes - btrfs_space_info_used(info, true),
7900 		   info->full ? "" : "not ");
7901 	btrfs_info(fs_info,
7902 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7903 		info->total_bytes, info->bytes_used, info->bytes_pinned,
7904 		info->bytes_reserved, info->bytes_may_use,
7905 		info->bytes_readonly);
7906 	spin_unlock(&info->lock);
7907 
7908 	if (!dump_block_groups)
7909 		return;
7910 
7911 	down_read(&info->groups_sem);
7912 again:
7913 	list_for_each_entry(cache, &info->block_groups[index], list) {
7914 		spin_lock(&cache->lock);
7915 		btrfs_info(fs_info,
7916 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7917 			cache->key.objectid, cache->key.offset,
7918 			btrfs_block_group_used(&cache->item), cache->pinned,
7919 			cache->reserved, cache->ro ? "[readonly]" : "");
7920 		btrfs_dump_free_space(cache, bytes);
7921 		spin_unlock(&cache->lock);
7922 	}
7923 	if (++index < BTRFS_NR_RAID_TYPES)
7924 		goto again;
7925 	up_read(&info->groups_sem);
7926 }
7927 
7928 /*
7929  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
7930  *			  hole that is at least as big as @num_bytes.
7931  *
7932  * @root           -	The root that will contain this extent
7933  *
7934  * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
7935  *			is used for accounting purposes. This value differs
7936  *			from @num_bytes only in the case of compressed extents.
7937  *
7938  * @num_bytes      -	Number of bytes to allocate on-disk.
7939  *
7940  * @min_alloc_size -	Indicates the minimum amount of space that the
7941  *			allocator should try to satisfy. In some cases
7942  *			@num_bytes may be larger than what is required and if
7943  *			the filesystem is fragmented then allocation fails.
7944  *			However, the presence of @min_alloc_size gives a
7945  *			chance to try and satisfy the smaller allocation.
7946  *
7947  * @empty_size     -	A hint that you plan on doing more COW. This is the
7948  *			size in bytes the allocator should try to find free
7949  *			next to the block it returns.  This is just a hint and
7950  *			may be ignored by the allocator.
7951  *
7952  * @hint_byte      -	Hint to the allocator to start searching above the byte
7953  *			address passed. It might be ignored.
7954  *
7955  * @ins            -	This key is modified to record the found hole. It will
7956  *			have the following values:
7957  *			ins->objectid == start position
7958  *			ins->flags = BTRFS_EXTENT_ITEM_KEY
7959  *			ins->offset == the size of the hole.
7960  *
7961  * @is_data        -	Boolean flag indicating whether an extent is
7962  *			allocated for data (true) or metadata (false)
7963  *
7964  * @delalloc       -	Boolean flag indicating whether this allocation is for
7965  *			delalloc or not. If 'true' data_rwsem of block groups
7966  *			is going to be acquired.
7967  *
7968  *
7969  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
7970  * case -ENOSPC is returned then @ins->offset will contain the size of the
7971  * largest available hole the allocator managed to find.
7972  */
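/*
 * A minimal usage sketch (hypothetical caller, error handling elided), just
 * to show how the arguments fit together; it mirrors the metadata allocation
 * done by btrfs_alloc_tree_block() further below:
 *
 *	struct btrfs_key ins;
 *	int ret = btrfs_reserve_extent(root, fs_info->nodesize,
 *				       fs_info->nodesize, fs_info->nodesize,
 *				       0, 0, &ins, 0, 0);
 *
 * On success ins.objectid is the start of the reserved range and ins.offset
 * its length; on -ENOSPC ins.offset holds the largest hole that was found.
 */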
7973 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7974 			 u64 num_bytes, u64 min_alloc_size,
7975 			 u64 empty_size, u64 hint_byte,
7976 			 struct btrfs_key *ins, int is_data, int delalloc)
7977 {
7978 	struct btrfs_fs_info *fs_info = root->fs_info;
7979 	bool final_tried = num_bytes == min_alloc_size;
7980 	u64 flags;
7981 	int ret;
7982 
7983 	flags = get_alloc_profile_by_root(root, is_data);
7984 again:
7985 	WARN_ON(num_bytes < fs_info->sectorsize);
7986 	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
7987 			       hint_byte, ins, flags, delalloc);
7988 	if (!ret && !is_data) {
7989 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7990 	} else if (ret == -ENOSPC) {
7991 		if (!final_tried && ins->offset) {
7992 			num_bytes = min(num_bytes >> 1, ins->offset);
7993 			num_bytes = round_down(num_bytes,
7994 					       fs_info->sectorsize);
7995 			num_bytes = max(num_bytes, min_alloc_size);
7996 			ram_bytes = num_bytes;
7997 			if (num_bytes == min_alloc_size)
7998 				final_tried = true;
7999 			goto again;
8000 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8001 			struct btrfs_space_info *sinfo;
8002 
8003 			sinfo = __find_space_info(fs_info, flags);
8004 			btrfs_err(fs_info,
8005 				  "allocation failed flags %llu, wanted %llu",
8006 				  flags, num_bytes);
8007 			if (sinfo)
8008 				dump_space_info(fs_info, sinfo, num_bytes, 1);
8009 		}
8010 	}
8011 
8012 	return ret;
8013 }
8014 
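/*
 * Give back an extent that was reserved by the allocator but not used.  With
 * @pin set the range is pinned instead of being freed immediately, otherwise
 * it is (optionally) discarded and returned to the free space cache right
 * away.
 */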
8015 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8016 					u64 start, u64 len,
8017 					int pin, int delalloc)
8018 {
8019 	struct btrfs_block_group_cache *cache;
8020 	int ret = 0;
8021 
8022 	cache = btrfs_lookup_block_group(fs_info, start);
8023 	if (!cache) {
8024 		btrfs_err(fs_info, "Unable to find block group for %llu",
8025 			  start);
8026 		return -ENOSPC;
8027 	}
8028 
8029 	if (pin)
8030 		pin_down_extent(fs_info, cache, start, len, 1);
8031 	else {
8032 		if (btrfs_test_opt(fs_info, DISCARD))
8033 			ret = btrfs_discard_extent(fs_info, start, len, NULL);
8034 		btrfs_add_free_space(cache, start, len);
8035 		btrfs_free_reserved_bytes(cache, len, delalloc);
8036 		trace_btrfs_reserved_extent_free(fs_info, start, len);
8037 	}
8038 
8039 	btrfs_put_block_group(cache);
8040 	return ret;
8041 }
8042 
8043 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8044 			       u64 start, u64 len, int delalloc)
8045 {
8046 	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8047 }
8048 
8049 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8050 				       u64 start, u64 len)
8051 {
8052 	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8053 }
8054 
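/*
 * Insert the extent item for a newly allocated data extent with a single
 * inline backref (shared or keyed, depending on @parent), then update the
 * free space tree and the block group usage counters.
 */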
8055 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8056 				      struct btrfs_fs_info *fs_info,
8057 				      u64 parent, u64 root_objectid,
8058 				      u64 flags, u64 owner, u64 offset,
8059 				      struct btrfs_key *ins, int ref_mod)
8060 {
8061 	int ret;
8062 	struct btrfs_extent_item *extent_item;
8063 	struct btrfs_extent_inline_ref *iref;
8064 	struct btrfs_path *path;
8065 	struct extent_buffer *leaf;
8066 	int type;
8067 	u32 size;
8068 
8069 	if (parent > 0)
8070 		type = BTRFS_SHARED_DATA_REF_KEY;
8071 	else
8072 		type = BTRFS_EXTENT_DATA_REF_KEY;
8073 
8074 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8075 
8076 	path = btrfs_alloc_path();
8077 	if (!path)
8078 		return -ENOMEM;
8079 
8080 	path->leave_spinning = 1;
8081 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8082 				      ins, size);
8083 	if (ret) {
8084 		btrfs_free_path(path);
8085 		return ret;
8086 	}
8087 
8088 	leaf = path->nodes[0];
8089 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8090 				     struct btrfs_extent_item);
8091 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8092 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8093 	btrfs_set_extent_flags(leaf, extent_item,
8094 			       flags | BTRFS_EXTENT_FLAG_DATA);
8095 
8096 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8097 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
8098 	if (parent > 0) {
8099 		struct btrfs_shared_data_ref *ref;
8100 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
8101 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8102 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8103 	} else {
8104 		struct btrfs_extent_data_ref *ref;
8105 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8106 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8107 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8108 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8109 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8110 	}
8111 
8112 	btrfs_mark_buffer_dirty(path->nodes[0]);
8113 	btrfs_free_path(path);
8114 
8115 	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8116 	if (ret)
8117 		return ret;
8118 
8119 	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8120 	if (ret) { /* -ENOENT, logic error */
8121 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8122 			ins->objectid, ins->offset);
8123 		BUG();
8124 	}
8125 	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8126 	return ret;
8127 }
8128 
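/*
 * Insert the extent item for a newly allocated tree block.  With the
 * SKINNY_METADATA feature the level is stored in the key offset and no
 * btrfs_tree_block_info is embedded; either way a single inline tree block
 * backref is added, and the free space tree and block group counters are
 * updated afterwards.
 */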
8129 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8130 				     struct btrfs_delayed_ref_node *node,
8131 				     struct btrfs_delayed_extent_op *extent_op)
8132 {
8133 	struct btrfs_fs_info *fs_info = trans->fs_info;
8134 	int ret;
8135 	struct btrfs_extent_item *extent_item;
8136 	struct btrfs_key extent_key;
8137 	struct btrfs_tree_block_info *block_info;
8138 	struct btrfs_extent_inline_ref *iref;
8139 	struct btrfs_path *path;
8140 	struct extent_buffer *leaf;
8141 	struct btrfs_delayed_tree_ref *ref;
8142 	u32 size = sizeof(*extent_item) + sizeof(*iref);
8143 	u64 num_bytes;
8144 	u64 flags = extent_op->flags_to_set;
8145 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8146 
8147 	ref = btrfs_delayed_node_to_tree_ref(node);
8148 
8149 	extent_key.objectid = node->bytenr;
8150 	if (skinny_metadata) {
8151 		extent_key.offset = ref->level;
8152 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
8153 		num_bytes = fs_info->nodesize;
8154 	} else {
8155 		extent_key.offset = node->num_bytes;
8156 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8157 		size += sizeof(*block_info);
8158 		num_bytes = node->num_bytes;
8159 	}
8160 
8161 	path = btrfs_alloc_path();
8162 	if (!path) {
8163 		btrfs_free_and_pin_reserved_extent(fs_info,
8164 						   extent_key.objectid,
8165 						   fs_info->nodesize);
8166 		return -ENOMEM;
8167 	}
8168 
8169 	path->leave_spinning = 1;
8170 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8171 				      &extent_key, size);
8172 	if (ret) {
8173 		btrfs_free_path(path);
8174 		btrfs_free_and_pin_reserved_extent(fs_info,
8175 						   extent_key.objectid,
8176 						   fs_info->nodesize);
8177 		return ret;
8178 	}
8179 
8180 	leaf = path->nodes[0];
8181 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8182 				     struct btrfs_extent_item);
8183 	btrfs_set_extent_refs(leaf, extent_item, 1);
8184 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8185 	btrfs_set_extent_flags(leaf, extent_item,
8186 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8187 
8188 	if (skinny_metadata) {
8189 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8190 	} else {
8191 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8192 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8193 		btrfs_set_tree_block_level(leaf, block_info, ref->level);
8194 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8195 	}
8196 
8197 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8198 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8199 		btrfs_set_extent_inline_ref_type(leaf, iref,
8200 						 BTRFS_SHARED_BLOCK_REF_KEY);
8201 		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8202 	} else {
8203 		btrfs_set_extent_inline_ref_type(leaf, iref,
8204 						 BTRFS_TREE_BLOCK_REF_KEY);
8205 		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8206 	}
8207 
8208 	btrfs_mark_buffer_dirty(leaf);
8209 	btrfs_free_path(path);
8210 
8211 	ret = remove_from_free_space_tree(trans, extent_key.objectid,
8212 					  num_bytes);
8213 	if (ret)
8214 		return ret;
8215 
8216 	ret = update_block_group(trans, fs_info, extent_key.objectid,
8217 				 fs_info->nodesize, 1);
8218 	if (ret) { /* -ENOENT, logic error */
8219 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8220 			extent_key.objectid, extent_key.offset);
8221 		BUG();
8222 	}
8223 
8224 	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8225 					  fs_info->nodesize);
8226 	return ret;
8227 }
8228 
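/*
 * Queue a delayed ref that will insert the extent item for a newly allocated
 * data extent described by @ins.
 */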
8229 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8230 				     struct btrfs_root *root, u64 owner,
8231 				     u64 offset, u64 ram_bytes,
8232 				     struct btrfs_key *ins)
8233 {
8234 	struct btrfs_fs_info *fs_info = root->fs_info;
8235 	int ret;
8236 
8237 	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8238 
8239 	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8240 			   root->root_key.objectid, owner, offset,
8241 			   BTRFS_ADD_DELAYED_EXTENT);
8242 
8243 	ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8244 					 ins->offset, 0,
8245 					 root->root_key.objectid, owner,
8246 					 offset, ram_bytes,
8247 					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8248 	return ret;
8249 }
8250 
8251 /*
8252  * this is used by the tree logging recovery code.  It records that
8253  * an extent has been allocated and makes sure to clear the free
8254  * space cache bits as well
8255  */
8256 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8257 				   struct btrfs_fs_info *fs_info,
8258 				   u64 root_objectid, u64 owner, u64 offset,
8259 				   struct btrfs_key *ins)
8260 {
8261 	int ret;
8262 	struct btrfs_block_group_cache *block_group;
8263 	struct btrfs_space_info *space_info;
8264 
8265 	/*
8266 	 * Mixed block groups will have their extents excluded before the log is
8267 	 * processed, so we only need to do the exclude dance if this fs isn't mixed.
8268 	 */
8269 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8270 		ret = __exclude_logged_extent(fs_info, ins->objectid,
8271 					      ins->offset);
8272 		if (ret)
8273 			return ret;
8274 	}
8275 
8276 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8277 	if (!block_group)
8278 		return -EINVAL;
8279 
8280 	space_info = block_group->space_info;
8281 	spin_lock(&space_info->lock);
8282 	spin_lock(&block_group->lock);
8283 	space_info->bytes_reserved += ins->offset;
8284 	block_group->reserved += ins->offset;
8285 	spin_unlock(&block_group->lock);
8286 	spin_unlock(&space_info->lock);
8287 
8288 	ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
8289 					 0, owner, offset, ins, 1);
8290 	btrfs_put_block_group(block_group);
8291 	return ret;
8292 }
8293 
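/*
 * Set up the extent buffer for a freshly allocated tree block: find or create
 * it, lock it, wipe any stale content, mark it up to date and record it as
 * dirty in the right io tree (log trees track their dirty pages separately in
 * dirty_log_pages).
 */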
8294 static struct extent_buffer *
8295 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8296 		      u64 bytenr, int level)
8297 {
8298 	struct btrfs_fs_info *fs_info = root->fs_info;
8299 	struct extent_buffer *buf;
8300 
8301 	buf = btrfs_find_create_tree_block(fs_info, bytenr);
8302 	if (IS_ERR(buf))
8303 		return buf;
8304 
8305 	btrfs_set_header_generation(buf, trans->transid);
8306 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8307 	btrfs_tree_lock(buf);
8308 	clean_tree_block(fs_info, buf);
8309 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8310 
8311 	btrfs_set_lock_blocking(buf);
8312 	set_extent_buffer_uptodate(buf);
8313 
8314 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8315 		buf->log_index = root->log_transid % 2;
8316 		/*
8317 		 * we allow two log transactions at a time, use different
8318 		 * EXTENT bits to differentiate dirty pages.
8319 		 */
8320 		if (buf->log_index == 0)
8321 			set_extent_dirty(&root->dirty_log_pages, buf->start,
8322 					buf->start + buf->len - 1, GFP_NOFS);
8323 		else
8324 			set_extent_new(&root->dirty_log_pages, buf->start,
8325 					buf->start + buf->len - 1);
8326 	} else {
8327 		buf->log_index = -1;
8328 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8329 			 buf->start + buf->len - 1, GFP_NOFS);
8330 	}
8331 	trans->dirty = true;
8332 	/* this returns a buffer locked for blocking */
8333 	return buf;
8334 }
8335 
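/*
 * Pick the block reserve a new tree block should be charged to and consume
 * @blocksize bytes from it.  Falls back to reserving fresh metadata space
 * and, as a last resort, to borrowing from the global reserve when both
 * share the same space_info.
 */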
8336 static struct btrfs_block_rsv *
8337 use_block_rsv(struct btrfs_trans_handle *trans,
8338 	      struct btrfs_root *root, u32 blocksize)
8339 {
8340 	struct btrfs_fs_info *fs_info = root->fs_info;
8341 	struct btrfs_block_rsv *block_rsv;
8342 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8343 	int ret;
8344 	bool global_updated = false;
8345 
8346 	block_rsv = get_block_rsv(trans, root);
8347 
8348 	if (unlikely(block_rsv->size == 0))
8349 		goto try_reserve;
8350 again:
8351 	ret = block_rsv_use_bytes(block_rsv, blocksize);
8352 	if (!ret)
8353 		return block_rsv;
8354 
8355 	if (block_rsv->failfast)
8356 		return ERR_PTR(ret);
8357 
8358 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8359 		global_updated = true;
8360 		update_global_block_rsv(fs_info);
8361 		goto again;
8362 	}
8363 
8364 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8365 		static DEFINE_RATELIMIT_STATE(_rs,
8366 				DEFAULT_RATELIMIT_INTERVAL * 10,
8367 				/*DEFAULT_RATELIMIT_BURST*/ 1);
8368 		if (__ratelimit(&_rs))
8369 			WARN(1, KERN_DEBUG
8370 				"BTRFS: block rsv returned %d\n", ret);
8371 	}
8372 try_reserve:
8373 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8374 				     BTRFS_RESERVE_NO_FLUSH);
8375 	if (!ret)
8376 		return block_rsv;
8377 	/*
8378 	 * If we couldn't reserve metadata bytes, try to use some from the
8379 	 * global reserve if its space info is the same as the global
8380 	 * reserve's.
8381 	 */
8382 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8383 	    block_rsv->space_info == global_rsv->space_info) {
8384 		ret = block_rsv_use_bytes(global_rsv, blocksize);
8385 		if (!ret)
8386 			return global_rsv;
8387 	}
8388 	return ERR_PTR(ret);
8389 }
8390 
8391 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8392 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
8393 {
8394 	block_rsv_add_bytes(block_rsv, blocksize, 0);
8395 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8396 }
8397 
8398 /*
8399  * finds a free extent and does all the dirty work required for allocation.
8400  * Returns the tree buffer or an ERR_PTR on error.
8401  */
8402 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8403 					     struct btrfs_root *root,
8404 					     u64 parent, u64 root_objectid,
8405 					     const struct btrfs_disk_key *key,
8406 					     int level, u64 hint,
8407 					     u64 empty_size)
8408 {
8409 	struct btrfs_fs_info *fs_info = root->fs_info;
8410 	struct btrfs_key ins;
8411 	struct btrfs_block_rsv *block_rsv;
8412 	struct extent_buffer *buf;
8413 	struct btrfs_delayed_extent_op *extent_op;
8414 	u64 flags = 0;
8415 	int ret;
8416 	u32 blocksize = fs_info->nodesize;
8417 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8418 
8419 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8420 	if (btrfs_is_testing(fs_info)) {
8421 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8422 					    level);
8423 		if (!IS_ERR(buf))
8424 			root->alloc_bytenr += blocksize;
8425 		return buf;
8426 	}
8427 #endif
8428 
8429 	block_rsv = use_block_rsv(trans, root, blocksize);
8430 	if (IS_ERR(block_rsv))
8431 		return ERR_CAST(block_rsv);
8432 
8433 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8434 				   empty_size, hint, &ins, 0, 0);
8435 	if (ret)
8436 		goto out_unuse;
8437 
8438 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8439 	if (IS_ERR(buf)) {
8440 		ret = PTR_ERR(buf);
8441 		goto out_free_reserved;
8442 	}
8443 
8444 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8445 		if (parent == 0)
8446 			parent = ins.objectid;
8447 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8448 	} else
8449 		BUG_ON(parent > 0);
8450 
8451 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8452 		extent_op = btrfs_alloc_delayed_extent_op();
8453 		if (!extent_op) {
8454 			ret = -ENOMEM;
8455 			goto out_free_buf;
8456 		}
8457 		if (key)
8458 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
8459 		else
8460 			memset(&extent_op->key, 0, sizeof(extent_op->key));
8461 		extent_op->flags_to_set = flags;
8462 		extent_op->update_key = skinny_metadata ? false : true;
8463 		extent_op->update_flags = true;
8464 		extent_op->is_data = false;
8465 		extent_op->level = level;
8466 
8467 		btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8468 				   root_objectid, level, 0,
8469 				   BTRFS_ADD_DELAYED_EXTENT);
8470 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8471 						 ins.offset, parent,
8472 						 root_objectid, level,
8473 						 BTRFS_ADD_DELAYED_EXTENT,
8474 						 extent_op, NULL, NULL);
8475 		if (ret)
8476 			goto out_free_delayed;
8477 	}
8478 	return buf;
8479 
8480 out_free_delayed:
8481 	btrfs_free_delayed_extent_op(extent_op);
8482 out_free_buf:
8483 	free_extent_buffer(buf);
8484 out_free_reserved:
8485 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8486 out_unuse:
8487 	unuse_block_rsv(fs_info, block_rsv, blocksize);
8488 	return ERR_PTR(ret);
8489 }
8490 
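/*
 * State shared by the tree walking helpers below (reada_walk_down,
 * walk_down_proc, do_walk_down, ...) while dropping or updating references
 * for a subtree.
 */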
8491 struct walk_control {
8492 	u64 refs[BTRFS_MAX_LEVEL];
8493 	u64 flags[BTRFS_MAX_LEVEL];
8494 	struct btrfs_key update_progress;
8495 	int stage;
8496 	int level;
8497 	int shared_level;
8498 	int update_ref;
8499 	int keep_locks;
8500 	int reada_slot;
8501 	int reada_count;
8502 	int for_reloc;
8503 };
8504 
8505 #define DROP_REFERENCE	1
8506 #define UPDATE_BACKREF	2
8507 
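/*
 * Readahead helper for the walk: prefetch the child blocks at the current
 * level that will actually need to be visited, adjusting the readahead
 * window based on how far the walk has advanced since the last batch.
 */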
8508 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8509 				     struct btrfs_root *root,
8510 				     struct walk_control *wc,
8511 				     struct btrfs_path *path)
8512 {
8513 	struct btrfs_fs_info *fs_info = root->fs_info;
8514 	u64 bytenr;
8515 	u64 generation;
8516 	u64 refs;
8517 	u64 flags;
8518 	u32 nritems;
8519 	struct btrfs_key key;
8520 	struct extent_buffer *eb;
8521 	int ret;
8522 	int slot;
8523 	int nread = 0;
8524 
8525 	if (path->slots[wc->level] < wc->reada_slot) {
8526 		wc->reada_count = wc->reada_count * 2 / 3;
8527 		wc->reada_count = max(wc->reada_count, 2);
8528 	} else {
8529 		wc->reada_count = wc->reada_count * 3 / 2;
8530 		wc->reada_count = min_t(int, wc->reada_count,
8531 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8532 	}
8533 
8534 	eb = path->nodes[wc->level];
8535 	nritems = btrfs_header_nritems(eb);
8536 
8537 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8538 		if (nread >= wc->reada_count)
8539 			break;
8540 
8541 		cond_resched();
8542 		bytenr = btrfs_node_blockptr(eb, slot);
8543 		generation = btrfs_node_ptr_generation(eb, slot);
8544 
8545 		if (slot == path->slots[wc->level])
8546 			goto reada;
8547 
8548 		if (wc->stage == UPDATE_BACKREF &&
8549 		    generation <= root->root_key.offset)
8550 			continue;
8551 
8552 		/* We don't lock the tree block, it's OK to be racy here */
8553 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8554 					       wc->level - 1, 1, &refs,
8555 					       &flags);
8556 		/* We don't care about errors in readahead. */
8557 		if (ret < 0)
8558 			continue;
8559 		BUG_ON(refs == 0);
8560 
8561 		if (wc->stage == DROP_REFERENCE) {
8562 			if (refs == 1)
8563 				goto reada;
8564 
8565 			if (wc->level == 1 &&
8566 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8567 				continue;
8568 			if (!wc->update_ref ||
8569 			    generation <= root->root_key.offset)
8570 				continue;
8571 			btrfs_node_key_to_cpu(eb, &key, slot);
8572 			ret = btrfs_comp_cpu_keys(&key,
8573 						  &wc->update_progress);
8574 			if (ret < 0)
8575 				continue;
8576 		} else {
8577 			if (wc->level == 1 &&
8578 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8579 				continue;
8580 		}
8581 reada:
8582 		readahead_tree_block(fs_info, bytenr);
8583 		nread++;
8584 	}
8585 	wc->reada_slot = slot;
8586 }
8587 
8588 /*
8589  * helper to process tree block while walking down the tree.
8590  *
8591  * when wc->stage == UPDATE_BACKREF, this function updates
8592  * back refs for pointers in the block.
8593  *
8594  * NOTE: return value 1 means we should stop walking down.
8595  */
8596 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8597 				   struct btrfs_root *root,
8598 				   struct btrfs_path *path,
8599 				   struct walk_control *wc, int lookup_info)
8600 {
8601 	struct btrfs_fs_info *fs_info = root->fs_info;
8602 	int level = wc->level;
8603 	struct extent_buffer *eb = path->nodes[level];
8604 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8605 	int ret;
8606 
8607 	if (wc->stage == UPDATE_BACKREF &&
8608 	    btrfs_header_owner(eb) != root->root_key.objectid)
8609 		return 1;
8610 
8611 	/*
8612 	 * when the reference count of a tree block is 1, it won't increase
8613 	 * again. Once the full backref flag is set, we never clear it.
8614 	 */
8615 	if (lookup_info &&
8616 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8617 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8618 		BUG_ON(!path->locks[level]);
8619 		ret = btrfs_lookup_extent_info(trans, fs_info,
8620 					       eb->start, level, 1,
8621 					       &wc->refs[level],
8622 					       &wc->flags[level]);
8623 		BUG_ON(ret == -ENOMEM);
8624 		if (ret)
8625 			return ret;
8626 		BUG_ON(wc->refs[level] == 0);
8627 	}
8628 
8629 	if (wc->stage == DROP_REFERENCE) {
8630 		if (wc->refs[level] > 1)
8631 			return 1;
8632 
8633 		if (path->locks[level] && !wc->keep_locks) {
8634 			btrfs_tree_unlock_rw(eb, path->locks[level]);
8635 			path->locks[level] = 0;
8636 		}
8637 		return 0;
8638 	}
8639 
8640 	/* wc->stage == UPDATE_BACKREF */
8641 	if (!(wc->flags[level] & flag)) {
8642 		BUG_ON(!path->locks[level]);
8643 		ret = btrfs_inc_ref(trans, root, eb, 1);
8644 		BUG_ON(ret); /* -ENOMEM */
8645 		ret = btrfs_dec_ref(trans, root, eb, 0);
8646 		BUG_ON(ret); /* -ENOMEM */
8647 		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8648 						  eb->len, flag,
8649 						  btrfs_header_level(eb), 0);
8650 		BUG_ON(ret); /* -ENOMEM */
8651 		wc->flags[level] |= flag;
8652 	}
8653 
8654 	/*
8655 	 * the block is shared by multiple trees, so it's not good to
8656 	 * keep the tree lock
8657 	 */
8658 	if (path->locks[level] && level > 0) {
8659 		btrfs_tree_unlock_rw(eb, path->locks[level]);
8660 		path->locks[level] = 0;
8661 	}
8662 	return 0;
8663 }
8664 
8665 /*
8666  * helper to process tree block pointer.
8667  *
8668  * when wc->stage == DROP_REFERENCE, this function checks the
8669  * reference count of the block pointed to. if the block is
8670  * shared and we need to update back refs for the subtree
8671  * rooted at the block, this function changes wc->stage to
8672  * UPDATE_BACKREF. if the block is shared and there is no
8673  * need to update back refs, this function drops the
8674  * reference to the block.
8675  *
8676  * NOTE: return value 1 means we should stop walking down.
8677  */
8678 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8679 				 struct btrfs_root *root,
8680 				 struct btrfs_path *path,
8681 				 struct walk_control *wc, int *lookup_info)
8682 {
8683 	struct btrfs_fs_info *fs_info = root->fs_info;
8684 	u64 bytenr;
8685 	u64 generation;
8686 	u64 parent;
8687 	u32 blocksize;
8688 	struct btrfs_key key;
8689 	struct btrfs_key first_key;
8690 	struct extent_buffer *next;
8691 	int level = wc->level;
8692 	int reada = 0;
8693 	int ret = 0;
8694 	bool need_account = false;
8695 
8696 	generation = btrfs_node_ptr_generation(path->nodes[level],
8697 					       path->slots[level]);
8698 	/*
8699 	 * if the lower level block was created before the snapshot
8700 	 * was created, we know there is no need to update back refs
8701 	 * for the subtree
8702 	 */
8703 	if (wc->stage == UPDATE_BACKREF &&
8704 	    generation <= root->root_key.offset) {
8705 		*lookup_info = 1;
8706 		return 1;
8707 	}
8708 
8709 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8710 	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8711 			      path->slots[level]);
8712 	blocksize = fs_info->nodesize;
8713 
8714 	next = find_extent_buffer(fs_info, bytenr);
8715 	if (!next) {
8716 		next = btrfs_find_create_tree_block(fs_info, bytenr);
8717 		if (IS_ERR(next))
8718 			return PTR_ERR(next);
8719 
8720 		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8721 					       level - 1);
8722 		reada = 1;
8723 	}
8724 	btrfs_tree_lock(next);
8725 	btrfs_set_lock_blocking(next);
8726 
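	/*
	 * Check how many references the child block has before deciding
	 * whether to descend into it or just drop our own reference on it.
	 */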
8727 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8728 				       &wc->refs[level - 1],
8729 				       &wc->flags[level - 1]);
8730 	if (ret < 0)
8731 		goto out_unlock;
8732 
8733 	if (unlikely(wc->refs[level - 1] == 0)) {
8734 		btrfs_err(fs_info, "Missing references.");
8735 		ret = -EIO;
8736 		goto out_unlock;
8737 	}
8738 	*lookup_info = 0;
8739 
8740 	if (wc->stage == DROP_REFERENCE) {
8741 		if (wc->refs[level - 1] > 1) {
8742 			need_account = true;
8743 			if (level == 1 &&
8744 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8745 				goto skip;
8746 
8747 			if (!wc->update_ref ||
8748 			    generation <= root->root_key.offset)
8749 				goto skip;
8750 
8751 			btrfs_node_key_to_cpu(path->nodes[level], &key,
8752 					      path->slots[level]);
8753 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8754 			if (ret < 0)
8755 				goto skip;
8756 
8757 			wc->stage = UPDATE_BACKREF;
8758 			wc->shared_level = level - 1;
8759 		}
8760 	} else {
8761 		if (level == 1 &&
8762 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8763 			goto skip;
8764 	}
8765 
8766 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
8767 		btrfs_tree_unlock(next);
8768 		free_extent_buffer(next);
8769 		next = NULL;
8770 		*lookup_info = 1;
8771 	}
8772 
8773 	if (!next) {
8774 		if (reada && level == 1)
8775 			reada_walk_down(trans, root, wc, path);
8776 		next = read_tree_block(fs_info, bytenr, generation, level - 1,
8777 				       &first_key);
8778 		if (IS_ERR(next)) {
8779 			return PTR_ERR(next);
8780 		} else if (!extent_buffer_uptodate(next)) {
8781 			free_extent_buffer(next);
8782 			return -EIO;
8783 		}
8784 		btrfs_tree_lock(next);
8785 		btrfs_set_lock_blocking(next);
8786 	}
8787 
8788 	level--;
8789 	ASSERT(level == btrfs_header_level(next));
8790 	if (level != btrfs_header_level(next)) {
8791 		btrfs_err(root->fs_info, "mismatched level");
8792 		ret = -EIO;
8793 		goto out_unlock;
8794 	}
8795 	path->nodes[level] = next;
8796 	path->slots[level] = 0;
8797 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8798 	wc->level = level;
8799 	if (wc->level == 1)
8800 		wc->reada_slot = 0;
8801 	return 0;
8802 skip:
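	/*
	 * We are not going to descend into this block.  If we are in the
	 * DROP_REFERENCE stage, account the subtree to qgroups when needed
	 * and drop our reference on the shared child with btrfs_free_extent()
	 * below.
	 */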
8803 	wc->refs[level - 1] = 0;
8804 	wc->flags[level - 1] = 0;
8805 	if (wc->stage == DROP_REFERENCE) {
8806 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8807 			parent = path->nodes[level]->start;
8808 		} else {
8809 			ASSERT(root->root_key.objectid ==
8810 			       btrfs_header_owner(path->nodes[level]));
8811 			if (root->root_key.objectid !=
8812 			    btrfs_header_owner(path->nodes[level])) {
8813 				btrfs_err(root->fs_info,
8814 						"mismatched block owner");
8815 				ret = -EIO;
8816 				goto out_unlock;
8817 			}
8818 			parent = 0;
8819 		}
8820 
8821 		if (need_account) {
8822 			ret = btrfs_qgroup_trace_subtree(trans, root, next,
8823 							 generation, level - 1);
8824 			if (ret) {
8825 				btrfs_err_rl(fs_info,
8826 					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8827 					     ret);
8828 			}
8829 		}
8830 		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8831 					parent, root->root_key.objectid,
8832 					level - 1, 0);
8833 		if (ret)
8834 			goto out_unlock;
8835 	}
8836 
8837 	*lookup_info = 1;
8838 	ret = 1;
8839 
8840 out_unlock:
8841 	btrfs_tree_unlock(next);
8842 	free_extent_buffer(next);
8843 
8844 	return ret;
8845 }
8846 
8847 /*
8848  * helper to process tree block while walking up the tree.
8849  *
8850  * when wc->stage == DROP_REFERENCE, this function drops the
8851  * reference count on the block.
8852  *
8853  * when wc->stage == UPDATE_BACKREF, this function changes
8854  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8855  * to UPDATE_BACKREF previously while processing the block.
8856  *
8857  * NOTE: return value 1 means we should stop walking up.
8858  */
8859 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8860 				 struct btrfs_root *root,
8861 				 struct btrfs_path *path,
8862 				 struct walk_control *wc)
8863 {
8864 	struct btrfs_fs_info *fs_info = root->fs_info;
8865 	int ret;
8866 	int level = wc->level;
8867 	struct extent_buffer *eb = path->nodes[level];
8868 	u64 parent = 0;
8869 
8870 	if (wc->stage == UPDATE_BACKREF) {
8871 		BUG_ON(wc->shared_level < level);
8872 		if (level < wc->shared_level)
8873 			goto out;
8874 
8875 		ret = find_next_key(path, level + 1, &wc->update_progress);
8876 		if (ret > 0)
8877 			wc->update_ref = 0;
8878 
8879 		wc->stage = DROP_REFERENCE;
8880 		wc->shared_level = -1;
8881 		path->slots[level] = 0;
8882 
8883 		/*
8884 		 * check the reference count again if the block isn't locked.
8885 		 * we should start walking down the tree again if the
8886 		 * reference count is one.
8887 		 */
8888 		if (!path->locks[level]) {
8889 			BUG_ON(level == 0);
8890 			btrfs_tree_lock(eb);
8891 			btrfs_set_lock_blocking(eb);
8892 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8893 
8894 			ret = btrfs_lookup_extent_info(trans, fs_info,
8895 						       eb->start, level, 1,
8896 						       &wc->refs[level],
8897 						       &wc->flags[level]);
8898 			if (ret < 0) {
8899 				btrfs_tree_unlock_rw(eb, path->locks[level]);
8900 				path->locks[level] = 0;
8901 				return ret;
8902 			}
8903 			BUG_ON(wc->refs[level] == 0);
8904 			if (wc->refs[level] == 1) {
8905 				btrfs_tree_unlock_rw(eb, path->locks[level]);
8906 				path->locks[level] = 0;
8907 				return 1;
8908 			}
8909 		}
8910 	}
8911 
8912 	/* wc->stage == DROP_REFERENCE */
8913 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8914 
8915 	if (wc->refs[level] == 1) {
8916 		if (level == 0) {
8917 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8918 				ret = btrfs_dec_ref(trans, root, eb, 1);
8919 			else
8920 				ret = btrfs_dec_ref(trans, root, eb, 0);
8921 			BUG_ON(ret); /* -ENOMEM */
8922 			ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
8923 			if (ret) {
8924 				btrfs_err_rl(fs_info,
8925 					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
8926 					     ret);
8927 			}
8928 		}
8929 		/* make block locked assertion in clean_tree_block happy */
8930 		if (!path->locks[level] &&
8931 		    btrfs_header_generation(eb) == trans->transid) {
8932 			btrfs_tree_lock(eb);
8933 			btrfs_set_lock_blocking(eb);
8934 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8935 		}
8936 		clean_tree_block(fs_info, eb);
8937 	}
8938 
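	/*
	 * Work out which parent bytenr to free the block against: blocks
	 * carrying the FULL_BACKREF flag are referenced by their parent's
	 * bytenr, all others are referenced by the owning root (parent == 0).
	 */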
8939 	if (eb == root->node) {
8940 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8941 			parent = eb->start;
8942 		else
8943 			BUG_ON(root->root_key.objectid !=
8944 			       btrfs_header_owner(eb));
8945 	} else {
8946 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8947 			parent = path->nodes[level + 1]->start;
8948 		else
8949 			BUG_ON(root->root_key.objectid !=
8950 			       btrfs_header_owner(path->nodes[level + 1]));
8951 	}
8952 
8953 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8954 out:
8955 	wc->refs[level] = 0;
8956 	wc->flags[level] = 0;
8957 	return 0;
8958 }
8959 
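/*
 * walk down the tree from wc->level, processing each node with
 * walk_down_proc() and descending through block pointers with
 * do_walk_down(), until one of them tells us to stop (return value 1),
 * an error occurs, or we reach a leaf.
 */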
8960 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8961 				   struct btrfs_root *root,
8962 				   struct btrfs_path *path,
8963 				   struct walk_control *wc)
8964 {
8965 	int level = wc->level;
8966 	int lookup_info = 1;
8967 	int ret;
8968 
8969 	while (level >= 0) {
8970 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
8971 		if (ret > 0)
8972 			break;
8973 
8974 		if (level == 0)
8975 			break;
8976 
8977 		if (path->slots[level] >=
8978 		    btrfs_header_nritems(path->nodes[level]))
8979 			break;
8980 
8981 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
8982 		if (ret > 0) {
8983 			path->slots[level]++;
8984 			continue;
8985 		} else if (ret < 0)
8986 			return ret;
8987 		level = wc->level;
8988 	}
8989 	return 0;
8990 }
8991 
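/*
 * walk back up the tree, dropping and cleaning up fully processed blocks
 * with walk_up_proc() and advancing to the next slot in the parent.
 * Returns 0 when there is more of the tree to walk down, 1 once everything
 * below max_level has been processed.
 */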
8992 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8993 				 struct btrfs_root *root,
8994 				 struct btrfs_path *path,
8995 				 struct walk_control *wc, int max_level)
8996 {
8997 	int level = wc->level;
8998 	int ret;
8999 
9000 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9001 	while (level < max_level && path->nodes[level]) {
9002 		wc->level = level;
9003 		if (path->slots[level] + 1 <
9004 		    btrfs_header_nritems(path->nodes[level])) {
9005 			path->slots[level]++;
9006 			return 0;
9007 		} else {
9008 			ret = walk_up_proc(trans, root, path, wc);
9009 			if (ret > 0)
9010 				return 0;
9011 
9012 			if (path->locks[level]) {
9013 				btrfs_tree_unlock_rw(path->nodes[level],
9014 						     path->locks[level]);
9015 				path->locks[level] = 0;
9016 			}
9017 			free_extent_buffer(path->nodes[level]);
9018 			path->nodes[level] = NULL;
9019 			level++;
9020 		}
9021 	}
9022 	return 1;
9023 }
9024 
9025 /*
9026  * drop a subvolume tree.
9027  *
9028  * this function traverses the tree freeing any blocks that are only
9029  * referenced by the tree.
9030  *
9031  * when a shared tree block is found, this function decreases its
9032  * reference count by one. if update_ref is true, this function
9033  * also makes sure backrefs for the shared block and all lower level
9034  * blocks are properly updated.
9035  *
9036  * If called with for_reloc == 0, may exit early with -EAGAIN
9037  */
9038 int btrfs_drop_snapshot(struct btrfs_root *root,
9039 			 struct btrfs_block_rsv *block_rsv, int update_ref,
9040 			 int for_reloc)
9041 {
9042 	struct btrfs_fs_info *fs_info = root->fs_info;
9043 	struct btrfs_path *path;
9044 	struct btrfs_trans_handle *trans;
9045 	struct btrfs_root *tree_root = fs_info->tree_root;
9046 	struct btrfs_root_item *root_item = &root->root_item;
9047 	struct walk_control *wc;
9048 	struct btrfs_key key;
9049 	int err = 0;
9050 	int ret;
9051 	int level;
9052 	bool root_dropped = false;
9053 
9054 	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
9055 
9056 	path = btrfs_alloc_path();
9057 	if (!path) {
9058 		err = -ENOMEM;
9059 		goto out;
9060 	}
9061 
9062 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9063 	if (!wc) {
9064 		btrfs_free_path(path);
9065 		err = -ENOMEM;
9066 		goto out;
9067 	}
9068 
9069 	trans = btrfs_start_transaction(tree_root, 0);
9070 	if (IS_ERR(trans)) {
9071 		err = PTR_ERR(trans);
9072 		goto out_free;
9073 	}
9074 
9075 	if (block_rsv)
9076 		trans->block_rsv = block_rsv;
9077 
9078 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9079 		level = btrfs_header_level(root->node);
9080 		path->nodes[level] = btrfs_lock_root_node(root);
9081 		btrfs_set_lock_blocking(path->nodes[level]);
9082 		path->slots[level] = 0;
9083 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9084 		memset(&wc->update_progress, 0,
9085 		       sizeof(wc->update_progress));
9086 	} else {
9087 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9088 		memcpy(&wc->update_progress, &key,
9089 		       sizeof(wc->update_progress));
9090 
9091 		level = root_item->drop_level;
9092 		BUG_ON(level == 0);
9093 		path->lowest_level = level;
9094 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9095 		path->lowest_level = 0;
9096 		if (ret < 0) {
9097 			err = ret;
9098 			goto out_end_trans;
9099 		}
9100 		WARN_ON(ret > 0);
9101 
9102 		/*
9103 		 * unlock our path, this is safe because only this
9104 		 * function is allowed to delete this snapshot
9105 		 */
9106 		btrfs_unlock_up_safe(path, 0);
9107 
9108 		level = btrfs_header_level(root->node);
9109 		while (1) {
9110 			btrfs_tree_lock(path->nodes[level]);
9111 			btrfs_set_lock_blocking(path->nodes[level]);
9112 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9113 
9114 			ret = btrfs_lookup_extent_info(trans, fs_info,
9115 						path->nodes[level]->start,
9116 						level, 1, &wc->refs[level],
9117 						&wc->flags[level]);
9118 			if (ret < 0) {
9119 				err = ret;
9120 				goto out_end_trans;
9121 			}
9122 			BUG_ON(wc->refs[level] == 0);
9123 
9124 			if (level == root_item->drop_level)
9125 				break;
9126 
9127 			btrfs_tree_unlock(path->nodes[level]);
9128 			path->locks[level] = 0;
9129 			WARN_ON(wc->refs[level] != 1);
9130 			level--;
9131 		}
9132 	}
9133 
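	/*
	 * From here on the work is driven by walk_down_tree() and
	 * walk_up_tree().  We start in the DROP_REFERENCE stage; when
	 * update_ref is set the walk may temporarily switch to
	 * UPDATE_BACKREF for shared subtrees.
	 */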
9134 	wc->level = level;
9135 	wc->shared_level = -1;
9136 	wc->stage = DROP_REFERENCE;
9137 	wc->update_ref = update_ref;
9138 	wc->keep_locks = 0;
9139 	wc->for_reloc = for_reloc;
9140 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9141 
9142 	while (1) {
9143 
9144 		ret = walk_down_tree(trans, root, path, wc);
9145 		if (ret < 0) {
9146 			err = ret;
9147 			break;
9148 		}
9149 
9150 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9151 		if (ret < 0) {
9152 			err = ret;
9153 			break;
9154 		}
9155 
9156 		if (ret > 0) {
9157 			BUG_ON(wc->stage != DROP_REFERENCE);
9158 			break;
9159 		}
9160 
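		/*
		 * Record how far we have gotten in the root item, so the
		 * drop can be resumed from drop_progress/drop_level after a
		 * transaction commit or a remount.
		 */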
9161 		if (wc->stage == DROP_REFERENCE) {
9162 			level = wc->level;
9163 			btrfs_node_key(path->nodes[level],
9164 				       &root_item->drop_progress,
9165 				       path->slots[level]);
9166 			root_item->drop_level = level;
9167 		}
9168 
9169 		BUG_ON(wc->level == 0);
9170 		if (btrfs_should_end_transaction(trans) ||
9171 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9172 			ret = btrfs_update_root(trans, tree_root,
9173 						&root->root_key,
9174 						root_item);
9175 			if (ret) {
9176 				btrfs_abort_transaction(trans, ret);
9177 				err = ret;
9178 				goto out_end_trans;
9179 			}
9180 
9181 			btrfs_end_transaction_throttle(trans);
9182 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9183 				btrfs_debug(fs_info,
9184 					    "drop snapshot early exit");
9185 				err = -EAGAIN;
9186 				goto out_free;
9187 			}
9188 
9189 			trans = btrfs_start_transaction(tree_root, 0);
9190 			if (IS_ERR(trans)) {
9191 				err = PTR_ERR(trans);
9192 				goto out_free;
9193 			}
9194 			if (block_rsv)
9195 				trans->block_rsv = block_rsv;
9196 		}
9197 	}
9198 	btrfs_release_path(path);
9199 	if (err)
9200 		goto out_end_trans;
9201 
9202 	ret = btrfs_del_root(trans, fs_info, &root->root_key);
9203 	if (ret) {
9204 		btrfs_abort_transaction(trans, ret);
9205 		err = ret;
9206 		goto out_end_trans;
9207 	}
9208 
9209 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9210 		ret = btrfs_find_root(tree_root, &root->root_key, path,
9211 				      NULL, NULL);
9212 		if (ret < 0) {
9213 			btrfs_abort_transaction(trans, ret);
9214 			err = ret;
9215 			goto out_end_trans;
9216 		} else if (ret > 0) {
9217 			/* if we fail to delete the orphan item this time
9218 			 * around, it'll get picked up the next time.
9219 			 *
9220 			 * The most common failure here is just -ENOENT.
9221 			 */
9222 			btrfs_del_orphan_item(trans, tree_root,
9223 					      root->root_key.objectid);
9224 		}
9225 	}
9226 
9227 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9228 		btrfs_add_dropped_root(trans, root);
9229 	} else {
9230 		free_extent_buffer(root->node);
9231 		free_extent_buffer(root->commit_root);
9232 		btrfs_put_fs_root(root);
9233 	}
9234 	root_dropped = true;
9235 out_end_trans:
9236 	btrfs_end_transaction_throttle(trans);
9237 out_free:
9238 	kfree(wc);
9239 	btrfs_free_path(path);
9240 out:
9241 	/*
9242 	 * So if we need to stop dropping the snapshot for whatever reason we
9243 	 * need to make sure to add it back to the dead root list so that we
9244 	 * keep trying to do the work later.  This also cleans up roots that
9245 	 * aren't in the radix (like when we recover after a power fail or
9246 	 * unmount) so we don't leak memory.
9247 	 */
9248 	if (!for_reloc && !root_dropped)
9249 		btrfs_add_dead_root(root);
9250 	if (err && err != -EAGAIN)
9251 		btrfs_handle_fs_error(fs_info, err, NULL);
9252 	return err;
9253 }
9254 
9255 /*
9256  * drop subtree rooted at tree block 'node'.
9257  *
9258  * NOTE: this function will unlock and release tree block 'node'.
9259  * Only used by the relocation code.
9260  */
9261 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9262 			struct btrfs_root *root,
9263 			struct extent_buffer *node,
9264 			struct extent_buffer *parent)
9265 {
9266 	struct btrfs_fs_info *fs_info = root->fs_info;
9267 	struct btrfs_path *path;
9268 	struct walk_control *wc;
9269 	int level;
9270 	int parent_level;
9271 	int ret = 0;
9272 	int wret;
9273 
9274 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9275 
9276 	path = btrfs_alloc_path();
9277 	if (!path)
9278 		return -ENOMEM;
9279 
9280 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9281 	if (!wc) {
9282 		btrfs_free_path(path);
9283 		return -ENOMEM;
9284 	}
9285 
9286 	btrfs_assert_tree_locked(parent);
9287 	parent_level = btrfs_header_level(parent);
9288 	extent_buffer_get(parent);
9289 	path->nodes[parent_level] = parent;
9290 	path->slots[parent_level] = btrfs_header_nritems(parent);
9291 
9292 	btrfs_assert_tree_locked(node);
9293 	level = btrfs_header_level(node);
9294 	path->nodes[level] = node;
9295 	path->slots[level] = 0;
9296 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9297 
9298 	wc->refs[parent_level] = 1;
9299 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9300 	wc->level = level;
9301 	wc->shared_level = -1;
9302 	wc->stage = DROP_REFERENCE;
9303 	wc->update_ref = 0;
9304 	wc->keep_locks = 1;
9305 	wc->for_reloc = 1;
9306 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9307 
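	/*
	 * The parent was seeded above with a single full-backref reference,
	 * and walk_up_tree() is bounded by parent_level, so only the subtree
	 * rooted at 'node' is dropped.
	 */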
9308 	while (1) {
9309 		wret = walk_down_tree(trans, root, path, wc);
9310 		if (wret < 0) {
9311 			ret = wret;
9312 			break;
9313 		}
9314 
9315 		wret = walk_up_tree(trans, root, path, wc, parent_level);
9316 		if (wret < 0)
9317 			ret = wret;
9318 		if (wret != 0)
9319 			break;
9320 	}
9321 
9322 	kfree(wc);
9323 	btrfs_free_path(path);
9324 	return ret;
9325 }
9326 
9327 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9328 {
9329 	u64 num_devices;
9330 	u64 stripped;
9331 
9332 	/*
9333 	 * if restripe for this chunk_type is on, pick the target profile and
9334 	 * return, otherwise do the usual balance
9335 	 */
9336 	stripped = get_restripe_target(fs_info, flags);
9337 	if (stripped)
9338 		return extended_to_chunk(stripped);
9339 
9340 	num_devices = fs_info->fs_devices->rw_devices;
9341 
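	/*
	 * e.g. on a single-device filesystem raid1/raid10 is converted to
	 * dup and raid0 to single, while with multiple devices dup is
	 * promoted to raid1.
	 */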
9342 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
9343 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9344 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9345 
9346 	if (num_devices == 1) {
9347 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9348 		stripped = flags & ~stripped;
9349 
9350 		/* turn raid0 into single device chunks */
9351 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
9352 			return stripped;
9353 
9354 		/* turn mirroring into duplication */
9355 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9356 			     BTRFS_BLOCK_GROUP_RAID10))
9357 			return stripped | BTRFS_BLOCK_GROUP_DUP;
9358 	} else {
9359 		/* they already had raid on here, just return */
9360 		if (flags & stripped)
9361 			return flags;
9362 
9363 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9364 		stripped = flags & ~stripped;
9365 
9366 		/* switch duplicated blocks with raid1 */
9367 		if (flags & BTRFS_BLOCK_GROUP_DUP)
9368 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
9369 
9370 		/* this is drive concat, leave it alone */
9371 	}
9372 
9373 	return flags;
9374 }
9375 
9376 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9377 {
9378 	struct btrfs_space_info *sinfo = cache->space_info;
9379 	u64 num_bytes;
9380 	u64 min_allocable_bytes;
9381 	int ret = -ENOSPC;
9382 
9383 	/*
9384 	 * We need some metadata space and system metadata space for
9385 	 * allocating chunks in some corner cases, unless we are forced to
9386 	 * set the block group read-only.
9387 	 */
9388 	if ((sinfo->flags &
9389 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9390 	    !force)
9391 		min_allocable_bytes = SZ_1M;
9392 	else
9393 		min_allocable_bytes = 0;
9394 
9395 	spin_lock(&sinfo->lock);
9396 	spin_lock(&cache->lock);
9397 
9398 	if (cache->ro) {
9399 		cache->ro++;
9400 		ret = 0;
9401 		goto out;
9402 	}
9403 
9404 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9405 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
9406 
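	/*
	 * num_bytes is the currently free space in this block group, which
	 * becomes unusable once the group is read-only; only proceed if the
	 * rest of the space_info can absorb it.
	 */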
9407 	if (btrfs_space_info_used(sinfo, true) + num_bytes +
9408 	    min_allocable_bytes <= sinfo->total_bytes) {
9409 		sinfo->bytes_readonly += num_bytes;
9410 		cache->ro++;
9411 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9412 		ret = 0;
9413 	}
9414 out:
9415 	spin_unlock(&cache->lock);
9416 	spin_unlock(&sinfo->lock);
9417 	return ret;
9418 }
9419 
9420 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
9421 			     struct btrfs_block_group_cache *cache)
9423 {
9424 	struct btrfs_trans_handle *trans;
9425 	u64 alloc_flags;
9426 	int ret;
9427 
9428 again:
9429 	trans = btrfs_join_transaction(fs_info->extent_root);
9430 	if (IS_ERR(trans))
9431 		return PTR_ERR(trans);
9432 
9433 	/*
9434 	 * we're not allowed to set block groups readonly after the dirty
9435 	 * block groups cache has started writing.  If it already started,
9436 	 * back off and let this transaction commit
9437 	 */
9438 	mutex_lock(&fs_info->ro_block_group_mutex);
9439 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9440 		u64 transid = trans->transid;
9441 
9442 		mutex_unlock(&fs_info->ro_block_group_mutex);
9443 		btrfs_end_transaction(trans);
9444 
9445 		ret = btrfs_wait_for_commit(fs_info, transid);
9446 		if (ret)
9447 			return ret;
9448 		goto again;
9449 	}
9450 
9451 	/*
9452 	 * if we are changing raid levels, try to allocate a corresponding
9453 	 * block group with the new raid level.
9454 	 */
9455 	alloc_flags = update_block_group_flags(fs_info, cache->flags);
9456 	if (alloc_flags != cache->flags) {
9457 		ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9458 				     CHUNK_ALLOC_FORCE);
9459 		/*
9460 		 * ENOSPC is allowed here, we may have enough space
9461 		 * already allocated at the new raid level to
9462 		 * carry on
9463 		 */
9464 		if (ret == -ENOSPC)
9465 			ret = 0;
9466 		if (ret < 0)
9467 			goto out;
9468 	}
9469 
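	/*
	 * Try to set the block group read-only; if that fails for lack of
	 * space, force-allocate a new chunk in this space_info and try once
	 * more.
	 */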
9470 	ret = inc_block_group_ro(cache, 0);
9471 	if (!ret)
9472 		goto out;
9473 	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9474 	ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9475 			     CHUNK_ALLOC_FORCE);
9476 	if (ret < 0)
9477 		goto out;
9478 	ret = inc_block_group_ro(cache, 0);
9479 out:
9480 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9481 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
9482 		mutex_lock(&fs_info->chunk_mutex);
9483 		check_system_chunk(trans, fs_info, alloc_flags);
9484 		mutex_unlock(&fs_info->chunk_mutex);
9485 	}
9486 	mutex_unlock(&fs_info->ro_block_group_mutex);
9487 
9488 	btrfs_end_transaction(trans);
9489 	return ret;
9490 }
9491 
9492 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9493 			    struct btrfs_fs_info *fs_info, u64 type)
9494 {
9495 	u64 alloc_flags = get_alloc_profile(fs_info, type);
9496 
9497 	return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
9498 }
9499 
9500 /*
9501  * helper to account the unused space of all the readonly block groups in
9502  * the space_info. takes mirrors into account.
9503  */
9504 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9505 {
9506 	struct btrfs_block_group_cache *block_group;
9507 	u64 free_bytes = 0;
9508 	int factor;
9509 
9510 	/* It's df, we don't care if it's racy */
9511 	if (list_empty(&sinfo->ro_bgs))
9512 		return 0;
9513 
9514 	spin_lock(&sinfo->lock);
9515 	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9516 		spin_lock(&block_group->lock);
9517 
9518 		if (!block_group->ro) {
9519 			spin_unlock(&block_group->lock);
9520 			continue;
9521 		}
9522 
9523 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9524 					  BTRFS_BLOCK_GROUP_RAID10 |
9525 					  BTRFS_BLOCK_GROUP_DUP))
9526 			factor = 2;
9527 		else
9528 			factor = 1;
9529 
9530 		free_bytes += (block_group->key.offset -
9531 			       btrfs_block_group_used(&block_group->item)) *
9532 			       factor;
9533 
9534 		spin_unlock(&block_group->lock);
9535 	}
9536 	spin_unlock(&sinfo->lock);
9537 
9538 	return free_bytes;
9539 }
9540 
9541 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9542 {
9543 	struct btrfs_space_info *sinfo = cache->space_info;
9544 	u64 num_bytes;
9545 
9546 	BUG_ON(!cache->ro);
9547 
9548 	spin_lock(&sinfo->lock);
9549 	spin_lock(&cache->lock);
9550 	if (!--cache->ro) {
9551 		num_bytes = cache->key.offset - cache->reserved -
9552 			    cache->pinned - cache->bytes_super -
9553 			    btrfs_block_group_used(&cache->item);
9554 		sinfo->bytes_readonly -= num_bytes;
9555 		list_del_init(&cache->ro_list);
9556 	}
9557 	spin_unlock(&cache->lock);
9558 	spin_unlock(&sinfo->lock);
9559 }
9560 
9561 /*
9562  * checks to see if it's even possible to relocate this block group.
9563  *
9564  * @return - -1 if it's not a good idea to relocate this block group, 0 if
9565  * it's ok to go ahead and try.
9566  */
9567 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9568 {
9569 	struct btrfs_root *root = fs_info->extent_root;
9570 	struct btrfs_block_group_cache *block_group;
9571 	struct btrfs_space_info *space_info;
9572 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9573 	struct btrfs_device *device;
9574 	struct btrfs_trans_handle *trans;
9575 	u64 min_free;
9576 	u64 dev_min = 1;
9577 	u64 dev_nr = 0;
9578 	u64 target;
9579 	int debug;
9580 	int index;
9581 	int full = 0;
9582 	int ret = 0;
9583 
9584 	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9585 
9586 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
9587 
9588 	/* odd, couldn't find the block group, leave it alone */
9589 	if (!block_group) {
9590 		if (debug)
9591 			btrfs_warn(fs_info,
9592 				   "can't find block group for bytenr %llu",
9593 				   bytenr);
9594 		return -1;
9595 	}
9596 
9597 	min_free = btrfs_block_group_used(&block_group->item);
9598 
9599 	/* no bytes used, we're good */
9600 	if (!min_free)
9601 		goto out;
9602 
9603 	space_info = block_group->space_info;
9604 	spin_lock(&space_info->lock);
9605 
9606 	full = space_info->full;
9607 
9608 	/*
9609 	 * if this is the last block group we have in this space, we can't
9610 	 * relocate it unless we're able to allocate a new chunk below.
9611 	 *
9612 	 * Otherwise, we need to make sure we have room in the space to handle
9613 	 * all of the extents from this block group.  If we can, we're good
9614 	 */
9615 	if ((space_info->total_bytes != block_group->key.offset) &&
9616 	    (btrfs_space_info_used(space_info, false) + min_free <
9617 	     space_info->total_bytes)) {
9618 		spin_unlock(&space_info->lock);
9619 		goto out;
9620 	}
9621 	spin_unlock(&space_info->lock);
9622 
9623 	/*
9624 	 * ok we don't have enough space, but maybe we have free space on our
9625 	 * devices to allocate new chunks for relocation, so loop through our
9626 	 * alloc devices and guess if we have enough space.  if this block
9627 	 * group is going to be restriped, run checks against the target
9628 	 * profile instead of the current one.
9629 	 */
9630 	ret = -1;
9631 
9632 	/*
9633 	 * index:
9634 	 *      0: raid10
9635 	 *      1: raid1
9636 	 *      2: dup
9637 	 *      3: raid0
9638 	 *      4: single
9639 	 */
9640 	target = get_restripe_target(fs_info, block_group->flags);
9641 	if (target) {
9642 		index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9643 	} else {
9644 		/*
9645 		 * this is just a balance, so if we were marked as full
9646 		 * we know there is no space for a new chunk
9647 		 */
9648 		if (full) {
9649 			if (debug)
9650 				btrfs_warn(fs_info,
9651 					   "no space to alloc new chunk for block group %llu",
9652 					   block_group->key.objectid);
9653 			goto out;
9654 		}
9655 
9656 		index = btrfs_bg_flags_to_raid_index(block_group->flags);
9657 	}
9658 
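	/*
	 * Adjust min_free and the number of devices we need for the raid
	 * profile: raid1/raid10 need copies on separate devices, dup needs
	 * twice the space on a single device, and raid0 spreads the data
	 * across all rw devices.
	 */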
9659 	if (index == BTRFS_RAID_RAID10) {
9660 		dev_min = 4;
9661 		/* Divide by 2 */
9662 		min_free >>= 1;
9663 	} else if (index == BTRFS_RAID_RAID1) {
9664 		dev_min = 2;
9665 	} else if (index == BTRFS_RAID_DUP) {
9666 		/* Multiply by 2 */
9667 		min_free <<= 1;
9668 	} else if (index == BTRFS_RAID_RAID0) {
9669 		dev_min = fs_devices->rw_devices;
9670 		min_free = div64_u64(min_free, dev_min);
9671 	}
9672 
9673 	/* We need to do this so that we can look at pending chunks */
9674 	trans = btrfs_join_transaction(root);
9675 	if (IS_ERR(trans)) {
9676 		ret = PTR_ERR(trans);
9677 		goto out;
9678 	}
9679 
9680 	mutex_lock(&fs_info->chunk_mutex);
9681 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9682 		u64 dev_offset;
9683 
9684 		/*
9685 		 * check to make sure we can actually find a chunk with enough
9686 		 * space to fit our block group in.
9687 		 */
9688 		if (device->total_bytes > device->bytes_used + min_free &&
9689 		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9690 			ret = find_free_dev_extent(trans, device, min_free,
9691 						   &dev_offset, NULL);
9692 			if (!ret)
9693 				dev_nr++;
9694 
9695 			if (dev_nr >= dev_min)
9696 				break;
9697 
9698 			ret = -1;
9699 		}
9700 	}
9701 	if (debug && ret == -1)
9702 		btrfs_warn(fs_info,
9703 			   "no space to allocate a new chunk for block group %llu",
9704 			   block_group->key.objectid);
9705 	mutex_unlock(&fs_info->chunk_mutex);
9706 	btrfs_end_transaction(trans);
9707 out:
9708 	btrfs_put_block_group(block_group);
9709 	return ret;
9710 }
9711 
9712 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9713 				  struct btrfs_path *path,
9714 				  struct btrfs_key *key)
9715 {
9716 	struct btrfs_root *root = fs_info->extent_root;
9717 	int ret = 0;
9718 	struct btrfs_key found_key;
9719 	struct extent_buffer *leaf;
9720 	int slot;
9721 
9722 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9723 	if (ret < 0)
9724 		goto out;
9725 
9726 	while (1) {
9727 		slot = path->slots[0];
9728 		leaf = path->nodes[0];
9729 		if (slot >= btrfs_header_nritems(leaf)) {
9730 			ret = btrfs_next_leaf(root, path);
9731 			if (ret == 0)
9732 				continue;
9733 			if (ret < 0)
9734 				goto out;
9735 			break;
9736 		}
9737 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
9738 
9739 		if (found_key.objectid >= key->objectid &&
9740 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9741 			struct extent_map_tree *em_tree;
9742 			struct extent_map *em;
9743 
9744 			em_tree = &root->fs_info->mapping_tree.map_tree;
9745 			read_lock(&em_tree->lock);
9746 			em = lookup_extent_mapping(em_tree, found_key.objectid,
9747 						   found_key.offset);
9748 			read_unlock(&em_tree->lock);
9749 			if (!em) {
9750 				btrfs_err(fs_info,
9751 			"logical %llu len %llu found bg but no related chunk",
9752 					  found_key.objectid, found_key.offset);
9753 				ret = -ENOENT;
9754 			} else {
9755 				ret = 0;
9756 			}
9757 			free_extent_map(em);
9758 			goto out;
9759 		}
9760 		path->slots[0]++;
9761 	}
9762 out:
9763 	return ret;
9764 }
9765 
9766 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9767 {
9768 	struct btrfs_block_group_cache *block_group;
9769 	u64 last = 0;
9770 
9771 	while (1) {
9772 		struct inode *inode;
9773 
9774 		block_group = btrfs_lookup_first_block_group(info, last);
9775 		while (block_group) {
9776 			spin_lock(&block_group->lock);
9777 			if (block_group->iref)
9778 				break;
9779 			spin_unlock(&block_group->lock);
9780 			block_group = next_block_group(info, block_group);
9781 		}
9782 		if (!block_group) {
9783 			if (last == 0)
9784 				break;
9785 			last = 0;
9786 			continue;
9787 		}
9788 
9789 		inode = block_group->inode;
9790 		block_group->iref = 0;
9791 		block_group->inode = NULL;
9792 		spin_unlock(&block_group->lock);
9793 		ASSERT(block_group->io_ctl.inode == NULL);
9794 		iput(inode);
9795 		last = block_group->key.objectid + block_group->key.offset;
9796 		btrfs_put_block_group(block_group);
9797 	}
9798 }
9799 
9800 /*
9801  * Must be called only after stopping all workers, since we could have block
9802  * group caching kthreads running, and therefore they could race with us if we
9803  * freed the block groups before stopping them.
9804  */
9805 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9806 {
9807 	struct btrfs_block_group_cache *block_group;
9808 	struct btrfs_space_info *space_info;
9809 	struct btrfs_caching_control *caching_ctl;
9810 	struct rb_node *n;
9811 
9812 	down_write(&info->commit_root_sem);
9813 	while (!list_empty(&info->caching_block_groups)) {
9814 		caching_ctl = list_entry(info->caching_block_groups.next,
9815 					 struct btrfs_caching_control, list);
9816 		list_del(&caching_ctl->list);
9817 		put_caching_control(caching_ctl);
9818 	}
9819 	up_write(&info->commit_root_sem);
9820 
9821 	spin_lock(&info->unused_bgs_lock);
9822 	while (!list_empty(&info->unused_bgs)) {
9823 		block_group = list_first_entry(&info->unused_bgs,
9824 					       struct btrfs_block_group_cache,
9825 					       bg_list);
9826 		list_del_init(&block_group->bg_list);
9827 		btrfs_put_block_group(block_group);
9828 	}
9829 	spin_unlock(&info->unused_bgs_lock);
9830 
9831 	spin_lock(&info->block_group_cache_lock);
9832 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9833 		block_group = rb_entry(n, struct btrfs_block_group_cache,
9834 				       cache_node);
9835 		rb_erase(&block_group->cache_node,
9836 			 &info->block_group_cache_tree);
9837 		RB_CLEAR_NODE(&block_group->cache_node);
9838 		spin_unlock(&info->block_group_cache_lock);
9839 
9840 		down_write(&block_group->space_info->groups_sem);
9841 		list_del(&block_group->list);
9842 		up_write(&block_group->space_info->groups_sem);
9843 
9844 		/*
9845 		 * We haven't cached this block group, which means we could
9846 		 * possibly have excluded extents on this block group.
9847 		 */
9848 		if (block_group->cached == BTRFS_CACHE_NO ||
9849 		    block_group->cached == BTRFS_CACHE_ERROR)
9850 			free_excluded_extents(info, block_group);
9851 
9852 		btrfs_remove_free_space_cache(block_group);
9853 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9854 		ASSERT(list_empty(&block_group->dirty_list));
9855 		ASSERT(list_empty(&block_group->io_list));
9856 		ASSERT(list_empty(&block_group->bg_list));
9857 		ASSERT(atomic_read(&block_group->count) == 1);
9858 		btrfs_put_block_group(block_group);
9859 
9860 		spin_lock(&info->block_group_cache_lock);
9861 	}
9862 	spin_unlock(&info->block_group_cache_lock);
9863 
9864 	/* now that all the block groups are freed, go through and
9865 	 * free all the space_info structs.  This is only called during
9866 	 * the final stages of unmount, and so we know nobody is
9867 	 * using them.  We call synchronize_rcu() once before we start,
9868 	 * just to be on the safe side.
9869 	 */
9870 	synchronize_rcu();
9871 
9872 	release_global_block_rsv(info);
9873 
9874 	while (!list_empty(&info->space_info)) {
9875 		int i;
9876 
9877 		space_info = list_entry(info->space_info.next,
9878 					struct btrfs_space_info,
9879 					list);
9880 
9881 		/*
9882 		 * Do not hide this behind enospc_debug, this is actually
9883 		 * important and indicates a real bug if this happens.
9884 		 */
9885 		if (WARN_ON(space_info->bytes_pinned > 0 ||
9886 			    space_info->bytes_reserved > 0 ||
9887 			    space_info->bytes_may_use > 0))
9888 			dump_space_info(info, space_info, 0, 0);
9889 		list_del(&space_info->list);
9890 		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9891 			struct kobject *kobj;
9892 			kobj = space_info->block_group_kobjs[i];
9893 			space_info->block_group_kobjs[i] = NULL;
9894 			if (kobj) {
9895 				kobject_del(kobj);
9896 				kobject_put(kobj);
9897 			}
9898 		}
9899 		kobject_del(&space_info->kobj);
9900 		kobject_put(&space_info->kobj);
9901 	}
9902 	return 0;
9903 }
9904 
9905 /* link_block_group will queue up kobjects to add when we're reclaim-safe */
9906 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9907 {
9908 	struct btrfs_space_info *space_info;
9909 	struct raid_kobject *rkobj;
9910 	LIST_HEAD(list);
9911 	int index;
9912 	int ret = 0;
9913 
9914 	spin_lock(&fs_info->pending_raid_kobjs_lock);
9915 	list_splice_init(&fs_info->pending_raid_kobjs, &list);
9916 	spin_unlock(&fs_info->pending_raid_kobjs_lock);
9917 
9918 	list_for_each_entry(rkobj, &list, list) {
9919 		space_info = __find_space_info(fs_info, rkobj->flags);
9920 		index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9921 
9922 		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9923 				  "%s", get_raid_name(index));
9924 		if (ret) {
9925 			kobject_put(&rkobj->kobj);
9926 			break;
9927 		}
9928 	}
9929 	if (ret)
9930 		btrfs_warn(fs_info,
9931 			   "failed to add kobject for block cache, ignoring");
9932 }
9933 
9934 static void link_block_group(struct btrfs_block_group_cache *cache)
9935 {
9936 	struct btrfs_space_info *space_info = cache->space_info;
9937 	struct btrfs_fs_info *fs_info = cache->fs_info;
9938 	int index = btrfs_bg_flags_to_raid_index(cache->flags);
9939 	bool first = false;
9940 
9941 	down_write(&space_info->groups_sem);
9942 	if (list_empty(&space_info->block_groups[index]))
9943 		first = true;
9944 	list_add_tail(&cache->list, &space_info->block_groups[index]);
9945 	up_write(&space_info->groups_sem);
9946 
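	/*
	 * The first block group with a given raid profile in this space_info
	 * gets a sysfs kobject; it is queued here and actually added later
	 * by btrfs_add_raid_kobjects() once we are reclaim-safe.
	 */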
9947 	if (first) {
9948 		struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9949 		if (!rkobj) {
9950 			btrfs_warn(cache->fs_info,
9951 				"couldn't alloc memory for raid level kobject");
9952 			return;
9953 		}
9954 		rkobj->flags = cache->flags;
9955 		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9956 
9957 		spin_lock(&fs_info->pending_raid_kobjs_lock);
9958 		list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9959 		spin_unlock(&fs_info->pending_raid_kobjs_lock);
9960 		space_info->block_group_kobjs[index] = &rkobj->kobj;
9961 	}
9962 }
9963 
9964 static struct btrfs_block_group_cache *
9965 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9966 			       u64 start, u64 size)
9967 {
9968 	struct btrfs_block_group_cache *cache;
9969 
9970 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
9971 	if (!cache)
9972 		return NULL;
9973 
9974 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9975 					GFP_NOFS);
9976 	if (!cache->free_space_ctl) {
9977 		kfree(cache);
9978 		return NULL;
9979 	}
9980 
9981 	cache->key.objectid = start;
9982 	cache->key.offset = size;
9983 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9984 
9985 	cache->fs_info = fs_info;
9986 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9987 	set_free_space_tree_thresholds(cache);
9988 
9989 	atomic_set(&cache->count, 1);
9990 	spin_lock_init(&cache->lock);
9991 	init_rwsem(&cache->data_rwsem);
9992 	INIT_LIST_HEAD(&cache->list);
9993 	INIT_LIST_HEAD(&cache->cluster_list);
9994 	INIT_LIST_HEAD(&cache->bg_list);
9995 	INIT_LIST_HEAD(&cache->ro_list);
9996 	INIT_LIST_HEAD(&cache->dirty_list);
9997 	INIT_LIST_HEAD(&cache->io_list);
9998 	btrfs_init_free_space_ctl(cache);
9999 	atomic_set(&cache->trimming, 0);
10000 	mutex_init(&cache->free_space_lock);
10001 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10002 
10003 	return cache;
10004 }
10005 
10006 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10007 {
10008 	struct btrfs_path *path;
10009 	int ret;
10010 	struct btrfs_block_group_cache *cache;
10011 	struct btrfs_space_info *space_info;
10012 	struct btrfs_key key;
10013 	struct btrfs_key found_key;
10014 	struct extent_buffer *leaf;
10015 	int need_clear = 0;
10016 	u64 cache_gen;
10017 	u64 feature;
10018 	int mixed;
10019 
10020 	feature = btrfs_super_incompat_flags(info->super_copy);
10021 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10022 
10023 	key.objectid = 0;
10024 	key.offset = 0;
10025 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10026 	path = btrfs_alloc_path();
10027 	if (!path)
10028 		return -ENOMEM;
10029 	path->reada = READA_FORWARD;
10030 
10031 	cache_gen = btrfs_super_cache_generation(info->super_copy);
10032 	if (btrfs_test_opt(info, SPACE_CACHE) &&
10033 	    btrfs_super_generation(info->super_copy) != cache_gen)
10034 		need_clear = 1;
10035 	if (btrfs_test_opt(info, CLEAR_CACHE))
10036 		need_clear = 1;
10037 
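	/*
	 * Walk every BLOCK_GROUP_ITEM in the extent tree and build an
	 * in-memory block group cache for each of them.
	 */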
10038 	while (1) {
10039 		ret = find_first_block_group(info, path, &key);
10040 		if (ret > 0)
10041 			break;
10042 		if (ret != 0)
10043 			goto error;
10044 
10045 		leaf = path->nodes[0];
10046 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10047 
10048 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
10049 						       found_key.offset);
10050 		if (!cache) {
10051 			ret = -ENOMEM;
10052 			goto error;
10053 		}
10054 
10055 		if (need_clear) {
10056 			/*
10057 			 * When we mount with an old space cache, we need to
10058 			 * set BTRFS_DC_CLEAR and set the dirty flag.
10059 			 *
10060 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10061 			 *    truncate the old free space cache inode and
10062 			 *    set up a new one.
10063 			 * b) Setting the 'dirty' flag makes sure that we flush
10064 			 *    the new space cache info onto disk.
10065 			 */
10066 			if (btrfs_test_opt(info, SPACE_CACHE))
10067 				cache->disk_cache_state = BTRFS_DC_CLEAR;
10068 		}
10069 
10070 		read_extent_buffer(leaf, &cache->item,
10071 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
10072 				   sizeof(cache->item));
10073 		cache->flags = btrfs_block_group_flags(&cache->item);
10074 		if (!mixed &&
10075 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10076 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10077 			btrfs_err(info,
10078 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10079 				  cache->key.objectid);
10080 			ret = -EINVAL;
10081 			goto error;
10082 		}
10083 
10084 		key.objectid = found_key.objectid + found_key.offset;
10085 		btrfs_release_path(path);
10086 
10087 		/*
10088 		 * We need to exclude the super stripes now so that the space
10089 		 * info has super bytes accounted for, otherwise we'll think
10090 		 * we have more space than we actually do.
10091 		 */
10092 		ret = exclude_super_stripes(info, cache);
10093 		if (ret) {
10094 			/*
10095 			 * We may have excluded something, so call this just in
10096 			 * case.
10097 			 */
10098 			free_excluded_extents(info, cache);
10099 			btrfs_put_block_group(cache);
10100 			goto error;
10101 		}
10102 
10103 		/*
10104 		 * check for two cases: either we are full, and therefore
10105 		 * don't need to bother with the caching work since we won't
10106 		 * find any space, or we are empty, and we can just add all
10107 		 * the space in and be done with it.  This saves us a lot of
10108 		 * time, particularly in the full case.
10109 		 */
10110 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10111 			cache->last_byte_to_unpin = (u64)-1;
10112 			cache->cached = BTRFS_CACHE_FINISHED;
10113 			free_excluded_extents(info, cache);
10114 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10115 			cache->last_byte_to_unpin = (u64)-1;
10116 			cache->cached = BTRFS_CACHE_FINISHED;
10117 			add_new_free_space(cache, found_key.objectid,
10118 					   found_key.objectid +
10119 					   found_key.offset);
10120 			free_excluded_extents(info, cache);
10121 		}
10122 
10123 		ret = btrfs_add_block_group_cache(info, cache);
10124 		if (ret) {
10125 			btrfs_remove_free_space_cache(cache);
10126 			btrfs_put_block_group(cache);
10127 			goto error;
10128 		}
10129 
10130 		trace_btrfs_add_block_group(info, cache, 0);
10131 		update_space_info(info, cache->flags, found_key.offset,
10132 				  btrfs_block_group_used(&cache->item),
10133 				  cache->bytes_super, &space_info);
10134 
10135 		cache->space_info = space_info;
10136 
10137 		link_block_group(cache);
10138 
10139 		set_avail_alloc_bits(info, cache->flags);
10140 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10141 			inc_block_group_ro(cache, 1);
10142 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10143 			spin_lock(&info->unused_bgs_lock);
10144 			/* Should always be true but just in case. */
10145 			if (list_empty(&cache->bg_list)) {
10146 				btrfs_get_block_group(cache);
10147 				trace_btrfs_add_unused_block_group(cache);
10148 				list_add_tail(&cache->bg_list,
10149 					      &info->unused_bgs);
10150 			}
10151 			spin_unlock(&info->unused_bgs_lock);
10152 		}
10153 	}
10154 
10155 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
10156 		if (!(get_alloc_profile(info, space_info->flags) &
10157 		      (BTRFS_BLOCK_GROUP_RAID10 |
10158 		       BTRFS_BLOCK_GROUP_RAID1 |
10159 		       BTRFS_BLOCK_GROUP_RAID5 |
10160 		       BTRFS_BLOCK_GROUP_RAID6 |
10161 		       BTRFS_BLOCK_GROUP_DUP)))
10162 			continue;
10163 		/*
10164 		 * avoid allocating from un-mirrored block groups if there are
10165 		 * mirrored block groups.
10166 		 */
10167 		list_for_each_entry(cache,
10168 				&space_info->block_groups[BTRFS_RAID_RAID0],
10169 				list)
10170 			inc_block_group_ro(cache, 1);
10171 		list_for_each_entry(cache,
10172 				&space_info->block_groups[BTRFS_RAID_SINGLE],
10173 				list)
10174 			inc_block_group_ro(cache, 1);
10175 	}
10176 
10177 	btrfs_add_raid_kobjects(info);
10178 	init_global_block_rsv(info);
10179 	ret = 0;
10180 error:
10181 	btrfs_free_path(path);
10182 	return ret;
10183 }
10184 
10185 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10186 {
10187 	struct btrfs_fs_info *fs_info = trans->fs_info;
10188 	struct btrfs_block_group_cache *block_group, *tmp;
10189 	struct btrfs_root *extent_root = fs_info->extent_root;
10190 	struct btrfs_block_group_item item;
10191 	struct btrfs_key key;
10192 	int ret = 0;
10193 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10194 
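	/*
	 * The item insertions below can themselves trigger chunk allocation;
	 * clear can_flush_pending_bgs so that doesn't re-enter this function
	 * while we are iterating the list.
	 */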
10195 	trans->can_flush_pending_bgs = false;
10196 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10197 		if (ret)
10198 			goto next;
10199 
10200 		spin_lock(&block_group->lock);
10201 		memcpy(&item, &block_group->item, sizeof(item));
10202 		memcpy(&key, &block_group->key, sizeof(key));
10203 		spin_unlock(&block_group->lock);
10204 
10205 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10206 					sizeof(item));
10207 		if (ret)
10208 			btrfs_abort_transaction(trans, ret);
10209 		ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
10210 					       key.offset);
10211 		if (ret)
10212 			btrfs_abort_transaction(trans, ret);
10213 		add_block_group_free_space(trans, block_group);
10214 		/* already aborted the transaction if it failed. */
10215 next:
10216 		list_del_init(&block_group->bg_list);
10217 	}
10218 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
10219 }
10220 
10221 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10222 			   struct btrfs_fs_info *fs_info, u64 bytes_used,
10223 			   u64 type, u64 chunk_offset, u64 size)
10224 {
10225 	struct btrfs_block_group_cache *cache;
10226 	int ret;
10227 
10228 	btrfs_set_log_full_commit(fs_info, trans);
10229 
10230 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10231 	if (!cache)
10232 		return -ENOMEM;
10233 
10234 	btrfs_set_block_group_used(&cache->item, bytes_used);
10235 	btrfs_set_block_group_chunk_objectid(&cache->item,
10236 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10237 	btrfs_set_block_group_flags(&cache->item, type);
10238 
10239 	cache->flags = type;
10240 	cache->last_byte_to_unpin = (u64)-1;
10241 	cache->cached = BTRFS_CACHE_FINISHED;
10242 	cache->needs_free_space = 1;
10243 	ret = exclude_super_stripes(fs_info, cache);
10244 	if (ret) {
10245 		/*
10246 		 * We may have excluded something, so call this just in
10247 		 * case.
10248 		 */
10249 		free_excluded_extents(fs_info, cache);
10250 		btrfs_put_block_group(cache);
10251 		return ret;
10252 	}
10253 
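	/*
	 * The new chunk is completely unused, so everything except the
	 * excluded super stripes goes straight into the free space cache.
	 */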
10254 	add_new_free_space(cache, chunk_offset, chunk_offset + size);
10255 
10256 	free_excluded_extents(fs_info, cache);
10257 
10258 #ifdef CONFIG_BTRFS_DEBUG
10259 	if (btrfs_should_fragment_free_space(cache)) {
10260 		u64 new_bytes_used = size - bytes_used;
10261 
10262 		bytes_used += new_bytes_used >> 1;
10263 		fragment_free_space(cache);
10264 	}
10265 #endif
10266 	/*
10267 	 * Ensure the corresponding space_info object is created and
10268 	 * assigned to our block group. We want our bg to be added to the rbtree
10269 	 * with its ->space_info set.
10270 	 */
10271 	cache->space_info = __find_space_info(fs_info, cache->flags);
10272 	ASSERT(cache->space_info);
10273 
10274 	ret = btrfs_add_block_group_cache(fs_info, cache);
10275 	if (ret) {
10276 		btrfs_remove_free_space_cache(cache);
10277 		btrfs_put_block_group(cache);
10278 		return ret;
10279 	}
10280 
10281 	/*
10282 	 * Now that our block group has its ->space_info set and is inserted in
10283 	 * the rbtree, update the space info's counters.
10284 	 */
10285 	trace_btrfs_add_block_group(fs_info, cache, 1);
10286 	update_space_info(fs_info, cache->flags, size, bytes_used,
10287 				cache->bytes_super, &cache->space_info);
10288 	update_global_block_rsv(fs_info);
10289 
10290 	link_block_group(cache);
10291 
10292 	list_add_tail(&cache->bg_list, &trans->new_bgs);
10293 
10294 	set_avail_alloc_bits(fs_info, type);
10295 	return 0;
10296 }
10297 
10298 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10299 {
10300 	u64 extra_flags = chunk_to_extended(flags) &
10301 				BTRFS_EXTENDED_PROFILE_MASK;
10302 
10303 	write_seqlock(&fs_info->profiles_lock);
10304 	if (flags & BTRFS_BLOCK_GROUP_DATA)
10305 		fs_info->avail_data_alloc_bits &= ~extra_flags;
10306 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10307 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10308 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10309 		fs_info->avail_system_alloc_bits &= ~extra_flags;
10310 	write_sequnlock(&fs_info->profiles_lock);
10311 }
10312 
10313 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10314 			     struct btrfs_fs_info *fs_info, u64 group_start,
10315 			     struct extent_map *em)
10316 {
10317 	struct btrfs_root *root = fs_info->extent_root;
10318 	struct btrfs_path *path;
10319 	struct btrfs_block_group_cache *block_group;
10320 	struct btrfs_free_cluster *cluster;
10321 	struct btrfs_root *tree_root = fs_info->tree_root;
10322 	struct btrfs_key key;
10323 	struct inode *inode;
10324 	struct kobject *kobj = NULL;
10325 	int ret;
10326 	int index;
10327 	int factor;
10328 	struct btrfs_caching_control *caching_ctl = NULL;
10329 	bool remove_em;
10330 
10331 	block_group = btrfs_lookup_block_group(fs_info, group_start);
10332 	BUG_ON(!block_group);
10333 	BUG_ON(!block_group->ro);
10334 
10335 	trace_btrfs_remove_block_group(block_group);
10336 	/*
10337 	 * Free the reserved super bytes from this block group before
10338 	 * removing it.
10339 	 */
10340 	free_excluded_extents(fs_info, block_group);
10341 	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10342 				  block_group->key.offset);
10343 
10344 	memcpy(&key, &block_group->key, sizeof(key));
10345 	index = btrfs_bg_flags_to_raid_index(block_group->flags);
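	/* raid1/raid10/dup keep two copies, so on-disk usage is doubled */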
10346 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10347 				  BTRFS_BLOCK_GROUP_RAID1 |
10348 				  BTRFS_BLOCK_GROUP_RAID10))
10349 		factor = 2;
10350 	else
10351 		factor = 1;
10352 
10353 	/* make sure this block group isn't part of an allocation cluster */
10354 	cluster = &fs_info->data_alloc_cluster;
10355 	spin_lock(&cluster->refill_lock);
10356 	btrfs_return_cluster_to_free_space(block_group, cluster);
10357 	spin_unlock(&cluster->refill_lock);
10358 
10359 	/*
10360 	 * make sure this block group isn't part of a metadata
10361 	 * allocation cluster
10362 	 */
10363 	cluster = &fs_info->meta_alloc_cluster;
10364 	spin_lock(&cluster->refill_lock);
10365 	btrfs_return_cluster_to_free_space(block_group, cluster);
10366 	spin_unlock(&cluster->refill_lock);
10367 
10368 	path = btrfs_alloc_path();
10369 	if (!path) {
10370 		ret = -ENOMEM;
10371 		goto out;
10372 	}
10373 
10374 	/*
10375 	 * get the inode first so any iput calls done for the io_list
10376 	 * aren't the final iput (no unlinks allowed now)
10377 	 */
10378 	inode = lookup_free_space_inode(fs_info, block_group, path);
10379 
10380 	mutex_lock(&trans->transaction->cache_write_mutex);
10381 	/*
10382 	 * make sure our free space cache IO is done before removing the
10383 	 * free space inode
10384 	 */
10385 	spin_lock(&trans->transaction->dirty_bgs_lock);
10386 	if (!list_empty(&block_group->io_list)) {
10387 		list_del_init(&block_group->io_list);
10388 
10389 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10390 
10391 		spin_unlock(&trans->transaction->dirty_bgs_lock);
10392 		btrfs_wait_cache_io(trans, block_group, path);
10393 		btrfs_put_block_group(block_group);
10394 		spin_lock(&trans->transaction->dirty_bgs_lock);
10395 	}
10396 
10397 	if (!list_empty(&block_group->dirty_list)) {
10398 		list_del_init(&block_group->dirty_list);
10399 		btrfs_put_block_group(block_group);
10400 	}
10401 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10402 	mutex_unlock(&trans->transaction->cache_write_mutex);
10403 
10404 	if (!IS_ERR(inode)) {
10405 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10406 		if (ret) {
10407 			btrfs_add_delayed_iput(inode);
10408 			goto out;
10409 		}
10410 		clear_nlink(inode);
10411 		/* One for the block group's ref */
10412 		spin_lock(&block_group->lock);
10413 		if (block_group->iref) {
10414 			block_group->iref = 0;
10415 			block_group->inode = NULL;
10416 			spin_unlock(&block_group->lock);
10417 			iput(inode);
10418 		} else {
10419 			spin_unlock(&block_group->lock);
10420 		}
10421 		/* One for our lookup ref */
10422 		btrfs_add_delayed_iput(inode);
10423 	}
10424 
10425 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10426 	key.offset = block_group->key.objectid;
10427 	key.type = 0;
10428 
10429 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10430 	if (ret < 0)
10431 		goto out;
10432 	if (ret > 0)
10433 		btrfs_release_path(path);
10434 	if (ret == 0) {
10435 		ret = btrfs_del_item(trans, tree_root, path);
10436 		if (ret)
10437 			goto out;
10438 		btrfs_release_path(path);
10439 	}
10440 
10441 	spin_lock(&fs_info->block_group_cache_lock);
10442 	rb_erase(&block_group->cache_node,
10443 		 &fs_info->block_group_cache_tree);
10444 	RB_CLEAR_NODE(&block_group->cache_node);
10445 
10446 	if (fs_info->first_logical_byte == block_group->key.objectid)
10447 		fs_info->first_logical_byte = (u64)-1;
10448 	spin_unlock(&fs_info->block_group_cache_lock);
10449 
10450 	down_write(&block_group->space_info->groups_sem);
10451 	/*
10452 	 * we must use list_del_init so people can check to see if they
10453 	 * are still on the list after taking the semaphore
10454 	 */
10455 	list_del_init(&block_group->list);
10456 	if (list_empty(&block_group->space_info->block_groups[index])) {
10457 		kobj = block_group->space_info->block_group_kobjs[index];
10458 		block_group->space_info->block_group_kobjs[index] = NULL;
10459 		clear_avail_alloc_bits(fs_info, block_group->flags);
10460 	}
10461 	up_write(&block_group->space_info->groups_sem);
10462 	if (kobj) {
10463 		kobject_del(kobj);
10464 		kobject_put(kobj);
10465 	}
10466 
10467 	if (block_group->has_caching_ctl)
10468 		caching_ctl = get_caching_control(block_group);
10469 	if (block_group->cached == BTRFS_CACHE_STARTED)
10470 		wait_block_group_cache_done(block_group);
10471 	if (block_group->has_caching_ctl) {
10472 		down_write(&fs_info->commit_root_sem);
10473 		if (!caching_ctl) {
10474 			struct btrfs_caching_control *ctl;
10475 
10476 			list_for_each_entry(ctl,
10477 				    &fs_info->caching_block_groups, list)
10478 				if (ctl->block_group == block_group) {
10479 					caching_ctl = ctl;
10480 					refcount_inc(&caching_ctl->count);
10481 					break;
10482 				}
10483 		}
10484 		if (caching_ctl)
10485 			list_del_init(&caching_ctl->list);
10486 		up_write(&fs_info->commit_root_sem);
10487 		if (caching_ctl) {
10488 			/* Once for the caching bgs list and once for us. */
10489 			put_caching_control(caching_ctl);
10490 			put_caching_control(caching_ctl);
10491 		}
10492 	}
10493 
10494 	spin_lock(&trans->transaction->dirty_bgs_lock);
10495 	WARN_ON(!list_empty(&block_group->dirty_list));
10498 	WARN_ON(!list_empty(&block_group->io_list));
10501 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10502 	btrfs_remove_free_space_cache(block_group);
10503 
10504 	spin_lock(&block_group->space_info->lock);
10505 	list_del_init(&block_group->ro_list);
10506 
10507 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10508 		WARN_ON(block_group->space_info->total_bytes
10509 			< block_group->key.offset);
10510 		WARN_ON(block_group->space_info->bytes_readonly
10511 			< block_group->key.offset);
10512 		WARN_ON(block_group->space_info->disk_total
10513 			< block_group->key.offset * factor);
10514 	}
10515 	block_group->space_info->total_bytes -= block_group->key.offset;
10516 	block_group->space_info->bytes_readonly -= block_group->key.offset;
10517 	block_group->space_info->disk_total -= block_group->key.offset * factor;
10518 
10519 	spin_unlock(&block_group->space_info->lock);
10520 
10521 	memcpy(&key, &block_group->key, sizeof(key));
10522 
10523 	mutex_lock(&fs_info->chunk_mutex);
10524 	if (!list_empty(&em->list)) {
10525 		/* We're in the transaction->pending_chunks list. */
10526 		free_extent_map(em);
10527 	}
10528 	spin_lock(&block_group->lock);
10529 	block_group->removed = 1;
10530 	/*
10531 	 * At this point trimming can't start on this block group, because we
10532 	 * removed the block group from the fs_info->block_group_cache_tree
10533 	 * rbtree, so no one can find it anymore.  Even if someone already got
10534 	 * this block group before we removed it from the rbtree, they have
10535 	 * already incremented block_group->trimming - if they didn't, they
10536 	 * won't find any free space entries because we already removed them
10537 	 * all when we called btrfs_remove_free_space_cache().
10538 	 *
10539 	 * And we must not remove the extent map from the fs_info->mapping_tree
10540 	 * to prevent the same logical address range and physical device space
10541 	 * ranges from being reused for a new block group. This is because our
10542 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10543 	 * completely transactionless, so while it is trimming a range the
10544 	 * currently running transaction might finish and a new one start,
10545 	 * allowing for new block groups to be created that can reuse the same
10546 	 * physical device locations unless we take this special care.
10547 	 *
10548 	 * There may also be an implicit trim operation if the file system
10549 	 * is mounted with -odiscard. The same protections must remain
10550 	 * in place until the extents have been discarded completely when
10551 	 * in place until the extents have been discarded completely, which
10552 	 * only happens once the transaction commit has completed.
10553 	remove_em = (atomic_read(&block_group->trimming) == 0);
10554 	/*
10555 	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10556 	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10557 	 * before checking block_group->removed).
10558 	 */
10559 	if (!remove_em) {
10560 		/*
10561 		 * Our em might be in trans->transaction->pending_chunks which
10562 		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10563 		 * and so is the fs_info->pinned_chunks list.
10564 		 *
10565 		 * So at this point we must be holding the chunk_mutex to avoid
10566 		 * any races with chunk allocation (more specifically at
10567 		 * volumes.c:contains_pending_extent()), to ensure it always
10568 		 * sees the em, either in the pending_chunks list or in the
10569 		 * pinned_chunks list.
10570 		 */
10571 		list_move_tail(&em->list, &fs_info->pinned_chunks);
10572 	}
10573 	spin_unlock(&block_group->lock);
10574 
10575 	if (remove_em) {
10576 		struct extent_map_tree *em_tree;
10577 
10578 		em_tree = &fs_info->mapping_tree.map_tree;
10579 		write_lock(&em_tree->lock);
10580 		/*
10581 		 * The em might be in the pending_chunks list, so make sure the
10582 		 * chunk mutex is locked, since remove_extent_mapping() will
10583 		 * delete us from that list.
10584 		 */
10585 		remove_extent_mapping(em_tree, em);
10586 		write_unlock(&em_tree->lock);
10587 		/* once for the tree */
10588 		free_extent_map(em);
10589 	}
10590 
10591 	mutex_unlock(&fs_info->chunk_mutex);
10592 
10593 	ret = remove_block_group_free_space(trans, block_group);
10594 	if (ret)
10595 		goto out;
10596 
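	/*
	 * Drop two references: one for the block_group_cache_tree rbtree
	 * entry erased above and one for the lookup done at the start of this
	 * function.
	 */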
10597 	btrfs_put_block_group(block_group);
10598 	btrfs_put_block_group(block_group);
10599 
10600 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10601 	if (ret > 0)
10602 		ret = -EIO;
10603 	if (ret < 0)
10604 		goto out;
10605 
10606 	ret = btrfs_del_item(trans, root, path);
10607 out:
10608 	btrfs_free_path(path);
10609 	return ret;
10610 }
10611 
10612 struct btrfs_trans_handle *
10613 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10614 				     const u64 chunk_offset)
10615 {
10616 	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10617 	struct extent_map *em;
10618 	struct map_lookup *map;
10619 	unsigned int num_items;
10620 
10621 	read_lock(&em_tree->lock);
10622 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10623 	read_unlock(&em_tree->lock);
10624 	ASSERT(em && em->start == chunk_offset);
10625 
10626 	/*
10627 	 * We need to reserve 3 + N units from the metadata space info in order
10628 	 * to remove a block group (done at btrfs_remove_chunk() and at
10629 	 * btrfs_remove_block_group()), which are used for:
10630 	 *
10631 	 * 1 unit for adding the free space inode's orphan (located in the tree
10632 	 * of tree roots).
10633 	 * 1 unit for deleting the block group item (located in the extent
10634 	 * tree).
10635 	 * 1 unit for deleting the free space item (located in tree of tree
10636 	 * roots).
10637 	 * N units for deleting N device extent items corresponding to each
10638 	 * stripe (located in the device tree).
10639 	 *
10640 	 * In order to remove a block group we also need to reserve units in the
10641 	 * system space info in order to update the chunk tree (update one or
10642 	 * more device items and remove one chunk item), but this is done at
10643 	 * btrfs_remove_chunk() through a call to check_system_chunk().
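	 *
	 * For example, removing a block group whose chunk is striped across
	 * two devices therefore reserves 3 + 2 = 5 metadata units here.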
10644 	 */
10645 	map = em->map_lookup;
10646 	num_items = 3 + map->num_stripes;
10647 	free_extent_map(em);
10648 
10649 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10650 							   num_items, 1);
10651 }
10652 
10653 /*
10654  * Process the unused_bgs list and remove any that don't have any allocated
10655  * space inside of them.
10656  */
10657 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10658 {
10659 	struct btrfs_block_group_cache *block_group;
10660 	struct btrfs_space_info *space_info;
10661 	struct btrfs_trans_handle *trans;
10662 	int ret = 0;
10663 
10664 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10665 		return;
10666 
10667 	spin_lock(&fs_info->unused_bgs_lock);
10668 	while (!list_empty(&fs_info->unused_bgs)) {
10669 		u64 start, end;
10670 		int trimming;
10671 
10672 		block_group = list_first_entry(&fs_info->unused_bgs,
10673 					       struct btrfs_block_group_cache,
10674 					       bg_list);
10675 		list_del_init(&block_group->bg_list);
10676 
10677 		space_info = block_group->space_info;
10678 
10679 		if (ret || btrfs_mixed_space_info(space_info)) {
10680 			btrfs_put_block_group(block_group);
10681 			continue;
10682 		}
10683 		spin_unlock(&fs_info->unused_bgs_lock);
10684 
10685 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
10686 
10687 		/* Don't want to race with allocators so take the groups_sem */
10688 		down_write(&space_info->groups_sem);
10689 		spin_lock(&block_group->lock);
10690 		if (block_group->reserved ||
10691 		    btrfs_block_group_used(&block_group->item) ||
10692 		    block_group->ro ||
10693 		    list_is_singular(&block_group->list)) {
10694 			/*
10695 			 * We want to bail if we made new allocations or have
10696 			 * outstanding allocations in this block group.  We do
10697 			 * the ro check in case balance is currently acting on
10698 			 * this block group.
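			 * We also skip the last block group of its kind in
			 * this space_info (the list_is_singular() check
			 * above), so at least one block group of this profile
			 * stays around.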
10699 			 */
10700 			trace_btrfs_skip_unused_block_group(block_group);
10701 			spin_unlock(&block_group->lock);
10702 			up_write(&space_info->groups_sem);
10703 			goto next;
10704 		}
10705 		spin_unlock(&block_group->lock);
10706 
10707 		/* We don't want to force the issue, only flip if it's ok. */
10708 		ret = inc_block_group_ro(block_group, 0);
10709 		up_write(&space_info->groups_sem);
10710 		if (ret < 0) {
10711 			ret = 0;
10712 			goto next;
10713 		}
10714 
10715 		/*
10716 		 * Want to do this before we do anything else so we can recover
10717 		 * properly if we fail to join the transaction.
10718 		 */
10719 		trans = btrfs_start_trans_remove_block_group(fs_info,
10720 						     block_group->key.objectid);
10721 		if (IS_ERR(trans)) {
10722 			btrfs_dec_block_group_ro(block_group);
10723 			ret = PTR_ERR(trans);
10724 			goto next;
10725 		}
10726 
10727 		/*
10728 		 * We could have pending pinned extents for this block group,
10729 		 * so just delete them; we don't care about them anymore.
10730 		 */
10731 		start = block_group->key.objectid;
10732 		end = start + block_group->key.offset - 1;
10733 		/*
10734 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10735 		 * btrfs_finish_extent_commit(). If we are at transaction N,
10736 		 * another task might be running finish_extent_commit() for the
10737 		 * previous transaction N - 1, and have seen a range belonging
10738 		 * to the block group in freed_extents[] before we were able to
10739 		 * clear the whole block group range from freed_extents[]. This
10740 		 * means that task can lookup for the block group after we
10741 		 * means that task can look up the block group after we have
10742 		 * a BUG_ON() at btrfs_unpin_extent_range().
10743 		 */
10744 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
10745 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10746 				  EXTENT_DIRTY);
10747 		if (ret) {
10748 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10749 			btrfs_dec_block_group_ro(block_group);
10750 			goto end_trans;
10751 		}
10752 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10753 				  EXTENT_DIRTY);
10754 		if (ret) {
10755 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10756 			btrfs_dec_block_group_ro(block_group);
10757 			goto end_trans;
10758 		}
10759 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10760 
10761 		/* Reset pinned so btrfs_put_block_group doesn't complain */
10762 		spin_lock(&space_info->lock);
10763 		spin_lock(&block_group->lock);
10764 
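		/*
		 * This block group is read-only and about to be removed, so
		 * transfer its pinned byte count to the space_info's
		 * read-only accounting.
		 */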
10765 		space_info->bytes_pinned -= block_group->pinned;
10766 		space_info->bytes_readonly += block_group->pinned;
10767 		percpu_counter_add(&space_info->total_bytes_pinned,
10768 				   -block_group->pinned);
10769 		block_group->pinned = 0;
10770 
10771 		spin_unlock(&block_group->lock);
10772 		spin_unlock(&space_info->lock);
10773 
10774 		/* DISCARD can flip during remount */
10775 		trimming = btrfs_test_opt(fs_info, DISCARD);
10776 
10777 		/* Implicit trim during transaction commit. */
10778 		if (trimming)
10779 			btrfs_get_block_group_trimming(block_group);
10780 
10781 		/*
10782 		 * btrfs_remove_chunk() will abort the transaction if things go
10783 		 * horribly wrong.
10784 		 */
10785 		ret = btrfs_remove_chunk(trans, fs_info,
10786 					 block_group->key.objectid);
10787 
10788 		if (ret) {
10789 			if (trimming)
10790 				btrfs_put_block_group_trimming(block_group);
10791 			goto end_trans;
10792 		}
10793 
10794 		/*
10795 		 * If we're not mounted with -odiscard, we can just forget
10796 		 * about this block group. Otherwise we'll need to wait
10797 		 * until transaction commit to do the actual discard.
10798 		 */
10799 		if (trimming) {
10800 			spin_lock(&fs_info->unused_bgs_lock);
10801 			/*
10802 			 * A concurrent scrub might have added us to the list
10803 			 * fs_info->unused_bgs, so use a list_move operation
10804 			 * to add the block group to the deleted_bgs list.
10805 			 */
10806 			list_move(&block_group->bg_list,
10807 				  &trans->transaction->deleted_bgs);
10808 			spin_unlock(&fs_info->unused_bgs_lock);
10809 			btrfs_get_block_group(block_group);
10810 		}
10811 end_trans:
10812 		btrfs_end_transaction(trans);
10813 next:
10814 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10815 		btrfs_put_block_group(block_group);
10816 		spin_lock(&fs_info->unused_bgs_lock);
10817 	}
10818 	spin_unlock(&fs_info->unused_bgs_lock);
10819 }
10820 
10821 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10822 {
10823 	struct btrfs_super_block *disk_super;
10824 	u64 features;
10825 	u64 flags;
10826 	int mixed = 0;
10827 	int ret;
10828 
10829 	disk_super = fs_info->super_copy;
10830 	if (!btrfs_super_root(disk_super))
10831 		return -EINVAL;
10832 
10833 	features = btrfs_super_incompat_flags(disk_super);
10834 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10835 		mixed = 1;
10836 
10837 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
10838 	ret = create_space_info(fs_info, flags);
10839 	if (ret)
10840 		goto out;
10841 
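	/*
	 * With mixed block groups, data and metadata share the same block
	 * groups, so one combined space_info covers both; otherwise create a
	 * separate space_info for each.
	 */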
10842 	if (mixed) {
10843 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10844 		ret = create_space_info(fs_info, flags);
10845 	} else {
10846 		flags = BTRFS_BLOCK_GROUP_METADATA;
10847 		ret = create_space_info(fs_info, flags);
10848 		if (ret)
10849 			goto out;
10850 
10851 		flags = BTRFS_BLOCK_GROUP_DATA;
10852 		ret = create_space_info(fs_info, flags);
10853 	}
10854 out:
10855 	return ret;
10856 }
10857 
10858 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10859 				   u64 start, u64 end)
10860 {
10861 	return unpin_extent_range(fs_info, start, end, false);
10862 }
10863 
10864 /*
10865  * It used to be that old block groups would be left around forever.
10866  * Iterating over them would be enough to trim unused space.  Since we
10867  * now automatically remove them, we also need to iterate over unallocated
10868  * space.
10869  *
10870  * We don't want a transaction for this since the discard may take a
10871  * substantial amount of time.  We don't require that a transaction be
10872  * running, but we do need to take a running transaction into account
10873  * to ensure that we're not discarding chunks that were released in
10874  * the current transaction.
10875  *
10876  * Holding the chunks lock will prevent other threads from allocating
10877  * or releasing chunks, but it won't prevent a running transaction
10878  * from committing and releasing the memory that the pending chunks
10879  * list head uses.  For that, we need to take a reference to the
10880  * transaction.
10881  */
10882 static int btrfs_trim_free_extents(struct btrfs_device *device,
10883 				   u64 minlen, u64 *trimmed)
10884 {
10885 	u64 start = 0, len = 0;
10886 	int ret;
10887 
10888 	*trimmed = 0;
10889 
10890 	/* Not writeable = nothing to do. */
10891 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10892 		return 0;
10893 
10894 	/* No free space = nothing to do. */
10895 	if (device->total_bytes <= device->bytes_used)
10896 		return 0;
10897 
10898 	ret = 0;
10899 
10900 	while (1) {
10901 		struct btrfs_fs_info *fs_info = device->fs_info;
10902 		struct btrfs_transaction *trans;
10903 		u64 bytes;
10904 
10905 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10906 		if (ret)
10907 			return ret;
10908 
10909 		down_read(&fs_info->commit_root_sem);
10910 
10911 		spin_lock(&fs_info->trans_lock);
10912 		trans = fs_info->running_transaction;
10913 		if (trans)
10914 			refcount_inc(&trans->use_count);
10915 		spin_unlock(&fs_info->trans_lock);
10916 
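		/*
		 * find_free_dev_extent_start() consults the running
		 * transaction's pending chunks (if there is one), so the
		 * reference taken above keeps that transaction alive for the
		 * duration of the search.
		 */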
10917 		ret = find_free_dev_extent_start(trans, device, minlen, start,
10918 						 &start, &len);
10919 		if (trans)
10920 			btrfs_put_transaction(trans);
10921 
10922 		if (ret) {
10923 			up_read(&fs_info->commit_root_sem);
10924 			mutex_unlock(&fs_info->chunk_mutex);
10925 			if (ret == -ENOSPC)
10926 				ret = 0;
10927 			break;
10928 		}
10929 
10930 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10931 		up_read(&fs_info->commit_root_sem);
10932 		mutex_unlock(&fs_info->chunk_mutex);
10933 
10934 		if (ret)
10935 			break;
10936 
10937 		start += len;
10938 		*trimmed += bytes;
10939 
10940 		if (fatal_signal_pending(current)) {
10941 			ret = -ERESTARTSYS;
10942 			break;
10943 		}
10944 
10945 		cond_resched();
10946 	}
10947 
10948 	return ret;
10949 }
10950 
10951 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10952 {
10953 	struct btrfs_block_group_cache *cache = NULL;
10954 	struct btrfs_device *device;
10955 	struct list_head *devices;
10956 	u64 group_trimmed;
10957 	u64 start;
10958 	u64 end;
10959 	u64 trimmed = 0;
10960 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10961 	int ret = 0;
10962 
10963 	/*
10964 	 * Try to trim all FS space; our first block group may start at a
10965 	 * non-zero offset.
10965 	 */
10966 	if (range->len == total_bytes)
10967 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
10968 	else
10969 		cache = btrfs_lookup_block_group(fs_info, range->start);
10970 
10971 	while (cache) {
10972 		if (cache->key.objectid >= (range->start + range->len)) {
10973 			btrfs_put_block_group(cache);
10974 			break;
10975 		}
10976 
10977 		start = max(range->start, cache->key.objectid);
10978 		end = min(range->start + range->len,
10979 				cache->key.objectid + cache->key.offset);
10980 
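		/*
		 * [start, end) is the intersection of the requested trim
		 * range and this block group; skip the group if that
		 * intersection is smaller than the minimum trim length.
		 */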
10981 		if (end - start >= range->minlen) {
10982 			if (!block_group_cache_done(cache)) {
10983 				ret = cache_block_group(cache, 0);
10984 				if (ret) {
10985 					btrfs_put_block_group(cache);
10986 					break;
10987 				}
10988 				ret = wait_block_group_cache_done(cache);
10989 				if (ret) {
10990 					btrfs_put_block_group(cache);
10991 					break;
10992 				}
10993 			}
10994 			ret = btrfs_trim_block_group(cache,
10995 						     &group_trimmed,
10996 						     start,
10997 						     end,
10998 						     range->minlen);
10999 
11000 			trimmed += group_trimmed;
11001 			if (ret) {
11002 				btrfs_put_block_group(cache);
11003 				break;
11004 			}
11005 		}
11006 
11007 		cache = next_block_group(fs_info, cache);
11008 	}
11009 
11010 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
11011 	devices = &fs_info->fs_devices->alloc_list;
11012 	list_for_each_entry(device, devices, dev_alloc_list) {
11013 		ret = btrfs_trim_free_extents(device, range->minlen,
11014 					      &group_trimmed);
11015 		if (ret)
11016 			break;
11017 
11018 		trimmed += group_trimmed;
11019 	}
11020 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11021 
11022 	range->len = trimmed;
11023 	return ret;
11024 }
11025 
11026 /*
11027  * btrfs_{start,end}_write_no_snapshotting() are similar to
11028  * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
11029  * data into the page cache through nocow before the subvolume is snapshotted
11030  * and flushing it to disk only after the snapshot is created, and to prevent
11031  * operations while snapshotting is ongoing that would make the snapshot
11032  * inconsistent (writes followed by expanding truncates for example).
11033  */
11034 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11035 {
11036 	percpu_counter_dec(&root->subv_writers->counter);
11037 	cond_wake_up(&root->subv_writers->wait);
11038 }
11039 
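/*
 * Returns 0 if a snapshot of @root is about to be created, in which case the
 * caller must not start a nocow write, and 1 after successfully entering the
 * no-snapshotting section (to be paired with a later call to
 * btrfs_end_write_no_snapshotting()).
 */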
11040 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11041 {
11042 	if (atomic_read(&root->will_be_snapshotted))
11043 		return 0;
11044 
11045 	percpu_counter_inc(&root->subv_writers->counter);
11046 	/*
11047 	 * Make sure counter is updated before we check for snapshot creation.
11048 	 */
11049 	smp_mb();
11050 	if (atomic_read(&root->will_be_snapshotted)) {
11051 		btrfs_end_write_no_snapshotting(root);
11052 		return 0;
11053 	}
11054 	return 1;
11055 }
11056 
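/*
 * Wait until no snapshot of @root is pending and enter a no-snapshotting
 * section (the caller is expected to end it with
 * btrfs_end_write_no_snapshotting() when done).
 */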
11057 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11058 {
11059 	while (true) {
11060 		int ret;
11061 
11062 		ret = btrfs_start_write_no_snapshotting(root);
11063 		if (ret)
11064 			break;
11065 		wait_var_event(&root->will_be_snapshotted,
11066 			       !atomic_read(&root->will_be_snapshotted));
11067 	}
11068 }
11069