xref: /linux/fs/btrfs/extent-tree.c (revision 3ff78451b8e446e9a548b98a0d4dd8d24dc5780b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
33 #include "discard.h"
34 #include "zoned.h"
35 #include "dev-replace.h"
36 #include "fs.h"
37 #include "accessors.h"
38 #include "root-tree.h"
39 #include "file-item.h"
40 #include "orphan.h"
41 #include "tree-checker.h"
42 #include "raid-stripe-tree.h"
43 
44 #undef SCRAMBLE_DELAYED_REFS
45 
46 
47 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
48 			       struct btrfs_delayed_ref_head *href,
49 			       struct btrfs_delayed_ref_node *node,
50 			       struct btrfs_delayed_extent_op *extra_op);
51 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52 				    struct extent_buffer *leaf,
53 				    struct btrfs_extent_item *ei);
54 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
55 				      u64 parent, u64 root_objectid,
56 				      u64 flags, u64 owner, u64 offset,
57 				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
58 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
59 				     struct btrfs_delayed_ref_node *node,
60 				     struct btrfs_delayed_extent_op *extent_op);
61 static int find_next_key(struct btrfs_path *path, int level,
62 			 struct btrfs_key *key);
63 
64 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
65 {
66 	return (cache->flags & bits) == bits;
67 }
68 
69 /* simple helper to search for an existing data extent at a given offset */
70 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
71 {
72 	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
73 	int ret;
74 	struct btrfs_key key;
75 	struct btrfs_path *path;
76 
77 	path = btrfs_alloc_path();
78 	if (!path)
79 		return -ENOMEM;
80 
81 	key.objectid = start;
82 	key.offset = len;
83 	key.type = BTRFS_EXTENT_ITEM_KEY;
84 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
85 	btrfs_free_path(path);
86 	return ret;
87 }
88 
89 /*
90  * helper function to lookup reference count and flags of a tree block.
91  *
92  * the head node for delayed ref is used to store the sum of all the
93  * reference count modifications queued up in the rbtree. the head
94  * node may also store the extent flags to set. This way you can check
95  * to see what the reference count and extent flags would be if all of
96  * the delayed refs are not processed.
97  */
98 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
99 			     struct btrfs_fs_info *fs_info, u64 bytenr,
100 			     u64 offset, int metadata, u64 *refs, u64 *flags,
101 			     u64 *owning_root)
102 {
103 	struct btrfs_root *extent_root;
104 	struct btrfs_delayed_ref_head *head;
105 	struct btrfs_delayed_ref_root *delayed_refs;
106 	struct btrfs_path *path;
107 	struct btrfs_extent_item *ei;
108 	struct extent_buffer *leaf;
109 	struct btrfs_key key;
110 	u32 item_size;
111 	u64 num_refs;
112 	u64 extent_flags;
113 	u64 owner = 0;
114 	int ret;
115 
116 	/*
117 	 * If we don't have skinny metadata, don't bother doing anything
118 	 * different
119 	 */
120 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
121 		offset = fs_info->nodesize;
122 		metadata = 0;
123 	}
124 
125 	path = btrfs_alloc_path();
126 	if (!path)
127 		return -ENOMEM;
128 
129 	if (!trans) {
130 		path->skip_locking = 1;
131 		path->search_commit_root = 1;
132 	}
133 
134 search_again:
135 	key.objectid = bytenr;
136 	key.offset = offset;
137 	if (metadata)
138 		key.type = BTRFS_METADATA_ITEM_KEY;
139 	else
140 		key.type = BTRFS_EXTENT_ITEM_KEY;
141 
142 	extent_root = btrfs_extent_root(fs_info, bytenr);
143 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
144 	if (ret < 0)
145 		goto out_free;
146 
147 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
148 		if (path->slots[0]) {
149 			path->slots[0]--;
150 			btrfs_item_key_to_cpu(path->nodes[0], &key,
151 					      path->slots[0]);
152 			if (key.objectid == bytenr &&
153 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
154 			    key.offset == fs_info->nodesize)
155 				ret = 0;
156 		}
157 	}
158 
159 	if (ret == 0) {
160 		leaf = path->nodes[0];
161 		item_size = btrfs_item_size(leaf, path->slots[0]);
162 		if (item_size >= sizeof(*ei)) {
163 			ei = btrfs_item_ptr(leaf, path->slots[0],
164 					    struct btrfs_extent_item);
165 			num_refs = btrfs_extent_refs(leaf, ei);
166 			extent_flags = btrfs_extent_flags(leaf, ei);
167 			owner = btrfs_get_extent_owner_root(fs_info, leaf,
168 							    path->slots[0]);
169 		} else {
170 			ret = -EUCLEAN;
171 			btrfs_err(fs_info,
172 			"unexpected extent item size, has %u expect >= %zu",
173 				  item_size, sizeof(*ei));
174 			if (trans)
175 				btrfs_abort_transaction(trans, ret);
176 			else
177 				btrfs_handle_fs_error(fs_info, ret, NULL);
178 
179 			goto out_free;
180 		}
181 
182 		BUG_ON(num_refs == 0);
183 	} else {
184 		num_refs = 0;
185 		extent_flags = 0;
186 		ret = 0;
187 	}
188 
189 	if (!trans)
190 		goto out;
191 
192 	delayed_refs = &trans->transaction->delayed_refs;
193 	spin_lock(&delayed_refs->lock);
194 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
195 	if (head) {
196 		if (!mutex_trylock(&head->mutex)) {
197 			refcount_inc(&head->refs);
198 			spin_unlock(&delayed_refs->lock);
199 
200 			btrfs_release_path(path);
201 
202 			/*
203 			 * Mutex was contended, block until it's released and try
204 			 * again
205 			 */
206 			mutex_lock(&head->mutex);
207 			mutex_unlock(&head->mutex);
208 			btrfs_put_delayed_ref_head(head);
209 			goto search_again;
210 		}
211 		spin_lock(&head->lock);
212 		if (head->extent_op && head->extent_op->update_flags)
213 			extent_flags |= head->extent_op->flags_to_set;
214 		else
215 			BUG_ON(num_refs == 0);
216 
217 		num_refs += head->ref_mod;
218 		spin_unlock(&head->lock);
219 		mutex_unlock(&head->mutex);
220 	}
221 	spin_unlock(&delayed_refs->lock);
222 out:
223 	WARN_ON(num_refs == 0);
224 	if (refs)
225 		*refs = num_refs;
226 	if (flags)
227 		*flags = extent_flags;
228 	if (owning_root)
229 		*owning_root = owner;
230 out_free:
231 	btrfs_free_path(path);
232 	return ret;
233 }
234 
235 /*
236  * Back reference rules.  Back refs have three main goals:
237  *
238  * 1) differentiate between all holders of references to an extent so that
239  *    when a reference is dropped we can make sure it was a valid reference
240  *    before freeing the extent.
241  *
242  * 2) Provide enough information to quickly find the holders of an extent
243  *    if we notice a given block is corrupted or bad.
244  *
245  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
246  *    maintenance.  This is actually the same as #2, but with a slightly
247  *    different use case.
248  *
249  * There are two kinds of back refs. The implicit back refs is optimized
250  * for pointers in non-shared tree blocks. For a given pointer in a block,
251  * back refs of this kind provide information about the block's owner tree
252  * and the pointer's key. These information allow us to find the block by
253  * b-tree searching. The full back refs is for pointers in tree blocks not
254  * referenced by their owner trees. The location of tree block is recorded
255  * in the back refs. Actually the full back refs is generic, and can be
256  * used in all cases the implicit back refs is used. The major shortcoming
257  * of the full back refs is its overhead. Every time a tree block gets
258  * COWed, we have to update back refs entry for all pointers in it.
259  *
260  * For a newly allocated tree block, we use implicit back refs for
261  * pointers in it. This means most tree related operations only involve
262  * implicit back refs. For a tree block created in old transaction, the
263  * only way to drop a reference to it is COW it. So we can detect the
264  * event that tree block loses its owner tree's reference and do the
265  * back refs conversion.
266  *
267  * When a tree block is COWed through a tree, there are four cases:
268  *
269  * The reference count of the block is one and the tree is the block's
270  * owner tree. Nothing to do in this case.
271  *
272  * The reference count of the block is one and the tree is not the
273  * block's owner tree. In this case, full back refs is used for pointers
274  * in the block. Remove these full back refs, add implicit back refs for
275  * every pointers in the new block.
276  *
277  * The reference count of the block is greater than one and the tree is
278  * the block's owner tree. In this case, implicit back refs is used for
279  * pointers in the block. Add full back refs for every pointers in the
280  * block, increase lower level extents' reference counts. The original
281  * implicit back refs are entailed to the new block.
282  *
283  * The reference count of the block is greater than one and the tree is
284  * not the block's owner tree. Add implicit back refs for every pointer in
285  * the new block, increase lower level extents' reference count.
286  *
287  * Back Reference Key composing:
288  *
289  * The key objectid corresponds to the first byte in the extent,
290  * The key type is used to differentiate between types of back refs.
291  * There are different meanings of the key offset for different types
292  * of back refs.
293  *
294  * File extents can be referenced by:
295  *
296  * - multiple snapshots, subvolumes, or different generations in one subvol
297  * - different files inside a single subvolume
298  * - different offsets inside a file (bookend extents in file.c)
299  *
300  * The extent ref structure for the implicit back refs has fields for:
301  *
302  * - Objectid of the subvolume root
303  * - objectid of the file holding the reference
304  * - original offset in the file
305  * - how many bookend extents
306  *
307  * The key offset for the implicit back refs is hash of the first
308  * three fields.
309  *
310  * The extent ref structure for the full back refs has field for:
311  *
312  * - number of pointers in the tree leaf
313  *
314  * The key offset for the implicit back refs is the first byte of
315  * the tree leaf
316  *
317  * When a file extent is allocated, The implicit back refs is used.
318  * the fields are filled in:
319  *
320  *     (root_key.objectid, inode objectid, offset in file, 1)
321  *
322  * When a file extent is removed file truncation, we find the
323  * corresponding implicit back refs and check the following fields:
324  *
325  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
326  *
327  * Btree extents can be referenced by:
328  *
329  * - Different subvolumes
330  *
331  * Both the implicit back refs and the full back refs for tree blocks
332  * only consist of key. The key offset for the implicit back refs is
333  * objectid of block's owner tree. The key offset for the full back refs
334  * is the first byte of parent block.
335  *
336  * When implicit back refs is used, information about the lowest key and
337  * level of the tree block are required. These information are stored in
338  * tree block info structure.
339  */
340 
341 /*
342  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
343  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
344  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
345  */
346 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
347 				     struct btrfs_extent_inline_ref *iref,
348 				     enum btrfs_inline_ref_type is_data)
349 {
350 	struct btrfs_fs_info *fs_info = eb->fs_info;
351 	int type = btrfs_extent_inline_ref_type(eb, iref);
352 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
353 
354 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
355 		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
356 		return type;
357 	}
358 
359 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
360 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
361 	    type == BTRFS_SHARED_DATA_REF_KEY ||
362 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
363 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
364 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
365 				return type;
366 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
367 				ASSERT(fs_info);
368 				/*
369 				 * Every shared one has parent tree block,
370 				 * which must be aligned to sector size.
371 				 */
372 				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
373 					return type;
374 			}
375 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
376 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
377 				return type;
378 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
379 				ASSERT(fs_info);
380 				/*
381 				 * Every shared one has parent tree block,
382 				 * which must be aligned to sector size.
383 				 */
384 				if (offset &&
385 				    IS_ALIGNED(offset, fs_info->sectorsize))
386 					return type;
387 			}
388 		} else {
389 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
390 			return type;
391 		}
392 	}
393 
394 	WARN_ON(1);
395 	btrfs_print_leaf(eb);
396 	btrfs_err(fs_info,
397 		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
398 		  eb->start, (unsigned long)iref, type);
399 
400 	return BTRFS_REF_TYPE_INVALID;
401 }
402 
403 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
404 {
405 	u32 high_crc = ~(u32)0;
406 	u32 low_crc = ~(u32)0;
407 	__le64 lenum;
408 
409 	lenum = cpu_to_le64(root_objectid);
410 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
411 	lenum = cpu_to_le64(owner);
412 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
413 	lenum = cpu_to_le64(offset);
414 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
415 
416 	return ((u64)high_crc << 31) ^ (u64)low_crc;
417 }
418 
419 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
420 				     struct btrfs_extent_data_ref *ref)
421 {
422 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
423 				    btrfs_extent_data_ref_objectid(leaf, ref),
424 				    btrfs_extent_data_ref_offset(leaf, ref));
425 }
426 
427 static int match_extent_data_ref(struct extent_buffer *leaf,
428 				 struct btrfs_extent_data_ref *ref,
429 				 u64 root_objectid, u64 owner, u64 offset)
430 {
431 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
432 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
433 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
434 		return 0;
435 	return 1;
436 }
437 
438 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
439 					   struct btrfs_path *path,
440 					   u64 bytenr, u64 parent,
441 					   u64 root_objectid,
442 					   u64 owner, u64 offset)
443 {
444 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
445 	struct btrfs_key key;
446 	struct btrfs_extent_data_ref *ref;
447 	struct extent_buffer *leaf;
448 	u32 nritems;
449 	int recow;
450 	int ret;
451 
452 	key.objectid = bytenr;
453 	if (parent) {
454 		key.type = BTRFS_SHARED_DATA_REF_KEY;
455 		key.offset = parent;
456 	} else {
457 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
458 		key.offset = hash_extent_data_ref(root_objectid,
459 						  owner, offset);
460 	}
461 again:
462 	recow = 0;
463 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
464 	if (ret < 0)
465 		return ret;
466 
467 	if (parent) {
468 		if (ret)
469 			return -ENOENT;
470 		return 0;
471 	}
472 
473 	ret = -ENOENT;
474 	leaf = path->nodes[0];
475 	nritems = btrfs_header_nritems(leaf);
476 	while (1) {
477 		if (path->slots[0] >= nritems) {
478 			ret = btrfs_next_leaf(root, path);
479 			if (ret) {
480 				if (ret > 0)
481 					return -ENOENT;
482 				return ret;
483 			}
484 
485 			leaf = path->nodes[0];
486 			nritems = btrfs_header_nritems(leaf);
487 			recow = 1;
488 		}
489 
490 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
491 		if (key.objectid != bytenr ||
492 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
493 			goto fail;
494 
495 		ref = btrfs_item_ptr(leaf, path->slots[0],
496 				     struct btrfs_extent_data_ref);
497 
498 		if (match_extent_data_ref(leaf, ref, root_objectid,
499 					  owner, offset)) {
500 			if (recow) {
501 				btrfs_release_path(path);
502 				goto again;
503 			}
504 			ret = 0;
505 			break;
506 		}
507 		path->slots[0]++;
508 	}
509 fail:
510 	return ret;
511 }
512 
513 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
514 					   struct btrfs_path *path,
515 					   struct btrfs_delayed_ref_node *node,
516 					   u64 bytenr)
517 {
518 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
519 	struct btrfs_key key;
520 	struct extent_buffer *leaf;
521 	u64 owner = btrfs_delayed_ref_owner(node);
522 	u64 offset = btrfs_delayed_ref_offset(node);
523 	u32 size;
524 	u32 num_refs;
525 	int ret;
526 
527 	key.objectid = bytenr;
528 	if (node->parent) {
529 		key.type = BTRFS_SHARED_DATA_REF_KEY;
530 		key.offset = node->parent;
531 		size = sizeof(struct btrfs_shared_data_ref);
532 	} else {
533 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
534 		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
535 		size = sizeof(struct btrfs_extent_data_ref);
536 	}
537 
538 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
539 	if (ret && ret != -EEXIST)
540 		goto fail;
541 
542 	leaf = path->nodes[0];
543 	if (node->parent) {
544 		struct btrfs_shared_data_ref *ref;
545 		ref = btrfs_item_ptr(leaf, path->slots[0],
546 				     struct btrfs_shared_data_ref);
547 		if (ret == 0) {
548 			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
549 		} else {
550 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
551 			num_refs += node->ref_mod;
552 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
553 		}
554 	} else {
555 		struct btrfs_extent_data_ref *ref;
556 		while (ret == -EEXIST) {
557 			ref = btrfs_item_ptr(leaf, path->slots[0],
558 					     struct btrfs_extent_data_ref);
559 			if (match_extent_data_ref(leaf, ref, node->ref_root,
560 						  owner, offset))
561 				break;
562 			btrfs_release_path(path);
563 			key.offset++;
564 			ret = btrfs_insert_empty_item(trans, root, path, &key,
565 						      size);
566 			if (ret && ret != -EEXIST)
567 				goto fail;
568 
569 			leaf = path->nodes[0];
570 		}
571 		ref = btrfs_item_ptr(leaf, path->slots[0],
572 				     struct btrfs_extent_data_ref);
573 		if (ret == 0) {
574 			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
575 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
576 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
577 			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
578 		} else {
579 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
580 			num_refs += node->ref_mod;
581 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
582 		}
583 	}
584 	btrfs_mark_buffer_dirty(trans, leaf);
585 	ret = 0;
586 fail:
587 	btrfs_release_path(path);
588 	return ret;
589 }
590 
591 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
592 					   struct btrfs_root *root,
593 					   struct btrfs_path *path,
594 					   int refs_to_drop)
595 {
596 	struct btrfs_key key;
597 	struct btrfs_extent_data_ref *ref1 = NULL;
598 	struct btrfs_shared_data_ref *ref2 = NULL;
599 	struct extent_buffer *leaf;
600 	u32 num_refs = 0;
601 	int ret = 0;
602 
603 	leaf = path->nodes[0];
604 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
605 
606 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
607 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
608 				      struct btrfs_extent_data_ref);
609 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
610 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
611 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
612 				      struct btrfs_shared_data_ref);
613 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
614 	} else {
615 		btrfs_err(trans->fs_info,
616 			  "unrecognized backref key (%llu %u %llu)",
617 			  key.objectid, key.type, key.offset);
618 		btrfs_abort_transaction(trans, -EUCLEAN);
619 		return -EUCLEAN;
620 	}
621 
622 	BUG_ON(num_refs < refs_to_drop);
623 	num_refs -= refs_to_drop;
624 
625 	if (num_refs == 0) {
626 		ret = btrfs_del_item(trans, root, path);
627 	} else {
628 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
629 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
630 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
631 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
632 		btrfs_mark_buffer_dirty(trans, leaf);
633 	}
634 	return ret;
635 }
636 
637 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
638 					  struct btrfs_extent_inline_ref *iref)
639 {
640 	struct btrfs_key key;
641 	struct extent_buffer *leaf;
642 	struct btrfs_extent_data_ref *ref1;
643 	struct btrfs_shared_data_ref *ref2;
644 	u32 num_refs = 0;
645 	int type;
646 
647 	leaf = path->nodes[0];
648 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
649 
650 	if (iref) {
651 		/*
652 		 * If type is invalid, we should have bailed out earlier than
653 		 * this call.
654 		 */
655 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
656 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
657 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
658 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
659 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
660 		} else {
661 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
662 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
663 		}
664 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
665 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
666 				      struct btrfs_extent_data_ref);
667 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
668 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
669 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
670 				      struct btrfs_shared_data_ref);
671 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
672 	} else {
673 		WARN_ON(1);
674 	}
675 	return num_refs;
676 }
677 
678 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
679 					  struct btrfs_path *path,
680 					  u64 bytenr, u64 parent,
681 					  u64 root_objectid)
682 {
683 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
684 	struct btrfs_key key;
685 	int ret;
686 
687 	key.objectid = bytenr;
688 	if (parent) {
689 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
690 		key.offset = parent;
691 	} else {
692 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
693 		key.offset = root_objectid;
694 	}
695 
696 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
697 	if (ret > 0)
698 		ret = -ENOENT;
699 	return ret;
700 }
701 
702 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
703 					  struct btrfs_path *path,
704 					  struct btrfs_delayed_ref_node *node,
705 					  u64 bytenr)
706 {
707 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
708 	struct btrfs_key key;
709 	int ret;
710 
711 	key.objectid = bytenr;
712 	if (node->parent) {
713 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
714 		key.offset = node->parent;
715 	} else {
716 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
717 		key.offset = node->ref_root;
718 	}
719 
720 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
721 	btrfs_release_path(path);
722 	return ret;
723 }
724 
725 static inline int extent_ref_type(u64 parent, u64 owner)
726 {
727 	int type;
728 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
729 		if (parent > 0)
730 			type = BTRFS_SHARED_BLOCK_REF_KEY;
731 		else
732 			type = BTRFS_TREE_BLOCK_REF_KEY;
733 	} else {
734 		if (parent > 0)
735 			type = BTRFS_SHARED_DATA_REF_KEY;
736 		else
737 			type = BTRFS_EXTENT_DATA_REF_KEY;
738 	}
739 	return type;
740 }
741 
742 static int find_next_key(struct btrfs_path *path, int level,
743 			 struct btrfs_key *key)
744 
745 {
746 	for (; level < BTRFS_MAX_LEVEL; level++) {
747 		if (!path->nodes[level])
748 			break;
749 		if (path->slots[level] + 1 >=
750 		    btrfs_header_nritems(path->nodes[level]))
751 			continue;
752 		if (level == 0)
753 			btrfs_item_key_to_cpu(path->nodes[level], key,
754 					      path->slots[level] + 1);
755 		else
756 			btrfs_node_key_to_cpu(path->nodes[level], key,
757 					      path->slots[level] + 1);
758 		return 0;
759 	}
760 	return 1;
761 }
762 
763 /*
764  * look for inline back ref. if back ref is found, *ref_ret is set
765  * to the address of inline back ref, and 0 is returned.
766  *
767  * if back ref isn't found, *ref_ret is set to the address where it
768  * should be inserted, and -ENOENT is returned.
769  *
770  * if insert is true and there are too many inline back refs, the path
771  * points to the extent item, and -EAGAIN is returned.
772  *
773  * NOTE: inline back refs are ordered in the same way that back ref
774  *	 items in the tree are ordered.
775  */
776 static noinline_for_stack
777 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
778 				 struct btrfs_path *path,
779 				 struct btrfs_extent_inline_ref **ref_ret,
780 				 u64 bytenr, u64 num_bytes,
781 				 u64 parent, u64 root_objectid,
782 				 u64 owner, u64 offset, int insert)
783 {
784 	struct btrfs_fs_info *fs_info = trans->fs_info;
785 	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
786 	struct btrfs_key key;
787 	struct extent_buffer *leaf;
788 	struct btrfs_extent_item *ei;
789 	struct btrfs_extent_inline_ref *iref;
790 	u64 flags;
791 	u64 item_size;
792 	unsigned long ptr;
793 	unsigned long end;
794 	int extra_size;
795 	int type;
796 	int want;
797 	int ret;
798 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
799 	int needed;
800 
801 	key.objectid = bytenr;
802 	key.type = BTRFS_EXTENT_ITEM_KEY;
803 	key.offset = num_bytes;
804 
805 	want = extent_ref_type(parent, owner);
806 	if (insert) {
807 		extra_size = btrfs_extent_inline_ref_size(want);
808 		path->search_for_extension = 1;
809 		path->keep_locks = 1;
810 	} else
811 		extra_size = -1;
812 
813 	/*
814 	 * Owner is our level, so we can just add one to get the level for the
815 	 * block we are interested in.
816 	 */
817 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
818 		key.type = BTRFS_METADATA_ITEM_KEY;
819 		key.offset = owner;
820 	}
821 
822 again:
823 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
824 	if (ret < 0)
825 		goto out;
826 
827 	/*
828 	 * We may be a newly converted file system which still has the old fat
829 	 * extent entries for metadata, so try and see if we have one of those.
830 	 */
831 	if (ret > 0 && skinny_metadata) {
832 		skinny_metadata = false;
833 		if (path->slots[0]) {
834 			path->slots[0]--;
835 			btrfs_item_key_to_cpu(path->nodes[0], &key,
836 					      path->slots[0]);
837 			if (key.objectid == bytenr &&
838 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
839 			    key.offset == num_bytes)
840 				ret = 0;
841 		}
842 		if (ret) {
843 			key.objectid = bytenr;
844 			key.type = BTRFS_EXTENT_ITEM_KEY;
845 			key.offset = num_bytes;
846 			btrfs_release_path(path);
847 			goto again;
848 		}
849 	}
850 
851 	if (ret && !insert) {
852 		ret = -ENOENT;
853 		goto out;
854 	} else if (WARN_ON(ret)) {
855 		btrfs_print_leaf(path->nodes[0]);
856 		btrfs_err(fs_info,
857 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
858 			  bytenr, num_bytes, parent, root_objectid, owner,
859 			  offset);
860 		ret = -EUCLEAN;
861 		goto out;
862 	}
863 
864 	leaf = path->nodes[0];
865 	item_size = btrfs_item_size(leaf, path->slots[0]);
866 	if (unlikely(item_size < sizeof(*ei))) {
867 		ret = -EUCLEAN;
868 		btrfs_err(fs_info,
869 			  "unexpected extent item size, has %llu expect >= %zu",
870 			  item_size, sizeof(*ei));
871 		btrfs_abort_transaction(trans, ret);
872 		goto out;
873 	}
874 
875 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
876 	flags = btrfs_extent_flags(leaf, ei);
877 
878 	ptr = (unsigned long)(ei + 1);
879 	end = (unsigned long)ei + item_size;
880 
881 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
882 		ptr += sizeof(struct btrfs_tree_block_info);
883 		BUG_ON(ptr > end);
884 	}
885 
886 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
887 		needed = BTRFS_REF_TYPE_DATA;
888 	else
889 		needed = BTRFS_REF_TYPE_BLOCK;
890 
891 	ret = -ENOENT;
892 	while (ptr < end) {
893 		iref = (struct btrfs_extent_inline_ref *)ptr;
894 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
895 		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
896 			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
897 			ptr += btrfs_extent_inline_ref_size(type);
898 			continue;
899 		}
900 		if (type == BTRFS_REF_TYPE_INVALID) {
901 			ret = -EUCLEAN;
902 			goto out;
903 		}
904 
905 		if (want < type)
906 			break;
907 		if (want > type) {
908 			ptr += btrfs_extent_inline_ref_size(type);
909 			continue;
910 		}
911 
912 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
913 			struct btrfs_extent_data_ref *dref;
914 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
915 			if (match_extent_data_ref(leaf, dref, root_objectid,
916 						  owner, offset)) {
917 				ret = 0;
918 				break;
919 			}
920 			if (hash_extent_data_ref_item(leaf, dref) <
921 			    hash_extent_data_ref(root_objectid, owner, offset))
922 				break;
923 		} else {
924 			u64 ref_offset;
925 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
926 			if (parent > 0) {
927 				if (parent == ref_offset) {
928 					ret = 0;
929 					break;
930 				}
931 				if (ref_offset < parent)
932 					break;
933 			} else {
934 				if (root_objectid == ref_offset) {
935 					ret = 0;
936 					break;
937 				}
938 				if (ref_offset < root_objectid)
939 					break;
940 			}
941 		}
942 		ptr += btrfs_extent_inline_ref_size(type);
943 	}
944 
945 	if (unlikely(ptr > end)) {
946 		ret = -EUCLEAN;
947 		btrfs_print_leaf(path->nodes[0]);
948 		btrfs_crit(fs_info,
949 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
950 			   path->slots[0], root_objectid, owner, offset, parent);
951 		goto out;
952 	}
953 
954 	if (ret == -ENOENT && insert) {
955 		if (item_size + extra_size >=
956 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
957 			ret = -EAGAIN;
958 			goto out;
959 		}
960 		/*
961 		 * To add new inline back ref, we have to make sure
962 		 * there is no corresponding back ref item.
963 		 * For simplicity, we just do not add new inline back
964 		 * ref if there is any kind of item for this block
965 		 */
966 		if (find_next_key(path, 0, &key) == 0 &&
967 		    key.objectid == bytenr &&
968 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
969 			ret = -EAGAIN;
970 			goto out;
971 		}
972 	}
973 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
974 out:
975 	if (insert) {
976 		path->keep_locks = 0;
977 		path->search_for_extension = 0;
978 		btrfs_unlock_up_safe(path, 1);
979 	}
980 	return ret;
981 }
982 
983 /*
984  * helper to add new inline back ref
985  */
986 static noinline_for_stack
987 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
988 				 struct btrfs_path *path,
989 				 struct btrfs_extent_inline_ref *iref,
990 				 u64 parent, u64 root_objectid,
991 				 u64 owner, u64 offset, int refs_to_add,
992 				 struct btrfs_delayed_extent_op *extent_op)
993 {
994 	struct extent_buffer *leaf;
995 	struct btrfs_extent_item *ei;
996 	unsigned long ptr;
997 	unsigned long end;
998 	unsigned long item_offset;
999 	u64 refs;
1000 	int size;
1001 	int type;
1002 
1003 	leaf = path->nodes[0];
1004 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1005 	item_offset = (unsigned long)iref - (unsigned long)ei;
1006 
1007 	type = extent_ref_type(parent, owner);
1008 	size = btrfs_extent_inline_ref_size(type);
1009 
1010 	btrfs_extend_item(trans, path, size);
1011 
1012 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1013 	refs = btrfs_extent_refs(leaf, ei);
1014 	refs += refs_to_add;
1015 	btrfs_set_extent_refs(leaf, ei, refs);
1016 	if (extent_op)
1017 		__run_delayed_extent_op(extent_op, leaf, ei);
1018 
1019 	ptr = (unsigned long)ei + item_offset;
1020 	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1021 	if (ptr < end - size)
1022 		memmove_extent_buffer(leaf, ptr + size, ptr,
1023 				      end - size - ptr);
1024 
1025 	iref = (struct btrfs_extent_inline_ref *)ptr;
1026 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1027 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1028 		struct btrfs_extent_data_ref *dref;
1029 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1030 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1031 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1032 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1033 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1034 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1035 		struct btrfs_shared_data_ref *sref;
1036 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1037 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1038 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1039 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1040 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1041 	} else {
1042 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1043 	}
1044 	btrfs_mark_buffer_dirty(trans, leaf);
1045 }
1046 
1047 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1048 				 struct btrfs_path *path,
1049 				 struct btrfs_extent_inline_ref **ref_ret,
1050 				 u64 bytenr, u64 num_bytes, u64 parent,
1051 				 u64 root_objectid, u64 owner, u64 offset)
1052 {
1053 	int ret;
1054 
1055 	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1056 					   num_bytes, parent, root_objectid,
1057 					   owner, offset, 0);
1058 	if (ret != -ENOENT)
1059 		return ret;
1060 
1061 	btrfs_release_path(path);
1062 	*ref_ret = NULL;
1063 
1064 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1065 		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1066 					    root_objectid);
1067 	} else {
1068 		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1069 					     root_objectid, owner, offset);
1070 	}
1071 	return ret;
1072 }
1073 
1074 /*
1075  * helper to update/remove inline back ref
1076  */
1077 static noinline_for_stack int update_inline_extent_backref(
1078 				  struct btrfs_trans_handle *trans,
1079 				  struct btrfs_path *path,
1080 				  struct btrfs_extent_inline_ref *iref,
1081 				  int refs_to_mod,
1082 				  struct btrfs_delayed_extent_op *extent_op)
1083 {
1084 	struct extent_buffer *leaf = path->nodes[0];
1085 	struct btrfs_fs_info *fs_info = leaf->fs_info;
1086 	struct btrfs_extent_item *ei;
1087 	struct btrfs_extent_data_ref *dref = NULL;
1088 	struct btrfs_shared_data_ref *sref = NULL;
1089 	unsigned long ptr;
1090 	unsigned long end;
1091 	u32 item_size;
1092 	int size;
1093 	int type;
1094 	u64 refs;
1095 
1096 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1097 	refs = btrfs_extent_refs(leaf, ei);
1098 	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1099 		struct btrfs_key key;
1100 		u32 extent_size;
1101 
1102 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1103 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1104 			extent_size = fs_info->nodesize;
1105 		else
1106 			extent_size = key.offset;
1107 		btrfs_print_leaf(leaf);
1108 		btrfs_err(fs_info,
1109 	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1110 			  key.objectid, extent_size, refs_to_mod, refs);
1111 		return -EUCLEAN;
1112 	}
1113 	refs += refs_to_mod;
1114 	btrfs_set_extent_refs(leaf, ei, refs);
1115 	if (extent_op)
1116 		__run_delayed_extent_op(extent_op, leaf, ei);
1117 
1118 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1119 	/*
1120 	 * Function btrfs_get_extent_inline_ref_type() has already printed
1121 	 * error messages.
1122 	 */
1123 	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1124 		return -EUCLEAN;
1125 
1126 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1127 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1128 		refs = btrfs_extent_data_ref_count(leaf, dref);
1129 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1130 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1131 		refs = btrfs_shared_data_ref_count(leaf, sref);
1132 	} else {
1133 		refs = 1;
1134 		/*
1135 		 * For tree blocks we can only drop one ref for it, and tree
1136 		 * blocks should not have refs > 1.
1137 		 *
1138 		 * Furthermore if we're inserting a new inline backref, we
1139 		 * won't reach this path either. That would be
1140 		 * setup_inline_extent_backref().
1141 		 */
1142 		if (unlikely(refs_to_mod != -1)) {
1143 			struct btrfs_key key;
1144 
1145 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1146 
1147 			btrfs_print_leaf(leaf);
1148 			btrfs_err(fs_info,
1149 			"invalid refs_to_mod for tree block %llu, has %d expect -1",
1150 				  key.objectid, refs_to_mod);
1151 			return -EUCLEAN;
1152 		}
1153 	}
1154 
1155 	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1156 		struct btrfs_key key;
1157 		u32 extent_size;
1158 
1159 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1160 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1161 			extent_size = fs_info->nodesize;
1162 		else
1163 			extent_size = key.offset;
1164 		btrfs_print_leaf(leaf);
1165 		btrfs_err(fs_info,
1166 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1167 			  (unsigned long)iref, key.objectid, extent_size,
1168 			  refs_to_mod, refs);
1169 		return -EUCLEAN;
1170 	}
1171 	refs += refs_to_mod;
1172 
1173 	if (refs > 0) {
1174 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1175 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1176 		else
1177 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1178 	} else {
1179 		size =  btrfs_extent_inline_ref_size(type);
1180 		item_size = btrfs_item_size(leaf, path->slots[0]);
1181 		ptr = (unsigned long)iref;
1182 		end = (unsigned long)ei + item_size;
1183 		if (ptr + size < end)
1184 			memmove_extent_buffer(leaf, ptr, ptr + size,
1185 					      end - ptr - size);
1186 		item_size -= size;
1187 		btrfs_truncate_item(trans, path, item_size, 1);
1188 	}
1189 	btrfs_mark_buffer_dirty(trans, leaf);
1190 	return 0;
1191 }
1192 
1193 static noinline_for_stack
1194 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1195 				 struct btrfs_path *path,
1196 				 u64 bytenr, u64 num_bytes, u64 parent,
1197 				 u64 root_objectid, u64 owner,
1198 				 u64 offset, int refs_to_add,
1199 				 struct btrfs_delayed_extent_op *extent_op)
1200 {
1201 	struct btrfs_extent_inline_ref *iref;
1202 	int ret;
1203 
1204 	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1205 					   num_bytes, parent, root_objectid,
1206 					   owner, offset, 1);
1207 	if (ret == 0) {
1208 		/*
1209 		 * We're adding refs to a tree block we already own, this
1210 		 * should not happen at all.
1211 		 */
1212 		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1213 			btrfs_print_leaf(path->nodes[0]);
1214 			btrfs_crit(trans->fs_info,
1215 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1216 				   bytenr, num_bytes, root_objectid, path->slots[0]);
1217 			return -EUCLEAN;
1218 		}
1219 		ret = update_inline_extent_backref(trans, path, iref,
1220 						   refs_to_add, extent_op);
1221 	} else if (ret == -ENOENT) {
1222 		setup_inline_extent_backref(trans, path, iref, parent,
1223 					    root_objectid, owner, offset,
1224 					    refs_to_add, extent_op);
1225 		ret = 0;
1226 	}
1227 	return ret;
1228 }
1229 
1230 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1231 				 struct btrfs_root *root,
1232 				 struct btrfs_path *path,
1233 				 struct btrfs_extent_inline_ref *iref,
1234 				 int refs_to_drop, int is_data)
1235 {
1236 	int ret = 0;
1237 
1238 	BUG_ON(!is_data && refs_to_drop != 1);
1239 	if (iref)
1240 		ret = update_inline_extent_backref(trans, path, iref,
1241 						   -refs_to_drop, NULL);
1242 	else if (is_data)
1243 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1244 	else
1245 		ret = btrfs_del_item(trans, root, path);
1246 	return ret;
1247 }
1248 
1249 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1250 			       u64 *discarded_bytes)
1251 {
1252 	int j, ret = 0;
1253 	u64 bytes_left, end;
1254 	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1255 
1256 	/* Adjust the range to be aligned to 512B sectors if necessary. */
1257 	if (start != aligned_start) {
1258 		len -= aligned_start - start;
1259 		len = round_down(len, 1 << SECTOR_SHIFT);
1260 		start = aligned_start;
1261 	}
1262 
1263 	*discarded_bytes = 0;
1264 
1265 	if (!len)
1266 		return 0;
1267 
1268 	end = start + len;
1269 	bytes_left = len;
1270 
1271 	/* Skip any superblocks on this device. */
1272 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1273 		u64 sb_start = btrfs_sb_offset(j);
1274 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1275 		u64 size = sb_start - start;
1276 
1277 		if (!in_range(sb_start, start, bytes_left) &&
1278 		    !in_range(sb_end, start, bytes_left) &&
1279 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1280 			continue;
1281 
1282 		/*
1283 		 * Superblock spans beginning of range.  Adjust start and
1284 		 * try again.
1285 		 */
1286 		if (sb_start <= start) {
1287 			start += sb_end - start;
1288 			if (start > end) {
1289 				bytes_left = 0;
1290 				break;
1291 			}
1292 			bytes_left = end - start;
1293 			continue;
1294 		}
1295 
1296 		if (size) {
1297 			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1298 						   size >> SECTOR_SHIFT,
1299 						   GFP_NOFS);
1300 			if (!ret)
1301 				*discarded_bytes += size;
1302 			else if (ret != -EOPNOTSUPP)
1303 				return ret;
1304 		}
1305 
1306 		start = sb_end;
1307 		if (start > end) {
1308 			bytes_left = 0;
1309 			break;
1310 		}
1311 		bytes_left = end - start;
1312 	}
1313 
1314 	if (bytes_left) {
1315 		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1316 					   bytes_left >> SECTOR_SHIFT,
1317 					   GFP_NOFS);
1318 		if (!ret)
1319 			*discarded_bytes += bytes_left;
1320 	}
1321 	return ret;
1322 }
1323 
1324 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1325 {
1326 	struct btrfs_device *dev = stripe->dev;
1327 	struct btrfs_fs_info *fs_info = dev->fs_info;
1328 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1329 	u64 phys = stripe->physical;
1330 	u64 len = stripe->length;
1331 	u64 discarded = 0;
1332 	int ret = 0;
1333 
1334 	/* Zone reset on a zoned filesystem */
1335 	if (btrfs_can_zone_reset(dev, phys, len)) {
1336 		u64 src_disc;
1337 
1338 		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1339 		if (ret)
1340 			goto out;
1341 
1342 		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1343 		    dev != dev_replace->srcdev)
1344 			goto out;
1345 
1346 		src_disc = discarded;
1347 
1348 		/* Send to replace target as well */
1349 		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1350 					      &discarded);
1351 		discarded += src_disc;
1352 	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1353 		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1354 	} else {
1355 		ret = 0;
1356 		*bytes = 0;
1357 	}
1358 
1359 out:
1360 	*bytes = discarded;
1361 	return ret;
1362 }
1363 
1364 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1365 			 u64 num_bytes, u64 *actual_bytes)
1366 {
1367 	int ret = 0;
1368 	u64 discarded_bytes = 0;
1369 	u64 end = bytenr + num_bytes;
1370 	u64 cur = bytenr;
1371 
1372 	/*
1373 	 * Avoid races with device replace and make sure the devices in the
1374 	 * stripes don't go away while we are discarding.
1375 	 */
1376 	btrfs_bio_counter_inc_blocked(fs_info);
1377 	while (cur < end) {
1378 		struct btrfs_discard_stripe *stripes;
1379 		unsigned int num_stripes;
1380 		int i;
1381 
1382 		num_bytes = end - cur;
1383 		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1384 		if (IS_ERR(stripes)) {
1385 			ret = PTR_ERR(stripes);
1386 			if (ret == -EOPNOTSUPP)
1387 				ret = 0;
1388 			break;
1389 		}
1390 
1391 		for (i = 0; i < num_stripes; i++) {
1392 			struct btrfs_discard_stripe *stripe = stripes + i;
1393 			u64 bytes;
1394 
1395 			if (!stripe->dev->bdev) {
1396 				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1397 				continue;
1398 			}
1399 
1400 			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1401 					&stripe->dev->dev_state))
1402 				continue;
1403 
1404 			ret = do_discard_extent(stripe, &bytes);
1405 			if (ret) {
1406 				/*
1407 				 * Keep going if discard is not supported by the
1408 				 * device.
1409 				 */
1410 				if (ret != -EOPNOTSUPP)
1411 					break;
1412 				ret = 0;
1413 			} else {
1414 				discarded_bytes += bytes;
1415 			}
1416 		}
1417 		kfree(stripes);
1418 		if (ret)
1419 			break;
1420 		cur += num_bytes;
1421 	}
1422 	btrfs_bio_counter_dec(fs_info);
1423 	if (actual_bytes)
1424 		*actual_bytes = discarded_bytes;
1425 	return ret;
1426 }
1427 
1428 /* Can return -ENOMEM */
1429 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1430 			 struct btrfs_ref *generic_ref)
1431 {
1432 	struct btrfs_fs_info *fs_info = trans->fs_info;
1433 	int ret;
1434 
1435 	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1436 	       generic_ref->action);
1437 	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1438 	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
1439 
1440 	if (generic_ref->type == BTRFS_REF_METADATA)
1441 		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1442 	else
1443 		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1444 
1445 	btrfs_ref_tree_mod(fs_info, generic_ref);
1446 
1447 	return ret;
1448 }
1449 
1450 /*
1451  * Insert backreference for a given extent.
1452  *
1453  * The counterpart is in __btrfs_free_extent(), with examples and more details
1454  * how it works.
1455  *
1456  * @trans:	    Handle of transaction
1457  *
1458  * @node:	    The delayed ref node used to get the bytenr/length for
1459  *		    extent whose references are incremented.
1460  *
1461  * @extent_op       Pointer to a structure, holding information necessary when
1462  *                  updating a tree block's flags
1463  *
1464  */
1465 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1466 				  struct btrfs_delayed_ref_node *node,
1467 				  struct btrfs_delayed_extent_op *extent_op)
1468 {
1469 	struct btrfs_path *path;
1470 	struct extent_buffer *leaf;
1471 	struct btrfs_extent_item *item;
1472 	struct btrfs_key key;
1473 	u64 bytenr = node->bytenr;
1474 	u64 num_bytes = node->num_bytes;
1475 	u64 owner = btrfs_delayed_ref_owner(node);
1476 	u64 offset = btrfs_delayed_ref_offset(node);
1477 	u64 refs;
1478 	int refs_to_add = node->ref_mod;
1479 	int ret;
1480 
1481 	path = btrfs_alloc_path();
1482 	if (!path)
1483 		return -ENOMEM;
1484 
1485 	/* this will setup the path even if it fails to insert the back ref */
1486 	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1487 					   node->parent, node->ref_root, owner,
1488 					   offset, refs_to_add, extent_op);
1489 	if ((ret < 0 && ret != -EAGAIN) || !ret)
1490 		goto out;
1491 
1492 	/*
1493 	 * Ok we had -EAGAIN which means we didn't have space to insert and
1494 	 * inline extent ref, so just update the reference count and add a
1495 	 * normal backref.
1496 	 */
1497 	leaf = path->nodes[0];
1498 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1499 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1500 	refs = btrfs_extent_refs(leaf, item);
1501 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1502 	if (extent_op)
1503 		__run_delayed_extent_op(extent_op, leaf, item);
1504 
1505 	btrfs_mark_buffer_dirty(trans, leaf);
1506 	btrfs_release_path(path);
1507 
1508 	/* now insert the actual backref */
1509 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1510 		ret = insert_tree_block_ref(trans, path, node, bytenr);
1511 	else
1512 		ret = insert_extent_data_ref(trans, path, node, bytenr);
1513 
1514 	if (ret)
1515 		btrfs_abort_transaction(trans, ret);
1516 out:
1517 	btrfs_free_path(path);
1518 	return ret;
1519 }
1520 
1521 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1522 				     struct btrfs_delayed_ref_head *href)
1523 {
1524 	u64 root = href->owning_root;
1525 
1526 	/*
1527 	 * Don't check must_insert_reserved, as this is called from contexts
1528 	 * where it has already been unset.
1529 	 */
1530 	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1531 	    !href->is_data || !is_fstree(root))
1532 		return;
1533 
1534 	btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1535 				  BTRFS_QGROUP_RSV_DATA);
1536 }
1537 
1538 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1539 				struct btrfs_delayed_ref_head *href,
1540 				struct btrfs_delayed_ref_node *node,
1541 				struct btrfs_delayed_extent_op *extent_op,
1542 				bool insert_reserved)
1543 {
1544 	int ret = 0;
1545 	u64 parent = 0;
1546 	u64 flags = 0;
1547 
1548 	trace_run_delayed_data_ref(trans->fs_info, node);
1549 
1550 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1551 		parent = node->parent;
1552 
1553 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1554 		struct btrfs_key key;
1555 		struct btrfs_squota_delta delta = {
1556 			.root = href->owning_root,
1557 			.num_bytes = node->num_bytes,
1558 			.is_data = true,
1559 			.is_inc	= true,
1560 			.generation = trans->transid,
1561 		};
1562 		u64 owner = btrfs_delayed_ref_owner(node);
1563 		u64 offset = btrfs_delayed_ref_offset(node);
1564 
1565 		if (extent_op)
1566 			flags |= extent_op->flags_to_set;
1567 
1568 		key.objectid = node->bytenr;
1569 		key.type = BTRFS_EXTENT_ITEM_KEY;
1570 		key.offset = node->num_bytes;
1571 
1572 		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
1573 						 flags, owner, offset, &key,
1574 						 node->ref_mod,
1575 						 href->owning_root);
1576 		free_head_ref_squota_rsv(trans->fs_info, href);
1577 		if (!ret)
1578 			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
1579 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1580 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1581 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1582 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1583 	} else {
1584 		BUG();
1585 	}
1586 	return ret;
1587 }
1588 
1589 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1590 				    struct extent_buffer *leaf,
1591 				    struct btrfs_extent_item *ei)
1592 {
1593 	u64 flags = btrfs_extent_flags(leaf, ei);
1594 	if (extent_op->update_flags) {
1595 		flags |= extent_op->flags_to_set;
1596 		btrfs_set_extent_flags(leaf, ei, flags);
1597 	}
1598 
1599 	if (extent_op->update_key) {
1600 		struct btrfs_tree_block_info *bi;
1601 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1602 		bi = (struct btrfs_tree_block_info *)(ei + 1);
1603 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1604 	}
1605 }
1606 
1607 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1608 				 struct btrfs_delayed_ref_head *head,
1609 				 struct btrfs_delayed_extent_op *extent_op)
1610 {
1611 	struct btrfs_fs_info *fs_info = trans->fs_info;
1612 	struct btrfs_root *root;
1613 	struct btrfs_key key;
1614 	struct btrfs_path *path;
1615 	struct btrfs_extent_item *ei;
1616 	struct extent_buffer *leaf;
1617 	u32 item_size;
1618 	int ret;
1619 	int metadata = 1;
1620 
1621 	if (TRANS_ABORTED(trans))
1622 		return 0;
1623 
1624 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1625 		metadata = 0;
1626 
1627 	path = btrfs_alloc_path();
1628 	if (!path)
1629 		return -ENOMEM;
1630 
1631 	key.objectid = head->bytenr;
1632 
1633 	if (metadata) {
1634 		key.type = BTRFS_METADATA_ITEM_KEY;
1635 		key.offset = extent_op->level;
1636 	} else {
1637 		key.type = BTRFS_EXTENT_ITEM_KEY;
1638 		key.offset = head->num_bytes;
1639 	}
1640 
1641 	root = btrfs_extent_root(fs_info, key.objectid);
1642 again:
1643 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1644 	if (ret < 0) {
1645 		goto out;
1646 	} else if (ret > 0) {
1647 		if (metadata) {
1648 			if (path->slots[0] > 0) {
1649 				path->slots[0]--;
1650 				btrfs_item_key_to_cpu(path->nodes[0], &key,
1651 						      path->slots[0]);
1652 				if (key.objectid == head->bytenr &&
1653 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1654 				    key.offset == head->num_bytes)
1655 					ret = 0;
1656 			}
1657 			if (ret > 0) {
1658 				btrfs_release_path(path);
1659 				metadata = 0;
1660 
1661 				key.objectid = head->bytenr;
1662 				key.offset = head->num_bytes;
1663 				key.type = BTRFS_EXTENT_ITEM_KEY;
1664 				goto again;
1665 			}
1666 		} else {
1667 			ret = -EUCLEAN;
1668 			btrfs_err(fs_info,
1669 		  "missing extent item for extent %llu num_bytes %llu level %d",
1670 				  head->bytenr, head->num_bytes, extent_op->level);
1671 			goto out;
1672 		}
1673 	}
1674 
1675 	leaf = path->nodes[0];
1676 	item_size = btrfs_item_size(leaf, path->slots[0]);
1677 
1678 	if (unlikely(item_size < sizeof(*ei))) {
1679 		ret = -EUCLEAN;
1680 		btrfs_err(fs_info,
1681 			  "unexpected extent item size, has %u expect >= %zu",
1682 			  item_size, sizeof(*ei));
1683 		btrfs_abort_transaction(trans, ret);
1684 		goto out;
1685 	}
1686 
1687 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1688 	__run_delayed_extent_op(extent_op, leaf, ei);
1689 
1690 	btrfs_mark_buffer_dirty(trans, leaf);
1691 out:
1692 	btrfs_free_path(path);
1693 	return ret;
1694 }
1695 
1696 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1697 				struct btrfs_delayed_ref_head *href,
1698 				struct btrfs_delayed_ref_node *node,
1699 				struct btrfs_delayed_extent_op *extent_op,
1700 				bool insert_reserved)
1701 {
1702 	int ret = 0;
1703 	struct btrfs_fs_info *fs_info = trans->fs_info;
1704 	u64 parent = 0;
1705 	u64 ref_root = 0;
1706 
1707 	trace_run_delayed_tree_ref(trans->fs_info, node);
1708 
1709 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1710 		parent = node->parent;
1711 	ref_root = node->ref_root;
1712 
1713 	if (unlikely(node->ref_mod != 1)) {
1714 		btrfs_err(trans->fs_info,
1715 	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1716 			  node->bytenr, node->ref_mod, node->action, ref_root,
1717 			  parent);
1718 		return -EUCLEAN;
1719 	}
1720 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1721 		struct btrfs_squota_delta delta = {
1722 			.root = href->owning_root,
1723 			.num_bytes = fs_info->nodesize,
1724 			.is_data = false,
1725 			.is_inc = true,
1726 			.generation = trans->transid,
1727 		};
1728 
1729 		BUG_ON(!extent_op || !extent_op->update_flags);
1730 		ret = alloc_reserved_tree_block(trans, node, extent_op);
1731 		if (!ret)
1732 			btrfs_record_squota_delta(fs_info, &delta);
1733 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1734 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1735 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1736 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1737 	} else {
1738 		BUG();
1739 	}
1740 	return ret;
1741 }
1742 
1743 /* helper function to actually process a single delayed ref entry */
1744 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1745 			       struct btrfs_delayed_ref_head *href,
1746 			       struct btrfs_delayed_ref_node *node,
1747 			       struct btrfs_delayed_extent_op *extent_op,
1748 			       bool insert_reserved)
1749 {
1750 	int ret = 0;
1751 
1752 	if (TRANS_ABORTED(trans)) {
1753 		if (insert_reserved) {
1754 			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1755 			free_head_ref_squota_rsv(trans->fs_info, href);
1756 		}
1757 		return 0;
1758 	}
1759 
1760 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1761 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1762 		ret = run_delayed_tree_ref(trans, href, node, extent_op,
1763 					   insert_reserved);
1764 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1765 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1766 		ret = run_delayed_data_ref(trans, href, node, extent_op,
1767 					   insert_reserved);
1768 	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1769 		ret = 0;
1770 	else
1771 		BUG();
1772 	if (ret && insert_reserved)
1773 		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1774 	if (ret < 0)
1775 		btrfs_err(trans->fs_info,
1776 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1777 			  node->bytenr, node->num_bytes, node->type,
1778 			  node->action, node->ref_mod, ret);
1779 	return ret;
1780 }
1781 
1782 static inline struct btrfs_delayed_ref_node *
1783 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1784 {
1785 	struct btrfs_delayed_ref_node *ref;
1786 
1787 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1788 		return NULL;
1789 
1790 	/*
1791 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1792 	 * This is to prevent a ref count from going down to zero, which deletes
1793 	 * the extent item from the extent tree, when there still are references
1794 	 * to add, which would fail because they would not find the extent item.
1795 	 */
1796 	if (!list_empty(&head->ref_add_list))
1797 		return list_first_entry(&head->ref_add_list,
1798 				struct btrfs_delayed_ref_node, add_list);
1799 
1800 	ref = rb_entry(rb_first_cached(&head->ref_tree),
1801 		       struct btrfs_delayed_ref_node, ref_node);
1802 	ASSERT(list_empty(&ref->add_list));
1803 	return ref;
1804 }
1805 
1806 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1807 				      struct btrfs_delayed_ref_head *head)
1808 {
1809 	spin_lock(&delayed_refs->lock);
1810 	head->processing = false;
1811 	delayed_refs->num_heads_ready++;
1812 	spin_unlock(&delayed_refs->lock);
1813 	btrfs_delayed_ref_unlock(head);
1814 }
1815 
1816 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1817 				struct btrfs_delayed_ref_head *head)
1818 {
1819 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1820 
1821 	if (!extent_op)
1822 		return NULL;
1823 
1824 	if (head->must_insert_reserved) {
1825 		head->extent_op = NULL;
1826 		btrfs_free_delayed_extent_op(extent_op);
1827 		return NULL;
1828 	}
1829 	return extent_op;
1830 }
1831 
1832 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1833 				     struct btrfs_delayed_ref_head *head)
1834 {
1835 	struct btrfs_delayed_extent_op *extent_op;
1836 	int ret;
1837 
1838 	extent_op = cleanup_extent_op(head);
1839 	if (!extent_op)
1840 		return 0;
1841 	head->extent_op = NULL;
1842 	spin_unlock(&head->lock);
1843 	ret = run_delayed_extent_op(trans, head, extent_op);
1844 	btrfs_free_delayed_extent_op(extent_op);
1845 	return ret ? ret : 1;
1846 }
1847 
1848 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1849 				  struct btrfs_delayed_ref_root *delayed_refs,
1850 				  struct btrfs_delayed_ref_head *head)
1851 {
1852 	u64 ret = 0;
1853 
1854 	/*
1855 	 * We had csum deletions accounted for in our delayed refs rsv, we need
1856 	 * to drop the csum leaves for this update from our delayed_refs_rsv.
1857 	 */
1858 	if (head->total_ref_mod < 0 && head->is_data) {
1859 		int nr_csums;
1860 
1861 		spin_lock(&delayed_refs->lock);
1862 		delayed_refs->pending_csums -= head->num_bytes;
1863 		spin_unlock(&delayed_refs->lock);
1864 		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1865 
1866 		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
1867 
1868 		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
1869 	}
1870 	/* must_insert_reserved can be set only if we didn't run the head ref. */
1871 	if (head->must_insert_reserved)
1872 		free_head_ref_squota_rsv(fs_info, head);
1873 
1874 	return ret;
1875 }
1876 
1877 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1878 			    struct btrfs_delayed_ref_head *head,
1879 			    u64 *bytes_released)
1880 {
1881 
1882 	struct btrfs_fs_info *fs_info = trans->fs_info;
1883 	struct btrfs_delayed_ref_root *delayed_refs;
1884 	int ret;
1885 
1886 	delayed_refs = &trans->transaction->delayed_refs;
1887 
1888 	ret = run_and_cleanup_extent_op(trans, head);
1889 	if (ret < 0) {
1890 		unselect_delayed_ref_head(delayed_refs, head);
1891 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1892 		return ret;
1893 	} else if (ret) {
1894 		return ret;
1895 	}
1896 
1897 	/*
1898 	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1899 	 * and then re-check to make sure nobody got added.
1900 	 */
1901 	spin_unlock(&head->lock);
1902 	spin_lock(&delayed_refs->lock);
1903 	spin_lock(&head->lock);
1904 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1905 		spin_unlock(&head->lock);
1906 		spin_unlock(&delayed_refs->lock);
1907 		return 1;
1908 	}
1909 	btrfs_delete_ref_head(delayed_refs, head);
1910 	spin_unlock(&head->lock);
1911 	spin_unlock(&delayed_refs->lock);
1912 
1913 	if (head->must_insert_reserved) {
1914 		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1915 		if (head->is_data) {
1916 			struct btrfs_root *csum_root;
1917 
1918 			csum_root = btrfs_csum_root(fs_info, head->bytenr);
1919 			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1920 					      head->num_bytes);
1921 		}
1922 	}
1923 
1924 	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1925 
1926 	trace_run_delayed_ref_head(fs_info, head, 0);
1927 	btrfs_delayed_ref_unlock(head);
1928 	btrfs_put_delayed_ref_head(head);
1929 	return ret;
1930 }
1931 
1932 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1933 					struct btrfs_trans_handle *trans)
1934 {
1935 	struct btrfs_delayed_ref_root *delayed_refs =
1936 		&trans->transaction->delayed_refs;
1937 	struct btrfs_delayed_ref_head *head = NULL;
1938 	int ret;
1939 
1940 	spin_lock(&delayed_refs->lock);
1941 	head = btrfs_select_ref_head(delayed_refs);
1942 	if (!head) {
1943 		spin_unlock(&delayed_refs->lock);
1944 		return head;
1945 	}
1946 
1947 	/*
1948 	 * Grab the lock that says we are going to process all the refs for
1949 	 * this head
1950 	 */
1951 	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1952 	spin_unlock(&delayed_refs->lock);
1953 
1954 	/*
1955 	 * We may have dropped the spin lock to get the head mutex lock, and
1956 	 * that might have given someone else time to free the head.  If that's
1957 	 * true, it has been removed from our list and we can move on.
1958 	 */
1959 	if (ret == -EAGAIN)
1960 		head = ERR_PTR(-EAGAIN);
1961 
1962 	return head;
1963 }
1964 
1965 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1966 					   struct btrfs_delayed_ref_head *locked_ref,
1967 					   u64 *bytes_released)
1968 {
1969 	struct btrfs_fs_info *fs_info = trans->fs_info;
1970 	struct btrfs_delayed_ref_root *delayed_refs;
1971 	struct btrfs_delayed_extent_op *extent_op;
1972 	struct btrfs_delayed_ref_node *ref;
1973 	bool must_insert_reserved;
1974 	int ret;
1975 
1976 	delayed_refs = &trans->transaction->delayed_refs;
1977 
1978 	lockdep_assert_held(&locked_ref->mutex);
1979 	lockdep_assert_held(&locked_ref->lock);
1980 
1981 	while ((ref = select_delayed_ref(locked_ref))) {
1982 		if (ref->seq &&
1983 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1984 			spin_unlock(&locked_ref->lock);
1985 			unselect_delayed_ref_head(delayed_refs, locked_ref);
1986 			return -EAGAIN;
1987 		}
1988 
1989 		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1990 		RB_CLEAR_NODE(&ref->ref_node);
1991 		if (!list_empty(&ref->add_list))
1992 			list_del(&ref->add_list);
1993 		/*
1994 		 * When we play the delayed ref, also correct the ref_mod on
1995 		 * head
1996 		 */
1997 		switch (ref->action) {
1998 		case BTRFS_ADD_DELAYED_REF:
1999 		case BTRFS_ADD_DELAYED_EXTENT:
2000 			locked_ref->ref_mod -= ref->ref_mod;
2001 			break;
2002 		case BTRFS_DROP_DELAYED_REF:
2003 			locked_ref->ref_mod += ref->ref_mod;
2004 			break;
2005 		default:
2006 			WARN_ON(1);
2007 		}
2008 		atomic_dec(&delayed_refs->num_entries);
2009 
2010 		/*
2011 		 * Record the must_insert_reserved flag before we drop the
2012 		 * spin lock.
2013 		 */
2014 		must_insert_reserved = locked_ref->must_insert_reserved;
2015 		/*
2016 		 * Unsetting this on the head ref relinquishes ownership of
2017 		 * the rsv_bytes, so it is critical that every possible code
2018 		 * path from here forward frees all reserves including qgroup
2019 		 * reserve.
2020 		 */
2021 		locked_ref->must_insert_reserved = false;
2022 
2023 		extent_op = locked_ref->extent_op;
2024 		locked_ref->extent_op = NULL;
2025 		spin_unlock(&locked_ref->lock);
2026 
2027 		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
2028 					  must_insert_reserved);
2029 		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
2030 		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
2031 
2032 		btrfs_free_delayed_extent_op(extent_op);
2033 		if (ret) {
2034 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2035 			btrfs_put_delayed_ref(ref);
2036 			return ret;
2037 		}
2038 
2039 		btrfs_put_delayed_ref(ref);
2040 		cond_resched();
2041 
2042 		spin_lock(&locked_ref->lock);
2043 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2044 	}
2045 
2046 	return 0;
2047 }
2048 
2049 /*
2050  * Returns 0 on success or if called with an already aborted transaction.
2051  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2052  */
2053 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2054 					     u64 min_bytes)
2055 {
2056 	struct btrfs_fs_info *fs_info = trans->fs_info;
2057 	struct btrfs_delayed_ref_root *delayed_refs;
2058 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2059 	int ret;
2060 	unsigned long count = 0;
2061 	unsigned long max_count = 0;
2062 	u64 bytes_processed = 0;
2063 
2064 	delayed_refs = &trans->transaction->delayed_refs;
2065 	if (min_bytes == 0) {
2066 		max_count = delayed_refs->num_heads_ready;
2067 		min_bytes = U64_MAX;
2068 	}
2069 
2070 	do {
2071 		if (!locked_ref) {
2072 			locked_ref = btrfs_obtain_ref_head(trans);
2073 			if (IS_ERR_OR_NULL(locked_ref)) {
2074 				if (PTR_ERR(locked_ref) == -EAGAIN) {
2075 					continue;
2076 				} else {
2077 					break;
2078 				}
2079 			}
2080 			count++;
2081 		}
2082 		/*
2083 		 * We need to try and merge add/drops of the same ref since we
2084 		 * can run into issues with relocate dropping the implicit ref
2085 		 * and then it being added back again before the drop can
2086 		 * finish.  If we merged anything we need to re-loop so we can
2087 		 * get a good ref.
2088 		 * Or we can get node references of the same type that weren't
2089 		 * merged when created due to bumps in the tree mod seq, and
2090 		 * we need to merge them to prevent adding an inline extent
2091 		 * backref before dropping it (triggering a BUG_ON at
2092 		 * insert_inline_extent_backref()).
2093 		 */
2094 		spin_lock(&locked_ref->lock);
2095 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2096 
2097 		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
2098 		if (ret < 0 && ret != -EAGAIN) {
2099 			/*
2100 			 * Error, btrfs_run_delayed_refs_for_head already
2101 			 * unlocked everything so just bail out
2102 			 */
2103 			return ret;
2104 		} else if (!ret) {
2105 			/*
2106 			 * Success, perform the usual cleanup of a processed
2107 			 * head
2108 			 */
2109 			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
2110 			if (ret > 0 ) {
2111 				/* We dropped our lock, we need to loop. */
2112 				ret = 0;
2113 				continue;
2114 			} else if (ret) {
2115 				return ret;
2116 			}
2117 		}
2118 
2119 		/*
2120 		 * Either success case or btrfs_run_delayed_refs_for_head
2121 		 * returned -EAGAIN, meaning we need to select another head
2122 		 */
2123 
2124 		locked_ref = NULL;
2125 		cond_resched();
2126 	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2127 		 (max_count > 0 && count < max_count) ||
2128 		 locked_ref);
2129 
2130 	return 0;
2131 }
2132 
2133 #ifdef SCRAMBLE_DELAYED_REFS
2134 /*
2135  * Normally delayed refs get processed in ascending bytenr order. This
2136  * correlates in most cases to the order added. To expose dependencies on this
2137  * order, we start to process the tree in the middle instead of the beginning
2138  */
2139 static u64 find_middle(struct rb_root *root)
2140 {
2141 	struct rb_node *n = root->rb_node;
2142 	struct btrfs_delayed_ref_node *entry;
2143 	int alt = 1;
2144 	u64 middle;
2145 	u64 first = 0, last = 0;
2146 
2147 	n = rb_first(root);
2148 	if (n) {
2149 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2150 		first = entry->bytenr;
2151 	}
2152 	n = rb_last(root);
2153 	if (n) {
2154 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2155 		last = entry->bytenr;
2156 	}
2157 	n = root->rb_node;
2158 
2159 	while (n) {
2160 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2161 		WARN_ON(!entry->in_tree);
2162 
2163 		middle = entry->bytenr;
2164 
2165 		if (alt)
2166 			n = n->rb_left;
2167 		else
2168 			n = n->rb_right;
2169 
2170 		alt = 1 - alt;
2171 	}
2172 	return middle;
2173 }
2174 #endif
2175 
2176 /*
2177  * Start processing the delayed reference count updates and extent insertions
2178  * we have queued up so far.
2179  *
2180  * @trans:	Transaction handle.
2181  * @min_bytes:	How many bytes of delayed references to process. After this
2182  *		many bytes we stop processing delayed references if there are
2183  *		any more. If 0 it means to run all existing delayed references,
2184  *		but not new ones added after running all existing ones.
2185  *		Use (u64)-1 (U64_MAX) to run all existing delayed references
2186  *		plus any new ones that are added.
2187  *
2188  * Returns 0 on success or if called with an aborted transaction
2189  * Returns <0 on error and aborts the transaction
2190  */
2191 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
2192 {
2193 	struct btrfs_fs_info *fs_info = trans->fs_info;
2194 	struct btrfs_delayed_ref_root *delayed_refs;
2195 	int ret;
2196 
2197 	/* We'll clean this up in btrfs_cleanup_transaction */
2198 	if (TRANS_ABORTED(trans))
2199 		return 0;
2200 
2201 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2202 		return 0;
2203 
2204 	delayed_refs = &trans->transaction->delayed_refs;
2205 again:
2206 #ifdef SCRAMBLE_DELAYED_REFS
2207 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2208 #endif
2209 	ret = __btrfs_run_delayed_refs(trans, min_bytes);
2210 	if (ret < 0) {
2211 		btrfs_abort_transaction(trans, ret);
2212 		return ret;
2213 	}
2214 
2215 	if (min_bytes == U64_MAX) {
2216 		btrfs_create_pending_block_groups(trans);
2217 
2218 		spin_lock(&delayed_refs->lock);
2219 		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2220 			spin_unlock(&delayed_refs->lock);
2221 			return 0;
2222 		}
2223 		spin_unlock(&delayed_refs->lock);
2224 
2225 		cond_resched();
2226 		goto again;
2227 	}
2228 
2229 	return 0;
2230 }
2231 
2232 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2233 				struct extent_buffer *eb, u64 flags)
2234 {
2235 	struct btrfs_delayed_extent_op *extent_op;
2236 	int level = btrfs_header_level(eb);
2237 	int ret;
2238 
2239 	extent_op = btrfs_alloc_delayed_extent_op();
2240 	if (!extent_op)
2241 		return -ENOMEM;
2242 
2243 	extent_op->flags_to_set = flags;
2244 	extent_op->update_flags = true;
2245 	extent_op->update_key = false;
2246 	extent_op->level = level;
2247 
2248 	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2249 	if (ret)
2250 		btrfs_free_delayed_extent_op(extent_op);
2251 	return ret;
2252 }
2253 
2254 static noinline int check_delayed_ref(struct btrfs_root *root,
2255 				      struct btrfs_path *path,
2256 				      u64 objectid, u64 offset, u64 bytenr)
2257 {
2258 	struct btrfs_delayed_ref_head *head;
2259 	struct btrfs_delayed_ref_node *ref;
2260 	struct btrfs_delayed_ref_root *delayed_refs;
2261 	struct btrfs_transaction *cur_trans;
2262 	struct rb_node *node;
2263 	int ret = 0;
2264 
2265 	spin_lock(&root->fs_info->trans_lock);
2266 	cur_trans = root->fs_info->running_transaction;
2267 	if (cur_trans)
2268 		refcount_inc(&cur_trans->use_count);
2269 	spin_unlock(&root->fs_info->trans_lock);
2270 	if (!cur_trans)
2271 		return 0;
2272 
2273 	delayed_refs = &cur_trans->delayed_refs;
2274 	spin_lock(&delayed_refs->lock);
2275 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2276 	if (!head) {
2277 		spin_unlock(&delayed_refs->lock);
2278 		btrfs_put_transaction(cur_trans);
2279 		return 0;
2280 	}
2281 
2282 	if (!mutex_trylock(&head->mutex)) {
2283 		if (path->nowait) {
2284 			spin_unlock(&delayed_refs->lock);
2285 			btrfs_put_transaction(cur_trans);
2286 			return -EAGAIN;
2287 		}
2288 
2289 		refcount_inc(&head->refs);
2290 		spin_unlock(&delayed_refs->lock);
2291 
2292 		btrfs_release_path(path);
2293 
2294 		/*
2295 		 * Mutex was contended, block until it's released and let
2296 		 * caller try again
2297 		 */
2298 		mutex_lock(&head->mutex);
2299 		mutex_unlock(&head->mutex);
2300 		btrfs_put_delayed_ref_head(head);
2301 		btrfs_put_transaction(cur_trans);
2302 		return -EAGAIN;
2303 	}
2304 	spin_unlock(&delayed_refs->lock);
2305 
2306 	spin_lock(&head->lock);
2307 	/*
2308 	 * XXX: We should replace this with a proper search function in the
2309 	 * future.
2310 	 */
2311 	for (node = rb_first_cached(&head->ref_tree); node;
2312 	     node = rb_next(node)) {
2313 		u64 ref_owner;
2314 		u64 ref_offset;
2315 
2316 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2317 		/* If it's a shared ref we know a cross reference exists */
2318 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2319 			ret = 1;
2320 			break;
2321 		}
2322 
2323 		ref_owner = btrfs_delayed_ref_owner(ref);
2324 		ref_offset = btrfs_delayed_ref_offset(ref);
2325 
2326 		/*
2327 		 * If our ref doesn't match the one we're currently looking at
2328 		 * then we have a cross reference.
2329 		 */
2330 		if (ref->ref_root != btrfs_root_id(root) ||
2331 		    ref_owner != objectid || ref_offset != offset) {
2332 			ret = 1;
2333 			break;
2334 		}
2335 	}
2336 	spin_unlock(&head->lock);
2337 	mutex_unlock(&head->mutex);
2338 	btrfs_put_transaction(cur_trans);
2339 	return ret;
2340 }
2341 
2342 static noinline int check_committed_ref(struct btrfs_root *root,
2343 					struct btrfs_path *path,
2344 					u64 objectid, u64 offset, u64 bytenr,
2345 					bool strict)
2346 {
2347 	struct btrfs_fs_info *fs_info = root->fs_info;
2348 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2349 	struct extent_buffer *leaf;
2350 	struct btrfs_extent_data_ref *ref;
2351 	struct btrfs_extent_inline_ref *iref;
2352 	struct btrfs_extent_item *ei;
2353 	struct btrfs_key key;
2354 	u32 item_size;
2355 	u32 expected_size;
2356 	int type;
2357 	int ret;
2358 
2359 	key.objectid = bytenr;
2360 	key.offset = (u64)-1;
2361 	key.type = BTRFS_EXTENT_ITEM_KEY;
2362 
2363 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2364 	if (ret < 0)
2365 		goto out;
2366 	if (ret == 0) {
2367 		/*
2368 		 * Key with offset -1 found, there would have to exist an extent
2369 		 * item with such offset, but this is out of the valid range.
2370 		 */
2371 		ret = -EUCLEAN;
2372 		goto out;
2373 	}
2374 
2375 	ret = -ENOENT;
2376 	if (path->slots[0] == 0)
2377 		goto out;
2378 
2379 	path->slots[0]--;
2380 	leaf = path->nodes[0];
2381 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2382 
2383 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2384 		goto out;
2385 
2386 	ret = 1;
2387 	item_size = btrfs_item_size(leaf, path->slots[0]);
2388 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2389 	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2390 
2391 	/* No inline refs; we need to bail before checking for owner ref. */
2392 	if (item_size == sizeof(*ei))
2393 		goto out;
2394 
2395 	/* Check for an owner ref; skip over it to the real inline refs. */
2396 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2397 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2398 	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2399 		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2400 		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2401 	}
2402 
2403 	/* If extent item has more than 1 inline ref then it's shared */
2404 	if (item_size != expected_size)
2405 		goto out;
2406 
2407 	/*
2408 	 * If extent created before last snapshot => it's shared unless the
2409 	 * snapshot has been deleted. Use the heuristic if strict is false.
2410 	 */
2411 	if (!strict &&
2412 	    (btrfs_extent_generation(leaf, ei) <=
2413 	     btrfs_root_last_snapshot(&root->root_item)))
2414 		goto out;
2415 
2416 	/* If this extent has SHARED_DATA_REF then it's shared */
2417 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2418 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2419 		goto out;
2420 
2421 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2422 	if (btrfs_extent_refs(leaf, ei) !=
2423 	    btrfs_extent_data_ref_count(leaf, ref) ||
2424 	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
2425 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2426 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2427 		goto out;
2428 
2429 	ret = 0;
2430 out:
2431 	return ret;
2432 }
2433 
2434 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2435 			  u64 bytenr, bool strict, struct btrfs_path *path)
2436 {
2437 	int ret;
2438 
2439 	do {
2440 		ret = check_committed_ref(root, path, objectid,
2441 					  offset, bytenr, strict);
2442 		if (ret && ret != -ENOENT)
2443 			goto out;
2444 
2445 		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2446 	} while (ret == -EAGAIN);
2447 
2448 out:
2449 	btrfs_release_path(path);
2450 	if (btrfs_is_data_reloc_root(root))
2451 		WARN_ON(ret > 0);
2452 	return ret;
2453 }
2454 
2455 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2456 			   struct btrfs_root *root,
2457 			   struct extent_buffer *buf,
2458 			   int full_backref, int inc)
2459 {
2460 	struct btrfs_fs_info *fs_info = root->fs_info;
2461 	u64 parent;
2462 	u64 ref_root;
2463 	u32 nritems;
2464 	struct btrfs_key key;
2465 	struct btrfs_file_extent_item *fi;
2466 	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2467 	int i;
2468 	int action;
2469 	int level;
2470 	int ret = 0;
2471 
2472 	if (btrfs_is_testing(fs_info))
2473 		return 0;
2474 
2475 	ref_root = btrfs_header_owner(buf);
2476 	nritems = btrfs_header_nritems(buf);
2477 	level = btrfs_header_level(buf);
2478 
2479 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2480 		return 0;
2481 
2482 	if (full_backref)
2483 		parent = buf->start;
2484 	else
2485 		parent = 0;
2486 	if (inc)
2487 		action = BTRFS_ADD_DELAYED_REF;
2488 	else
2489 		action = BTRFS_DROP_DELAYED_REF;
2490 
2491 	for (i = 0; i < nritems; i++) {
2492 		struct btrfs_ref ref = {
2493 			.action = action,
2494 			.parent = parent,
2495 			.ref_root = ref_root,
2496 		};
2497 
2498 		if (level == 0) {
2499 			btrfs_item_key_to_cpu(buf, &key, i);
2500 			if (key.type != BTRFS_EXTENT_DATA_KEY)
2501 				continue;
2502 			fi = btrfs_item_ptr(buf, i,
2503 					    struct btrfs_file_extent_item);
2504 			if (btrfs_file_extent_type(buf, fi) ==
2505 			    BTRFS_FILE_EXTENT_INLINE)
2506 				continue;
2507 			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2508 			if (ref.bytenr == 0)
2509 				continue;
2510 
2511 			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2512 			ref.owning_root = ref_root;
2513 
2514 			key.offset -= btrfs_file_extent_offset(buf, fi);
2515 			btrfs_init_data_ref(&ref, key.objectid, key.offset,
2516 					    btrfs_root_id(root), for_reloc);
2517 			if (inc)
2518 				ret = btrfs_inc_extent_ref(trans, &ref);
2519 			else
2520 				ret = btrfs_free_extent(trans, &ref);
2521 			if (ret)
2522 				goto fail;
2523 		} else {
2524 			/* We don't know the owning_root, leave as 0. */
2525 			ref.bytenr = btrfs_node_blockptr(buf, i);
2526 			ref.num_bytes = fs_info->nodesize;
2527 
2528 			btrfs_init_tree_ref(&ref, level - 1,
2529 					    btrfs_root_id(root), for_reloc);
2530 			if (inc)
2531 				ret = btrfs_inc_extent_ref(trans, &ref);
2532 			else
2533 				ret = btrfs_free_extent(trans, &ref);
2534 			if (ret)
2535 				goto fail;
2536 		}
2537 	}
2538 	return 0;
2539 fail:
2540 	return ret;
2541 }
2542 
2543 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2544 		  struct extent_buffer *buf, int full_backref)
2545 {
2546 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2547 }
2548 
2549 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2550 		  struct extent_buffer *buf, int full_backref)
2551 {
2552 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2553 }
2554 
2555 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2556 {
2557 	struct btrfs_fs_info *fs_info = root->fs_info;
2558 	u64 flags;
2559 	u64 ret;
2560 
2561 	if (data)
2562 		flags = BTRFS_BLOCK_GROUP_DATA;
2563 	else if (root == fs_info->chunk_root)
2564 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2565 	else
2566 		flags = BTRFS_BLOCK_GROUP_METADATA;
2567 
2568 	ret = btrfs_get_alloc_profile(fs_info, flags);
2569 	return ret;
2570 }
2571 
2572 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2573 {
2574 	struct rb_node *leftmost;
2575 	u64 bytenr = 0;
2576 
2577 	read_lock(&fs_info->block_group_cache_lock);
2578 	/* Get the block group with the lowest logical start address. */
2579 	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2580 	if (leftmost) {
2581 		struct btrfs_block_group *bg;
2582 
2583 		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2584 		bytenr = bg->start;
2585 	}
2586 	read_unlock(&fs_info->block_group_cache_lock);
2587 
2588 	return bytenr;
2589 }
2590 
2591 static int pin_down_extent(struct btrfs_trans_handle *trans,
2592 			   struct btrfs_block_group *cache,
2593 			   u64 bytenr, u64 num_bytes, int reserved)
2594 {
2595 	struct btrfs_fs_info *fs_info = cache->fs_info;
2596 
2597 	spin_lock(&cache->space_info->lock);
2598 	spin_lock(&cache->lock);
2599 	cache->pinned += num_bytes;
2600 	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2601 					     num_bytes);
2602 	if (reserved) {
2603 		cache->reserved -= num_bytes;
2604 		cache->space_info->bytes_reserved -= num_bytes;
2605 	}
2606 	spin_unlock(&cache->lock);
2607 	spin_unlock(&cache->space_info->lock);
2608 
2609 	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2610 		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2611 	return 0;
2612 }
2613 
2614 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2615 		     u64 bytenr, u64 num_bytes, int reserved)
2616 {
2617 	struct btrfs_block_group *cache;
2618 
2619 	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2620 	BUG_ON(!cache); /* Logic error */
2621 
2622 	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2623 
2624 	btrfs_put_block_group(cache);
2625 	return 0;
2626 }
2627 
2628 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2629 				    const struct extent_buffer *eb)
2630 {
2631 	struct btrfs_block_group *cache;
2632 	int ret;
2633 
2634 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2635 	if (!cache)
2636 		return -EINVAL;
2637 
2638 	/*
2639 	 * Fully cache the free space first so that our pin removes the free space
2640 	 * from the cache.
2641 	 */
2642 	ret = btrfs_cache_block_group(cache, true);
2643 	if (ret)
2644 		goto out;
2645 
2646 	pin_down_extent(trans, cache, eb->start, eb->len, 0);
2647 
2648 	/* remove us from the free space cache (if we're there at all) */
2649 	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2650 out:
2651 	btrfs_put_block_group(cache);
2652 	return ret;
2653 }
2654 
2655 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2656 				   u64 start, u64 num_bytes)
2657 {
2658 	int ret;
2659 	struct btrfs_block_group *block_group;
2660 
2661 	block_group = btrfs_lookup_block_group(fs_info, start);
2662 	if (!block_group)
2663 		return -EINVAL;
2664 
2665 	ret = btrfs_cache_block_group(block_group, true);
2666 	if (ret)
2667 		goto out;
2668 
2669 	ret = btrfs_remove_free_space(block_group, start, num_bytes);
2670 out:
2671 	btrfs_put_block_group(block_group);
2672 	return ret;
2673 }
2674 
2675 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2676 {
2677 	struct btrfs_fs_info *fs_info = eb->fs_info;
2678 	struct btrfs_file_extent_item *item;
2679 	struct btrfs_key key;
2680 	int found_type;
2681 	int i;
2682 	int ret = 0;
2683 
2684 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2685 		return 0;
2686 
2687 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2688 		btrfs_item_key_to_cpu(eb, &key, i);
2689 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2690 			continue;
2691 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2692 		found_type = btrfs_file_extent_type(eb, item);
2693 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2694 			continue;
2695 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2696 			continue;
2697 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2698 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2699 		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2700 		if (ret)
2701 			break;
2702 	}
2703 
2704 	return ret;
2705 }
2706 
2707 static void
2708 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2709 {
2710 	atomic_inc(&bg->reservations);
2711 }
2712 
2713 /*
2714  * Returns the free cluster for the given space info and sets empty_cluster to
2715  * what it should be based on the mount options.
2716  */
2717 static struct btrfs_free_cluster *
2718 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2719 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2720 {
2721 	struct btrfs_free_cluster *ret = NULL;
2722 
2723 	*empty_cluster = 0;
2724 	if (btrfs_mixed_space_info(space_info))
2725 		return ret;
2726 
2727 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2728 		ret = &fs_info->meta_alloc_cluster;
2729 		if (btrfs_test_opt(fs_info, SSD))
2730 			*empty_cluster = SZ_2M;
2731 		else
2732 			*empty_cluster = SZ_64K;
2733 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2734 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2735 		*empty_cluster = SZ_2M;
2736 		ret = &fs_info->data_alloc_cluster;
2737 	}
2738 
2739 	return ret;
2740 }
2741 
2742 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2743 			      u64 start, u64 end,
2744 			      const bool return_free_space)
2745 {
2746 	struct btrfs_block_group *cache = NULL;
2747 	struct btrfs_space_info *space_info;
2748 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2749 	struct btrfs_free_cluster *cluster = NULL;
2750 	u64 len;
2751 	u64 total_unpinned = 0;
2752 	u64 empty_cluster = 0;
2753 	bool readonly;
2754 	int ret = 0;
2755 
2756 	while (start <= end) {
2757 		readonly = false;
2758 		if (!cache ||
2759 		    start >= cache->start + cache->length) {
2760 			if (cache)
2761 				btrfs_put_block_group(cache);
2762 			total_unpinned = 0;
2763 			cache = btrfs_lookup_block_group(fs_info, start);
2764 			if (cache == NULL) {
2765 				/* Logic error, something removed the block group. */
2766 				ret = -EUCLEAN;
2767 				goto out;
2768 			}
2769 
2770 			cluster = fetch_cluster_info(fs_info,
2771 						     cache->space_info,
2772 						     &empty_cluster);
2773 			empty_cluster <<= 1;
2774 		}
2775 
2776 		len = cache->start + cache->length - start;
2777 		len = min(len, end + 1 - start);
2778 
2779 		if (return_free_space)
2780 			btrfs_add_free_space(cache, start, len);
2781 
2782 		start += len;
2783 		total_unpinned += len;
2784 		space_info = cache->space_info;
2785 
2786 		/*
2787 		 * If this space cluster has been marked as fragmented and we've
2788 		 * unpinned enough in this block group to potentially allow a
2789 		 * cluster to be created inside of it go ahead and clear the
2790 		 * fragmented check.
2791 		 */
2792 		if (cluster && cluster->fragmented &&
2793 		    total_unpinned > empty_cluster) {
2794 			spin_lock(&cluster->lock);
2795 			cluster->fragmented = 0;
2796 			spin_unlock(&cluster->lock);
2797 		}
2798 
2799 		spin_lock(&space_info->lock);
2800 		spin_lock(&cache->lock);
2801 		cache->pinned -= len;
2802 		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2803 		space_info->max_extent_size = 0;
2804 		if (cache->ro) {
2805 			space_info->bytes_readonly += len;
2806 			readonly = true;
2807 		} else if (btrfs_is_zoned(fs_info)) {
2808 			/* Need reset before reusing in a zoned block group */
2809 			space_info->bytes_zone_unusable += len;
2810 			readonly = true;
2811 		}
2812 		spin_unlock(&cache->lock);
2813 		if (!readonly && return_free_space &&
2814 		    global_rsv->space_info == space_info) {
2815 			spin_lock(&global_rsv->lock);
2816 			if (!global_rsv->full) {
2817 				u64 to_add = min(len, global_rsv->size -
2818 						      global_rsv->reserved);
2819 
2820 				global_rsv->reserved += to_add;
2821 				btrfs_space_info_update_bytes_may_use(fs_info,
2822 						space_info, to_add);
2823 				if (global_rsv->reserved >= global_rsv->size)
2824 					global_rsv->full = 1;
2825 				len -= to_add;
2826 			}
2827 			spin_unlock(&global_rsv->lock);
2828 		}
2829 		/* Add to any tickets we may have */
2830 		if (!readonly && return_free_space && len)
2831 			btrfs_try_granting_tickets(fs_info, space_info);
2832 		spin_unlock(&space_info->lock);
2833 	}
2834 
2835 	if (cache)
2836 		btrfs_put_block_group(cache);
2837 out:
2838 	return ret;
2839 }
2840 
2841 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2842 {
2843 	struct btrfs_fs_info *fs_info = trans->fs_info;
2844 	struct btrfs_block_group *block_group, *tmp;
2845 	struct list_head *deleted_bgs;
2846 	struct extent_io_tree *unpin;
2847 	u64 start;
2848 	u64 end;
2849 	int ret;
2850 
2851 	unpin = &trans->transaction->pinned_extents;
2852 
2853 	while (!TRANS_ABORTED(trans)) {
2854 		struct extent_state *cached_state = NULL;
2855 
2856 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2857 		if (!find_first_extent_bit(unpin, 0, &start, &end,
2858 					   EXTENT_DIRTY, &cached_state)) {
2859 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2860 			break;
2861 		}
2862 
2863 		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2864 			ret = btrfs_discard_extent(fs_info, start,
2865 						   end + 1 - start, NULL);
2866 
2867 		clear_extent_dirty(unpin, start, end, &cached_state);
2868 		ret = unpin_extent_range(fs_info, start, end, true);
2869 		BUG_ON(ret);
2870 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2871 		free_extent_state(cached_state);
2872 		cond_resched();
2873 	}
2874 
2875 	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2876 		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2877 		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2878 	}
2879 
2880 	/*
2881 	 * Transaction is finished.  We don't need the lock anymore.  We
2882 	 * do need to clean up the block groups in case of a transaction
2883 	 * abort.
2884 	 */
2885 	deleted_bgs = &trans->transaction->deleted_bgs;
2886 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2887 		u64 trimmed = 0;
2888 
2889 		ret = -EROFS;
2890 		if (!TRANS_ABORTED(trans))
2891 			ret = btrfs_discard_extent(fs_info,
2892 						   block_group->start,
2893 						   block_group->length,
2894 						   &trimmed);
2895 
2896 		list_del_init(&block_group->bg_list);
2897 		btrfs_unfreeze_block_group(block_group);
2898 		btrfs_put_block_group(block_group);
2899 
2900 		if (ret) {
2901 			const char *errstr = btrfs_decode_error(ret);
2902 			btrfs_warn(fs_info,
2903 			   "discard failed while removing blockgroup: errno=%d %s",
2904 				   ret, errstr);
2905 		}
2906 	}
2907 
2908 	return 0;
2909 }
2910 
2911 /*
2912  * Parse an extent item's inline extents looking for a simple quotas owner ref.
2913  *
2914  * @fs_info:	the btrfs_fs_info for this mount
2915  * @leaf:	a leaf in the extent tree containing the extent item
2916  * @slot:	the slot in the leaf where the extent item is found
2917  *
2918  * Returns the objectid of the root that originally allocated the extent item
2919  * if the inline owner ref is expected and present, otherwise 0.
2920  *
2921  * If an extent item has an owner ref item, it will be the first inline ref
2922  * item. Therefore the logic is to check whether there are any inline ref
2923  * items, then check the type of the first one.
2924  */
2925 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2926 				struct extent_buffer *leaf, int slot)
2927 {
2928 	struct btrfs_extent_item *ei;
2929 	struct btrfs_extent_inline_ref *iref;
2930 	struct btrfs_extent_owner_ref *oref;
2931 	unsigned long ptr;
2932 	unsigned long end;
2933 	int type;
2934 
2935 	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2936 		return 0;
2937 
2938 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2939 	ptr = (unsigned long)(ei + 1);
2940 	end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2941 
2942 	/* No inline ref items of any kind, can't check type. */
2943 	if (ptr == end)
2944 		return 0;
2945 
2946 	iref = (struct btrfs_extent_inline_ref *)ptr;
2947 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2948 
2949 	/* We found an owner ref, get the root out of it. */
2950 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2951 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2952 		return btrfs_extent_owner_ref_root_id(leaf, oref);
2953 	}
2954 
2955 	/* We have inline refs, but not an owner ref. */
2956 	return 0;
2957 }
2958 
2959 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2960 				     u64 bytenr, struct btrfs_squota_delta *delta)
2961 {
2962 	int ret;
2963 	u64 num_bytes = delta->num_bytes;
2964 
2965 	if (delta->is_data) {
2966 		struct btrfs_root *csum_root;
2967 
2968 		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2969 		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2970 		if (ret) {
2971 			btrfs_abort_transaction(trans, ret);
2972 			return ret;
2973 		}
2974 
2975 		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2976 		if (ret) {
2977 			btrfs_abort_transaction(trans, ret);
2978 			return ret;
2979 		}
2980 	}
2981 
2982 	ret = btrfs_record_squota_delta(trans->fs_info, delta);
2983 	if (ret) {
2984 		btrfs_abort_transaction(trans, ret);
2985 		return ret;
2986 	}
2987 
2988 	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2989 	if (ret) {
2990 		btrfs_abort_transaction(trans, ret);
2991 		return ret;
2992 	}
2993 
2994 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2995 	if (ret)
2996 		btrfs_abort_transaction(trans, ret);
2997 
2998 	return ret;
2999 }
3000 
3001 #define abort_and_dump(trans, path, fmt, args...)	\
3002 ({							\
3003 	btrfs_abort_transaction(trans, -EUCLEAN);	\
3004 	btrfs_print_leaf(path->nodes[0]);		\
3005 	btrfs_crit(trans->fs_info, fmt, ##args);	\
3006 })
3007 
3008 /*
3009  * Drop one or more refs of @node.
3010  *
3011  * 1. Locate the extent refs.
3012  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3013  *    Locate it, then reduce the refs number or remove the ref line completely.
3014  *
3015  * 2. Update the refs count in EXTENT/METADATA_ITEM
3016  *
3017  * Inline backref case:
3018  *
3019  * in extent tree we have:
3020  *
3021  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3022  *		refs 2 gen 6 flags DATA
3023  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3024  *		extent data backref root FS_TREE objectid 257 offset 0 count 1
3025  *
3026  * This function gets called with:
3027  *
3028  *    node->bytenr = 13631488
3029  *    node->num_bytes = 1048576
3030  *    root_objectid = FS_TREE
3031  *    owner_objectid = 257
3032  *    owner_offset = 0
3033  *    refs_to_drop = 1
3034  *
3035  * Then we should get some like:
3036  *
3037  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3038  *		refs 1 gen 6 flags DATA
3039  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3040  *
3041  * Keyed backref case:
3042  *
3043  * in extent tree we have:
3044  *
3045  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3046  *		refs 754 gen 6 flags DATA
3047  *	[...]
3048  *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3049  *		extent data backref root FS_TREE objectid 866 offset 0 count 1
3050  *
3051  * This function get called with:
3052  *
3053  *    node->bytenr = 13631488
3054  *    node->num_bytes = 1048576
3055  *    root_objectid = FS_TREE
3056  *    owner_objectid = 866
3057  *    owner_offset = 0
3058  *    refs_to_drop = 1
3059  *
3060  * Then we should get some like:
3061  *
3062  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3063  *		refs 753 gen 6 flags DATA
3064  *
3065  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3066  */
3067 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3068 			       struct btrfs_delayed_ref_head *href,
3069 			       struct btrfs_delayed_ref_node *node,
3070 			       struct btrfs_delayed_extent_op *extent_op)
3071 {
3072 	struct btrfs_fs_info *info = trans->fs_info;
3073 	struct btrfs_key key;
3074 	struct btrfs_path *path;
3075 	struct btrfs_root *extent_root;
3076 	struct extent_buffer *leaf;
3077 	struct btrfs_extent_item *ei;
3078 	struct btrfs_extent_inline_ref *iref;
3079 	int ret;
3080 	int is_data;
3081 	int extent_slot = 0;
3082 	int found_extent = 0;
3083 	int num_to_del = 1;
3084 	int refs_to_drop = node->ref_mod;
3085 	u32 item_size;
3086 	u64 refs;
3087 	u64 bytenr = node->bytenr;
3088 	u64 num_bytes = node->num_bytes;
3089 	u64 owner_objectid = btrfs_delayed_ref_owner(node);
3090 	u64 owner_offset = btrfs_delayed_ref_offset(node);
3091 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3092 	u64 delayed_ref_root = href->owning_root;
3093 
3094 	extent_root = btrfs_extent_root(info, bytenr);
3095 	ASSERT(extent_root);
3096 
3097 	path = btrfs_alloc_path();
3098 	if (!path)
3099 		return -ENOMEM;
3100 
3101 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3102 
3103 	if (!is_data && refs_to_drop != 1) {
3104 		btrfs_crit(info,
3105 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3106 			   node->bytenr, refs_to_drop);
3107 		ret = -EINVAL;
3108 		btrfs_abort_transaction(trans, ret);
3109 		goto out;
3110 	}
3111 
3112 	if (is_data)
3113 		skinny_metadata = false;
3114 
3115 	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3116 				    node->parent, node->ref_root, owner_objectid,
3117 				    owner_offset);
3118 	if (ret == 0) {
3119 		/*
3120 		 * Either the inline backref or the SHARED_DATA_REF/
3121 		 * SHARED_BLOCK_REF is found
3122 		 *
3123 		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3124 		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3125 		 */
3126 		extent_slot = path->slots[0];
3127 		while (extent_slot >= 0) {
3128 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3129 					      extent_slot);
3130 			if (key.objectid != bytenr)
3131 				break;
3132 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3133 			    key.offset == num_bytes) {
3134 				found_extent = 1;
3135 				break;
3136 			}
3137 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3138 			    key.offset == owner_objectid) {
3139 				found_extent = 1;
3140 				break;
3141 			}
3142 
3143 			/* Quick path didn't find the EXTEMT/METADATA_ITEM */
3144 			if (path->slots[0] - extent_slot > 5)
3145 				break;
3146 			extent_slot--;
3147 		}
3148 
3149 		if (!found_extent) {
3150 			if (iref) {
3151 				abort_and_dump(trans, path,
3152 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3153 					   path->slots[0]);
3154 				ret = -EUCLEAN;
3155 				goto out;
3156 			}
3157 			/* Must be SHARED_* item, remove the backref first */
3158 			ret = remove_extent_backref(trans, extent_root, path,
3159 						    NULL, refs_to_drop, is_data);
3160 			if (ret) {
3161 				btrfs_abort_transaction(trans, ret);
3162 				goto out;
3163 			}
3164 			btrfs_release_path(path);
3165 
3166 			/* Slow path to locate EXTENT/METADATA_ITEM */
3167 			key.objectid = bytenr;
3168 			key.type = BTRFS_EXTENT_ITEM_KEY;
3169 			key.offset = num_bytes;
3170 
3171 			if (!is_data && skinny_metadata) {
3172 				key.type = BTRFS_METADATA_ITEM_KEY;
3173 				key.offset = owner_objectid;
3174 			}
3175 
3176 			ret = btrfs_search_slot(trans, extent_root,
3177 						&key, path, -1, 1);
3178 			if (ret > 0 && skinny_metadata && path->slots[0]) {
3179 				/*
3180 				 * Couldn't find our skinny metadata item,
3181 				 * see if we have ye olde extent item.
3182 				 */
3183 				path->slots[0]--;
3184 				btrfs_item_key_to_cpu(path->nodes[0], &key,
3185 						      path->slots[0]);
3186 				if (key.objectid == bytenr &&
3187 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3188 				    key.offset == num_bytes)
3189 					ret = 0;
3190 			}
3191 
3192 			if (ret > 0 && skinny_metadata) {
3193 				skinny_metadata = false;
3194 				key.objectid = bytenr;
3195 				key.type = BTRFS_EXTENT_ITEM_KEY;
3196 				key.offset = num_bytes;
3197 				btrfs_release_path(path);
3198 				ret = btrfs_search_slot(trans, extent_root,
3199 							&key, path, -1, 1);
3200 			}
3201 
3202 			if (ret) {
3203 				if (ret > 0)
3204 					btrfs_print_leaf(path->nodes[0]);
3205 				btrfs_err(info,
3206 			"umm, got %d back from search, was looking for %llu, slot %d",
3207 					  ret, bytenr, path->slots[0]);
3208 			}
3209 			if (ret < 0) {
3210 				btrfs_abort_transaction(trans, ret);
3211 				goto out;
3212 			}
3213 			extent_slot = path->slots[0];
3214 		}
3215 	} else if (WARN_ON(ret == -ENOENT)) {
3216 		abort_and_dump(trans, path,
3217 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3218 			       bytenr, node->parent, node->ref_root, owner_objectid,
3219 			       owner_offset, path->slots[0]);
3220 		goto out;
3221 	} else {
3222 		btrfs_abort_transaction(trans, ret);
3223 		goto out;
3224 	}
3225 
3226 	leaf = path->nodes[0];
3227 	item_size = btrfs_item_size(leaf, extent_slot);
3228 	if (unlikely(item_size < sizeof(*ei))) {
3229 		ret = -EUCLEAN;
3230 		btrfs_err(trans->fs_info,
3231 			  "unexpected extent item size, has %u expect >= %zu",
3232 			  item_size, sizeof(*ei));
3233 		btrfs_abort_transaction(trans, ret);
3234 		goto out;
3235 	}
3236 	ei = btrfs_item_ptr(leaf, extent_slot,
3237 			    struct btrfs_extent_item);
3238 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3239 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3240 		struct btrfs_tree_block_info *bi;
3241 
3242 		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3243 			abort_and_dump(trans, path,
3244 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3245 				       key.objectid, key.type, key.offset,
3246 				       path->slots[0], owner_objectid, item_size,
3247 				       sizeof(*ei) + sizeof(*bi));
3248 			ret = -EUCLEAN;
3249 			goto out;
3250 		}
3251 		bi = (struct btrfs_tree_block_info *)(ei + 1);
3252 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3253 	}
3254 
3255 	refs = btrfs_extent_refs(leaf, ei);
3256 	if (refs < refs_to_drop) {
3257 		abort_and_dump(trans, path,
3258 		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3259 			       refs_to_drop, refs, bytenr, path->slots[0]);
3260 		ret = -EUCLEAN;
3261 		goto out;
3262 	}
3263 	refs -= refs_to_drop;
3264 
3265 	if (refs > 0) {
3266 		if (extent_op)
3267 			__run_delayed_extent_op(extent_op, leaf, ei);
3268 		/*
3269 		 * In the case of inline back ref, reference count will
3270 		 * be updated by remove_extent_backref
3271 		 */
3272 		if (iref) {
3273 			if (!found_extent) {
3274 				abort_and_dump(trans, path,
3275 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3276 					       path->slots[0]);
3277 				ret = -EUCLEAN;
3278 				goto out;
3279 			}
3280 		} else {
3281 			btrfs_set_extent_refs(leaf, ei, refs);
3282 			btrfs_mark_buffer_dirty(trans, leaf);
3283 		}
3284 		if (found_extent) {
3285 			ret = remove_extent_backref(trans, extent_root, path,
3286 						    iref, refs_to_drop, is_data);
3287 			if (ret) {
3288 				btrfs_abort_transaction(trans, ret);
3289 				goto out;
3290 			}
3291 		}
3292 	} else {
3293 		struct btrfs_squota_delta delta = {
3294 			.root = delayed_ref_root,
3295 			.num_bytes = num_bytes,
3296 			.is_data = is_data,
3297 			.is_inc = false,
3298 			.generation = btrfs_extent_generation(leaf, ei),
3299 		};
3300 
3301 		/* In this branch refs == 1 */
3302 		if (found_extent) {
3303 			if (is_data && refs_to_drop !=
3304 			    extent_data_ref_count(path, iref)) {
3305 				abort_and_dump(trans, path,
3306 		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3307 					       extent_data_ref_count(path, iref),
3308 					       refs_to_drop, path->slots[0]);
3309 				ret = -EUCLEAN;
3310 				goto out;
3311 			}
3312 			if (iref) {
3313 				if (path->slots[0] != extent_slot) {
3314 					abort_and_dump(trans, path,
3315 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3316 						       key.objectid, key.type,
3317 						       key.offset, path->slots[0]);
3318 					ret = -EUCLEAN;
3319 					goto out;
3320 				}
3321 			} else {
3322 				/*
3323 				 * No inline ref, we must be at SHARED_* item,
3324 				 * And it's single ref, it must be:
3325 				 * |	extent_slot	  ||extent_slot + 1|
3326 				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3327 				 */
3328 				if (path->slots[0] != extent_slot + 1) {
3329 					abort_and_dump(trans, path,
3330 	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3331 						       path->slots[0]);
3332 					ret = -EUCLEAN;
3333 					goto out;
3334 				}
3335 				path->slots[0] = extent_slot;
3336 				num_to_del = 2;
3337 			}
3338 		}
3339 		/*
3340 		 * We can't infer the data owner from the delayed ref, so we need
3341 		 * to try to get it from the owning ref item.
3342 		 *
3343 		 * If it is not present, then that extent was not written under
3344 		 * simple quotas mode, so we don't need to account for its deletion.
3345 		 */
3346 		if (is_data)
3347 			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3348 								 leaf, extent_slot);
3349 
3350 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3351 				      num_to_del);
3352 		if (ret) {
3353 			btrfs_abort_transaction(trans, ret);
3354 			goto out;
3355 		}
3356 		btrfs_release_path(path);
3357 
3358 		ret = do_free_extent_accounting(trans, bytenr, &delta);
3359 	}
3360 	btrfs_release_path(path);
3361 
3362 out:
3363 	btrfs_free_path(path);
3364 	return ret;
3365 }
3366 
3367 /*
3368  * when we free an block, it is possible (and likely) that we free the last
3369  * delayed ref for that extent as well.  This searches the delayed ref tree for
3370  * a given extent, and if there are no other delayed refs to be processed, it
3371  * removes it from the tree.
3372  */
3373 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3374 				      u64 bytenr)
3375 {
3376 	struct btrfs_delayed_ref_head *head;
3377 	struct btrfs_delayed_ref_root *delayed_refs;
3378 	int ret = 0;
3379 
3380 	delayed_refs = &trans->transaction->delayed_refs;
3381 	spin_lock(&delayed_refs->lock);
3382 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3383 	if (!head)
3384 		goto out_delayed_unlock;
3385 
3386 	spin_lock(&head->lock);
3387 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3388 		goto out;
3389 
3390 	if (cleanup_extent_op(head) != NULL)
3391 		goto out;
3392 
3393 	/*
3394 	 * waiting for the lock here would deadlock.  If someone else has it
3395 	 * locked they are already in the process of dropping it anyway
3396 	 */
3397 	if (!mutex_trylock(&head->mutex))
3398 		goto out;
3399 
3400 	btrfs_delete_ref_head(delayed_refs, head);
3401 	head->processing = false;
3402 
3403 	spin_unlock(&head->lock);
3404 	spin_unlock(&delayed_refs->lock);
3405 
3406 	BUG_ON(head->extent_op);
3407 	if (head->must_insert_reserved)
3408 		ret = 1;
3409 
3410 	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3411 	mutex_unlock(&head->mutex);
3412 	btrfs_put_delayed_ref_head(head);
3413 	return ret;
3414 out:
3415 	spin_unlock(&head->lock);
3416 
3417 out_delayed_unlock:
3418 	spin_unlock(&delayed_refs->lock);
3419 	return 0;
3420 }
3421 
3422 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3423 			   u64 root_id,
3424 			   struct extent_buffer *buf,
3425 			   u64 parent, int last_ref)
3426 {
3427 	struct btrfs_fs_info *fs_info = trans->fs_info;
3428 	struct btrfs_block_group *bg;
3429 	int ret;
3430 
3431 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3432 		struct btrfs_ref generic_ref = {
3433 			.action = BTRFS_DROP_DELAYED_REF,
3434 			.bytenr = buf->start,
3435 			.num_bytes = buf->len,
3436 			.parent = parent,
3437 			.owning_root = btrfs_header_owner(buf),
3438 			.ref_root = root_id,
3439 		};
3440 
3441 		/*
3442 		 * Assert that the extent buffer is not cleared due to
3443 		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
3444 		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3445 		 * detail.
3446 		 */
3447 		ASSERT(btrfs_header_bytenr(buf) != 0);
3448 
3449 		btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
3450 		btrfs_ref_tree_mod(fs_info, &generic_ref);
3451 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3452 		BUG_ON(ret); /* -ENOMEM */
3453 	}
3454 
3455 	if (!last_ref)
3456 		return;
3457 
3458 	if (btrfs_header_generation(buf) != trans->transid)
3459 		goto out;
3460 
3461 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3462 		ret = check_ref_cleanup(trans, buf->start);
3463 		if (!ret)
3464 			goto out;
3465 	}
3466 
3467 	bg = btrfs_lookup_block_group(fs_info, buf->start);
3468 
3469 	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3470 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3471 		btrfs_put_block_group(bg);
3472 		goto out;
3473 	}
3474 
3475 	/*
3476 	 * If there are tree mod log users we may have recorded mod log
3477 	 * operations for this node.  If we re-allocate this node we
3478 	 * could replay operations on this node that happened when it
3479 	 * existed in a completely different root.  For example if it
3480 	 * was part of root A, then was reallocated to root B, and we
3481 	 * are doing a btrfs_old_search_slot(root b), we could replay
3482 	 * operations that happened when the block was part of root A,
3483 	 * giving us an inconsistent view of the btree.
3484 	 *
3485 	 * We are safe from races here because at this point no other
3486 	 * node or root points to this extent buffer, so if after this
3487 	 * check a new tree mod log user joins we will not have an
3488 	 * existing log of operations on this node that we have to
3489 	 * contend with.
3490 	 */
3491 
3492 	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3493 		     || btrfs_is_zoned(fs_info)) {
3494 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3495 		btrfs_put_block_group(bg);
3496 		goto out;
3497 	}
3498 
3499 	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3500 
3501 	btrfs_add_free_space(bg, buf->start, buf->len);
3502 	btrfs_free_reserved_bytes(bg, buf->len, 0);
3503 	btrfs_put_block_group(bg);
3504 	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3505 
3506 out:
3507 
3508 	/*
3509 	 * Deleting the buffer, clear the corrupt flag since it doesn't
3510 	 * matter anymore.
3511 	 */
3512 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3513 }
3514 
3515 /* Can return -ENOMEM */
3516 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3517 {
3518 	struct btrfs_fs_info *fs_info = trans->fs_info;
3519 	int ret;
3520 
3521 	if (btrfs_is_testing(fs_info))
3522 		return 0;
3523 
3524 	/*
3525 	 * tree log blocks never actually go into the extent allocation
3526 	 * tree, just update pinning info and exit early.
3527 	 */
3528 	if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
3529 		btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
3530 		ret = 0;
3531 	} else if (ref->type == BTRFS_REF_METADATA) {
3532 		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3533 	} else {
3534 		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3535 	}
3536 
3537 	if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
3538 		btrfs_ref_tree_mod(fs_info, ref);
3539 
3540 	return ret;
3541 }
3542 
3543 enum btrfs_loop_type {
3544 	/*
3545 	 * Start caching block groups but do not wait for progress or for them
3546 	 * to be done.
3547 	 */
3548 	LOOP_CACHING_NOWAIT,
3549 
3550 	/*
3551 	 * Wait for the block group free_space >= the space we're waiting for if
3552 	 * the block group isn't cached.
3553 	 */
3554 	LOOP_CACHING_WAIT,
3555 
3556 	/*
3557 	 * Allow allocations to happen from block groups that do not yet have a
3558 	 * size classification.
3559 	 */
3560 	LOOP_UNSET_SIZE_CLASS,
3561 
3562 	/*
3563 	 * Allocate a chunk and then retry the allocation.
3564 	 */
3565 	LOOP_ALLOC_CHUNK,
3566 
3567 	/*
3568 	 * Ignore the size class restrictions for this allocation.
3569 	 */
3570 	LOOP_WRONG_SIZE_CLASS,
3571 
3572 	/*
3573 	 * Ignore the empty size, only try to allocate the number of bytes
3574 	 * needed for this allocation.
3575 	 */
3576 	LOOP_NO_EMPTY_SIZE,
3577 };
3578 
3579 static inline void
3580 btrfs_lock_block_group(struct btrfs_block_group *cache,
3581 		       int delalloc)
3582 {
3583 	if (delalloc)
3584 		down_read(&cache->data_rwsem);
3585 }
3586 
3587 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3588 		       int delalloc)
3589 {
3590 	btrfs_get_block_group(cache);
3591 	if (delalloc)
3592 		down_read(&cache->data_rwsem);
3593 }
3594 
3595 static struct btrfs_block_group *btrfs_lock_cluster(
3596 		   struct btrfs_block_group *block_group,
3597 		   struct btrfs_free_cluster *cluster,
3598 		   int delalloc)
3599 	__acquires(&cluster->refill_lock)
3600 {
3601 	struct btrfs_block_group *used_bg = NULL;
3602 
3603 	spin_lock(&cluster->refill_lock);
3604 	while (1) {
3605 		used_bg = cluster->block_group;
3606 		if (!used_bg)
3607 			return NULL;
3608 
3609 		if (used_bg == block_group)
3610 			return used_bg;
3611 
3612 		btrfs_get_block_group(used_bg);
3613 
3614 		if (!delalloc)
3615 			return used_bg;
3616 
3617 		if (down_read_trylock(&used_bg->data_rwsem))
3618 			return used_bg;
3619 
3620 		spin_unlock(&cluster->refill_lock);
3621 
3622 		/* We should only have one-level nested. */
3623 		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3624 
3625 		spin_lock(&cluster->refill_lock);
3626 		if (used_bg == cluster->block_group)
3627 			return used_bg;
3628 
3629 		up_read(&used_bg->data_rwsem);
3630 		btrfs_put_block_group(used_bg);
3631 	}
3632 }
3633 
3634 static inline void
3635 btrfs_release_block_group(struct btrfs_block_group *cache,
3636 			 int delalloc)
3637 {
3638 	if (delalloc)
3639 		up_read(&cache->data_rwsem);
3640 	btrfs_put_block_group(cache);
3641 }
3642 
3643 /*
3644  * Helper function for find_free_extent().
3645  *
3646  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3647  * Return >0 to inform caller that we find nothing
3648  * Return 0 means we have found a location and set ffe_ctl->found_offset.
3649  */
3650 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3651 				      struct find_free_extent_ctl *ffe_ctl,
3652 				      struct btrfs_block_group **cluster_bg_ret)
3653 {
3654 	struct btrfs_block_group *cluster_bg;
3655 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3656 	u64 aligned_cluster;
3657 	u64 offset;
3658 	int ret;
3659 
3660 	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3661 	if (!cluster_bg)
3662 		goto refill_cluster;
3663 	if (cluster_bg != bg && (cluster_bg->ro ||
3664 	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
3665 		goto release_cluster;
3666 
3667 	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3668 			ffe_ctl->num_bytes, cluster_bg->start,
3669 			&ffe_ctl->max_extent_size);
3670 	if (offset) {
3671 		/* We have a block, we're done */
3672 		spin_unlock(&last_ptr->refill_lock);
3673 		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3674 		*cluster_bg_ret = cluster_bg;
3675 		ffe_ctl->found_offset = offset;
3676 		return 0;
3677 	}
3678 	WARN_ON(last_ptr->block_group != cluster_bg);
3679 
3680 release_cluster:
3681 	/*
3682 	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3683 	 * lets just skip it and let the allocator find whatever block it can
3684 	 * find. If we reach this point, we will have tried the cluster
3685 	 * allocator plenty of times and not have found anything, so we are
3686 	 * likely way too fragmented for the clustering stuff to find anything.
3687 	 *
3688 	 * However, if the cluster is taken from the current block group,
3689 	 * release the cluster first, so that we stand a better chance of
3690 	 * succeeding in the unclustered allocation.
3691 	 */
3692 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3693 		spin_unlock(&last_ptr->refill_lock);
3694 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3695 		return -ENOENT;
3696 	}
3697 
3698 	/* This cluster didn't work out, free it and start over */
3699 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3700 
3701 	if (cluster_bg != bg)
3702 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3703 
3704 refill_cluster:
3705 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3706 		spin_unlock(&last_ptr->refill_lock);
3707 		return -ENOENT;
3708 	}
3709 
3710 	aligned_cluster = max_t(u64,
3711 			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3712 			bg->full_stripe_len);
3713 	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3714 			ffe_ctl->num_bytes, aligned_cluster);
3715 	if (ret == 0) {
3716 		/* Now pull our allocation out of this cluster */
3717 		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3718 				ffe_ctl->num_bytes, ffe_ctl->search_start,
3719 				&ffe_ctl->max_extent_size);
3720 		if (offset) {
3721 			/* We found one, proceed */
3722 			spin_unlock(&last_ptr->refill_lock);
3723 			ffe_ctl->found_offset = offset;
3724 			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3725 			return 0;
3726 		}
3727 	}
3728 	/*
3729 	 * At this point we either didn't find a cluster or we weren't able to
3730 	 * allocate a block from our cluster.  Free the cluster we've been
3731 	 * trying to use, and go to the next block group.
3732 	 */
3733 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3734 	spin_unlock(&last_ptr->refill_lock);
3735 	return 1;
3736 }
3737 
3738 /*
3739  * Return >0 to inform caller that we find nothing
3740  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3741  */
3742 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3743 					struct find_free_extent_ctl *ffe_ctl)
3744 {
3745 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3746 	u64 offset;
3747 
3748 	/*
3749 	 * We are doing an unclustered allocation, set the fragmented flag so
3750 	 * we don't bother trying to setup a cluster again until we get more
3751 	 * space.
3752 	 */
3753 	if (unlikely(last_ptr)) {
3754 		spin_lock(&last_ptr->lock);
3755 		last_ptr->fragmented = 1;
3756 		spin_unlock(&last_ptr->lock);
3757 	}
3758 	if (ffe_ctl->cached) {
3759 		struct btrfs_free_space_ctl *free_space_ctl;
3760 
3761 		free_space_ctl = bg->free_space_ctl;
3762 		spin_lock(&free_space_ctl->tree_lock);
3763 		if (free_space_ctl->free_space <
3764 		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3765 		    ffe_ctl->empty_size) {
3766 			ffe_ctl->total_free_space = max_t(u64,
3767 					ffe_ctl->total_free_space,
3768 					free_space_ctl->free_space);
3769 			spin_unlock(&free_space_ctl->tree_lock);
3770 			return 1;
3771 		}
3772 		spin_unlock(&free_space_ctl->tree_lock);
3773 	}
3774 
3775 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3776 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3777 			&ffe_ctl->max_extent_size);
3778 	if (!offset)
3779 		return 1;
3780 	ffe_ctl->found_offset = offset;
3781 	return 0;
3782 }
3783 
3784 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3785 				   struct find_free_extent_ctl *ffe_ctl,
3786 				   struct btrfs_block_group **bg_ret)
3787 {
3788 	int ret;
3789 
3790 	/* We want to try and use the cluster allocator, so lets look there */
3791 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3792 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3793 		if (ret >= 0)
3794 			return ret;
3795 		/* ret == -ENOENT case falls through */
3796 	}
3797 
3798 	return find_free_extent_unclustered(block_group, ffe_ctl);
3799 }
3800 
3801 /*
3802  * Tree-log block group locking
3803  * ============================
3804  *
3805  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3806  * indicates the starting address of a block group, which is reserved only
3807  * for tree-log metadata.
3808  *
3809  * Lock nesting
3810  * ============
3811  *
3812  * space_info::lock
3813  *   block_group::lock
3814  *     fs_info::treelog_bg_lock
3815  */
3816 
3817 /*
3818  * Simple allocator for sequential-only block group. It only allows sequential
3819  * allocation. No need to play with trees. This function also reserves the
3820  * bytes as in btrfs_add_reserved_bytes.
3821  */
3822 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3823 			       struct find_free_extent_ctl *ffe_ctl,
3824 			       struct btrfs_block_group **bg_ret)
3825 {
3826 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3827 	struct btrfs_space_info *space_info = block_group->space_info;
3828 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3829 	u64 start = block_group->start;
3830 	u64 num_bytes = ffe_ctl->num_bytes;
3831 	u64 avail;
3832 	u64 bytenr = block_group->start;
3833 	u64 log_bytenr;
3834 	u64 data_reloc_bytenr;
3835 	int ret = 0;
3836 	bool skip = false;
3837 
3838 	ASSERT(btrfs_is_zoned(block_group->fs_info));
3839 
3840 	/*
3841 	 * Do not allow non-tree-log blocks in the dedicated tree-log block
3842 	 * group, and vice versa.
3843 	 */
3844 	spin_lock(&fs_info->treelog_bg_lock);
3845 	log_bytenr = fs_info->treelog_bg;
3846 	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3847 			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3848 		skip = true;
3849 	spin_unlock(&fs_info->treelog_bg_lock);
3850 	if (skip)
3851 		return 1;
3852 
3853 	/*
3854 	 * Do not allow non-relocation blocks in the dedicated relocation block
3855 	 * group, and vice versa.
3856 	 */
3857 	spin_lock(&fs_info->relocation_bg_lock);
3858 	data_reloc_bytenr = fs_info->data_reloc_bg;
3859 	if (data_reloc_bytenr &&
3860 	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3861 	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3862 		skip = true;
3863 	spin_unlock(&fs_info->relocation_bg_lock);
3864 	if (skip)
3865 		return 1;
3866 
3867 	/* Check RO and no space case before trying to activate it */
3868 	spin_lock(&block_group->lock);
3869 	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3870 		ret = 1;
3871 		/*
3872 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3873 		 * Return the error after taking the locks.
3874 		 */
3875 	}
3876 	spin_unlock(&block_group->lock);
3877 
3878 	/* Metadata block group is activated at write time. */
3879 	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3880 	    !btrfs_zone_activate(block_group)) {
3881 		ret = 1;
3882 		/*
3883 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3884 		 * Return the error after taking the locks.
3885 		 */
3886 	}
3887 
3888 	spin_lock(&space_info->lock);
3889 	spin_lock(&block_group->lock);
3890 	spin_lock(&fs_info->treelog_bg_lock);
3891 	spin_lock(&fs_info->relocation_bg_lock);
3892 
3893 	if (ret)
3894 		goto out;
3895 
3896 	ASSERT(!ffe_ctl->for_treelog ||
3897 	       block_group->start == fs_info->treelog_bg ||
3898 	       fs_info->treelog_bg == 0);
3899 	ASSERT(!ffe_ctl->for_data_reloc ||
3900 	       block_group->start == fs_info->data_reloc_bg ||
3901 	       fs_info->data_reloc_bg == 0);
3902 
3903 	if (block_group->ro ||
3904 	    (!ffe_ctl->for_data_reloc &&
3905 	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3906 		ret = 1;
3907 		goto out;
3908 	}
3909 
3910 	/*
3911 	 * Do not allow currently using block group to be tree-log dedicated
3912 	 * block group.
3913 	 */
3914 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3915 	    (block_group->used || block_group->reserved)) {
3916 		ret = 1;
3917 		goto out;
3918 	}
3919 
3920 	/*
3921 	 * Do not allow currently used block group to be the data relocation
3922 	 * dedicated block group.
3923 	 */
3924 	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3925 	    (block_group->used || block_group->reserved)) {
3926 		ret = 1;
3927 		goto out;
3928 	}
3929 
3930 	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3931 	avail = block_group->zone_capacity - block_group->alloc_offset;
3932 	if (avail < num_bytes) {
3933 		if (ffe_ctl->max_extent_size < avail) {
3934 			/*
3935 			 * With sequential allocator, free space is always
3936 			 * contiguous
3937 			 */
3938 			ffe_ctl->max_extent_size = avail;
3939 			ffe_ctl->total_free_space = avail;
3940 		}
3941 		ret = 1;
3942 		goto out;
3943 	}
3944 
3945 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3946 		fs_info->treelog_bg = block_group->start;
3947 
3948 	if (ffe_ctl->for_data_reloc) {
3949 		if (!fs_info->data_reloc_bg)
3950 			fs_info->data_reloc_bg = block_group->start;
3951 		/*
3952 		 * Do not allow allocations from this block group, unless it is
3953 		 * for data relocation. Compared to increasing the ->ro, setting
3954 		 * the ->zoned_data_reloc_ongoing flag still allows nocow
3955 		 * writers to come in. See btrfs_inc_nocow_writers().
3956 		 *
3957 		 * We need to disable an allocation to avoid an allocation of
3958 		 * regular (non-relocation data) extent. With mix of relocation
3959 		 * extents and regular extents, we can dispatch WRITE commands
3960 		 * (for relocation extents) and ZONE APPEND commands (for
3961 		 * regular extents) at the same time to the same zone, which
3962 		 * easily break the write pointer.
3963 		 *
3964 		 * Also, this flag avoids this block group to be zone finished.
3965 		 */
3966 		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3967 	}
3968 
3969 	ffe_ctl->found_offset = start + block_group->alloc_offset;
3970 	block_group->alloc_offset += num_bytes;
3971 	spin_lock(&ctl->tree_lock);
3972 	ctl->free_space -= num_bytes;
3973 	spin_unlock(&ctl->tree_lock);
3974 
3975 	/*
3976 	 * We do not check if found_offset is aligned to stripesize. The
3977 	 * address is anyway rewritten when using zone append writing.
3978 	 */
3979 
3980 	ffe_ctl->search_start = ffe_ctl->found_offset;
3981 
3982 out:
3983 	if (ret && ffe_ctl->for_treelog)
3984 		fs_info->treelog_bg = 0;
3985 	if (ret && ffe_ctl->for_data_reloc)
3986 		fs_info->data_reloc_bg = 0;
3987 	spin_unlock(&fs_info->relocation_bg_lock);
3988 	spin_unlock(&fs_info->treelog_bg_lock);
3989 	spin_unlock(&block_group->lock);
3990 	spin_unlock(&space_info->lock);
3991 	return ret;
3992 }
3993 
3994 static int do_allocation(struct btrfs_block_group *block_group,
3995 			 struct find_free_extent_ctl *ffe_ctl,
3996 			 struct btrfs_block_group **bg_ret)
3997 {
3998 	switch (ffe_ctl->policy) {
3999 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4000 		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
4001 	case BTRFS_EXTENT_ALLOC_ZONED:
4002 		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
4003 	default:
4004 		BUG();
4005 	}
4006 }
4007 
4008 static void release_block_group(struct btrfs_block_group *block_group,
4009 				struct find_free_extent_ctl *ffe_ctl,
4010 				int delalloc)
4011 {
4012 	switch (ffe_ctl->policy) {
4013 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4014 		ffe_ctl->retry_uncached = false;
4015 		break;
4016 	case BTRFS_EXTENT_ALLOC_ZONED:
4017 		/* Nothing to do */
4018 		break;
4019 	default:
4020 		BUG();
4021 	}
4022 
4023 	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4024 	       ffe_ctl->index);
4025 	btrfs_release_block_group(block_group, delalloc);
4026 }
4027 
4028 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4029 				   struct btrfs_key *ins)
4030 {
4031 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4032 
4033 	if (!ffe_ctl->use_cluster && last_ptr) {
4034 		spin_lock(&last_ptr->lock);
4035 		last_ptr->window_start = ins->objectid;
4036 		spin_unlock(&last_ptr->lock);
4037 	}
4038 }
4039 
4040 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4041 			 struct btrfs_key *ins)
4042 {
4043 	switch (ffe_ctl->policy) {
4044 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4045 		found_extent_clustered(ffe_ctl, ins);
4046 		break;
4047 	case BTRFS_EXTENT_ALLOC_ZONED:
4048 		/* Nothing to do */
4049 		break;
4050 	default:
4051 		BUG();
4052 	}
4053 }
4054 
4055 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4056 				    struct find_free_extent_ctl *ffe_ctl)
4057 {
4058 	/* Block group's activeness is not a requirement for METADATA block groups. */
4059 	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4060 		return 0;
4061 
4062 	/* If we can activate new zone, just allocate a chunk and use it */
4063 	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4064 		return 0;
4065 
4066 	/*
4067 	 * We already reached the max active zones. Try to finish one block
4068 	 * group to make a room for a new block group. This is only possible
4069 	 * for a data block group because btrfs_zone_finish() may need to wait
4070 	 * for a running transaction which can cause a deadlock for metadata
4071 	 * allocation.
4072 	 */
4073 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4074 		int ret = btrfs_zone_finish_one_bg(fs_info);
4075 
4076 		if (ret == 1)
4077 			return 0;
4078 		else if (ret < 0)
4079 			return ret;
4080 	}
4081 
4082 	/*
4083 	 * If we have enough free space left in an already active block group
4084 	 * and we can't activate any other zone now, do not allow allocating a
4085 	 * new chunk and let find_free_extent() retry with a smaller size.
4086 	 */
4087 	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4088 		return -ENOSPC;
4089 
4090 	/*
4091 	 * Even min_alloc_size is not left in any block groups. Since we cannot
4092 	 * activate a new block group, allocating it may not help. Let's tell a
4093 	 * caller to try again and hope it progress something by writing some
4094 	 * parts of the region. That is only possible for data block groups,
4095 	 * where a part of the region can be written.
4096 	 */
4097 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4098 		return -EAGAIN;
4099 
4100 	/*
4101 	 * We cannot activate a new block group and no enough space left in any
4102 	 * block groups. So, allocating a new block group may not help. But,
4103 	 * there is nothing to do anyway, so let's go with it.
4104 	 */
4105 	return 0;
4106 }
4107 
4108 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4109 			      struct find_free_extent_ctl *ffe_ctl)
4110 {
4111 	switch (ffe_ctl->policy) {
4112 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4113 		return 0;
4114 	case BTRFS_EXTENT_ALLOC_ZONED:
4115 		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4116 	default:
4117 		BUG();
4118 	}
4119 }
4120 
4121 /*
4122  * Return >0 means caller needs to re-search for free extent
4123  * Return 0 means we have the needed free extent.
4124  * Return <0 means we failed to locate any free extent.
4125  */
4126 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4127 					struct btrfs_key *ins,
4128 					struct find_free_extent_ctl *ffe_ctl,
4129 					bool full_search)
4130 {
4131 	struct btrfs_root *root = fs_info->chunk_root;
4132 	int ret;
4133 
4134 	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4135 	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4136 		ffe_ctl->orig_have_caching_bg = true;
4137 
4138 	if (ins->objectid) {
4139 		found_extent(ffe_ctl, ins);
4140 		return 0;
4141 	}
4142 
4143 	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4144 		return 1;
4145 
4146 	ffe_ctl->index++;
4147 	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4148 		return 1;
4149 
4150 	/* See the comments for btrfs_loop_type for an explanation of the phases. */
4151 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4152 		ffe_ctl->index = 0;
4153 		/*
4154 		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4155 		 * any uncached bgs and we've already done a full search
4156 		 * through.
4157 		 */
4158 		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4159 		    (!ffe_ctl->orig_have_caching_bg && full_search))
4160 			ffe_ctl->loop++;
4161 		ffe_ctl->loop++;
4162 
4163 		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4164 			struct btrfs_trans_handle *trans;
4165 			int exist = 0;
4166 
4167 			/* Check if allocation policy allows to create a new chunk */
4168 			ret = can_allocate_chunk(fs_info, ffe_ctl);
4169 			if (ret)
4170 				return ret;
4171 
4172 			trans = current->journal_info;
4173 			if (trans)
4174 				exist = 1;
4175 			else
4176 				trans = btrfs_join_transaction(root);
4177 
4178 			if (IS_ERR(trans)) {
4179 				ret = PTR_ERR(trans);
4180 				return ret;
4181 			}
4182 
4183 			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4184 						CHUNK_ALLOC_FORCE_FOR_EXTENT);
4185 
4186 			/* Do not bail out on ENOSPC since we can do more. */
4187 			if (ret == -ENOSPC) {
4188 				ret = 0;
4189 				ffe_ctl->loop++;
4190 			}
4191 			else if (ret < 0)
4192 				btrfs_abort_transaction(trans, ret);
4193 			else
4194 				ret = 0;
4195 			if (!exist)
4196 				btrfs_end_transaction(trans);
4197 			if (ret)
4198 				return ret;
4199 		}
4200 
4201 		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4202 			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4203 				return -ENOSPC;
4204 
4205 			/*
4206 			 * Don't loop again if we already have no empty_size and
4207 			 * no empty_cluster.
4208 			 */
4209 			if (ffe_ctl->empty_size == 0 &&
4210 			    ffe_ctl->empty_cluster == 0)
4211 				return -ENOSPC;
4212 			ffe_ctl->empty_size = 0;
4213 			ffe_ctl->empty_cluster = 0;
4214 		}
4215 		return 1;
4216 	}
4217 	return -ENOSPC;
4218 }
4219 
4220 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4221 					      struct btrfs_block_group *bg)
4222 {
4223 	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4224 		return true;
4225 	if (!btrfs_block_group_should_use_size_class(bg))
4226 		return true;
4227 	if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4228 		return true;
4229 	if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4230 	    bg->size_class == BTRFS_BG_SZ_NONE)
4231 		return true;
4232 	return ffe_ctl->size_class == bg->size_class;
4233 }
4234 
4235 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4236 					struct find_free_extent_ctl *ffe_ctl,
4237 					struct btrfs_space_info *space_info,
4238 					struct btrfs_key *ins)
4239 {
4240 	/*
4241 	 * If our free space is heavily fragmented we may not be able to make
4242 	 * big contiguous allocations, so instead of doing the expensive search
4243 	 * for free space, simply return ENOSPC with our max_extent_size so we
4244 	 * can go ahead and search for a more manageable chunk.
4245 	 *
4246 	 * If our max_extent_size is large enough for our allocation simply
4247 	 * disable clustering since we will likely not be able to find enough
4248 	 * space to create a cluster and induce latency trying.
4249 	 */
4250 	if (space_info->max_extent_size) {
4251 		spin_lock(&space_info->lock);
4252 		if (space_info->max_extent_size &&
4253 		    ffe_ctl->num_bytes > space_info->max_extent_size) {
4254 			ins->offset = space_info->max_extent_size;
4255 			spin_unlock(&space_info->lock);
4256 			return -ENOSPC;
4257 		} else if (space_info->max_extent_size) {
4258 			ffe_ctl->use_cluster = false;
4259 		}
4260 		spin_unlock(&space_info->lock);
4261 	}
4262 
4263 	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4264 					       &ffe_ctl->empty_cluster);
4265 	if (ffe_ctl->last_ptr) {
4266 		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4267 
4268 		spin_lock(&last_ptr->lock);
4269 		if (last_ptr->block_group)
4270 			ffe_ctl->hint_byte = last_ptr->window_start;
4271 		if (last_ptr->fragmented) {
4272 			/*
4273 			 * We still set window_start so we can keep track of the
4274 			 * last place we found an allocation to try and save
4275 			 * some time.
4276 			 */
4277 			ffe_ctl->hint_byte = last_ptr->window_start;
4278 			ffe_ctl->use_cluster = false;
4279 		}
4280 		spin_unlock(&last_ptr->lock);
4281 	}
4282 
4283 	return 0;
4284 }
4285 
4286 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4287 				    struct find_free_extent_ctl *ffe_ctl)
4288 {
4289 	if (ffe_ctl->for_treelog) {
4290 		spin_lock(&fs_info->treelog_bg_lock);
4291 		if (fs_info->treelog_bg)
4292 			ffe_ctl->hint_byte = fs_info->treelog_bg;
4293 		spin_unlock(&fs_info->treelog_bg_lock);
4294 	} else if (ffe_ctl->for_data_reloc) {
4295 		spin_lock(&fs_info->relocation_bg_lock);
4296 		if (fs_info->data_reloc_bg)
4297 			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4298 		spin_unlock(&fs_info->relocation_bg_lock);
4299 	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4300 		struct btrfs_block_group *block_group;
4301 
4302 		spin_lock(&fs_info->zone_active_bgs_lock);
4303 		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4304 			/*
4305 			 * No lock is OK here because avail is monotinically
4306 			 * decreasing, and this is just a hint.
4307 			 */
4308 			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4309 
4310 			if (block_group_bits(block_group, ffe_ctl->flags) &&
4311 			    avail >= ffe_ctl->num_bytes) {
4312 				ffe_ctl->hint_byte = block_group->start;
4313 				break;
4314 			}
4315 		}
4316 		spin_unlock(&fs_info->zone_active_bgs_lock);
4317 	}
4318 
4319 	return 0;
4320 }
4321 
4322 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4323 			      struct find_free_extent_ctl *ffe_ctl,
4324 			      struct btrfs_space_info *space_info,
4325 			      struct btrfs_key *ins)
4326 {
4327 	switch (ffe_ctl->policy) {
4328 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4329 		return prepare_allocation_clustered(fs_info, ffe_ctl,
4330 						    space_info, ins);
4331 	case BTRFS_EXTENT_ALLOC_ZONED:
4332 		return prepare_allocation_zoned(fs_info, ffe_ctl);
4333 	default:
4334 		BUG();
4335 	}
4336 }
4337 
4338 /*
4339  * walks the btree of allocated extents and find a hole of a given size.
4340  * The key ins is changed to record the hole:
4341  * ins->objectid == start position
4342  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4343  * ins->offset == the size of the hole.
4344  * Any available blocks before search_start are skipped.
4345  *
4346  * If there is no suitable free space, we will record the max size of
4347  * the free space extent currently.
4348  *
4349  * The overall logic and call chain:
4350  *
4351  * find_free_extent()
4352  * |- Iterate through all block groups
4353  * |  |- Get a valid block group
4354  * |  |- Try to do clustered allocation in that block group
4355  * |  |- Try to do unclustered allocation in that block group
4356  * |  |- Check if the result is valid
4357  * |  |  |- If valid, then exit
4358  * |  |- Jump to next block group
4359  * |
4360  * |- Push harder to find free extents
4361  *    |- If not found, re-iterate all block groups
4362  */
4363 static noinline int find_free_extent(struct btrfs_root *root,
4364 				     struct btrfs_key *ins,
4365 				     struct find_free_extent_ctl *ffe_ctl)
4366 {
4367 	struct btrfs_fs_info *fs_info = root->fs_info;
4368 	int ret = 0;
4369 	int cache_block_group_error = 0;
4370 	struct btrfs_block_group *block_group = NULL;
4371 	struct btrfs_space_info *space_info;
4372 	bool full_search = false;
4373 
4374 	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4375 
4376 	ffe_ctl->search_start = 0;
4377 	/* For clustered allocation */
4378 	ffe_ctl->empty_cluster = 0;
4379 	ffe_ctl->last_ptr = NULL;
4380 	ffe_ctl->use_cluster = true;
4381 	ffe_ctl->have_caching_bg = false;
4382 	ffe_ctl->orig_have_caching_bg = false;
4383 	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4384 	ffe_ctl->loop = 0;
4385 	ffe_ctl->retry_uncached = false;
4386 	ffe_ctl->cached = 0;
4387 	ffe_ctl->max_extent_size = 0;
4388 	ffe_ctl->total_free_space = 0;
4389 	ffe_ctl->found_offset = 0;
4390 	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4391 	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4392 
4393 	if (btrfs_is_zoned(fs_info))
4394 		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4395 
4396 	ins->type = BTRFS_EXTENT_ITEM_KEY;
4397 	ins->objectid = 0;
4398 	ins->offset = 0;
4399 
4400 	trace_find_free_extent(root, ffe_ctl);
4401 
4402 	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4403 	if (!space_info) {
4404 		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4405 		return -ENOSPC;
4406 	}
4407 
4408 	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4409 	if (ret < 0)
4410 		return ret;
4411 
4412 	ffe_ctl->search_start = max(ffe_ctl->search_start,
4413 				    first_logical_byte(fs_info));
4414 	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4415 	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4416 		block_group = btrfs_lookup_block_group(fs_info,
4417 						       ffe_ctl->search_start);
4418 		/*
4419 		 * we don't want to use the block group if it doesn't match our
4420 		 * allocation bits, or if its not cached.
4421 		 *
4422 		 * However if we are re-searching with an ideal block group
4423 		 * picked out then we don't care that the block group is cached.
4424 		 */
4425 		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4426 		    block_group->cached != BTRFS_CACHE_NO) {
4427 			down_read(&space_info->groups_sem);
4428 			if (list_empty(&block_group->list) ||
4429 			    block_group->ro) {
4430 				/*
4431 				 * someone is removing this block group,
4432 				 * we can't jump into the have_block_group
4433 				 * target because our list pointers are not
4434 				 * valid
4435 				 */
4436 				btrfs_put_block_group(block_group);
4437 				up_read(&space_info->groups_sem);
4438 			} else {
4439 				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4440 							block_group->flags);
4441 				btrfs_lock_block_group(block_group,
4442 						       ffe_ctl->delalloc);
4443 				ffe_ctl->hinted = true;
4444 				goto have_block_group;
4445 			}
4446 		} else if (block_group) {
4447 			btrfs_put_block_group(block_group);
4448 		}
4449 	}
4450 search:
4451 	trace_find_free_extent_search_loop(root, ffe_ctl);
4452 	ffe_ctl->have_caching_bg = false;
4453 	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4454 	    ffe_ctl->index == 0)
4455 		full_search = true;
4456 	down_read(&space_info->groups_sem);
4457 	list_for_each_entry(block_group,
4458 			    &space_info->block_groups[ffe_ctl->index], list) {
4459 		struct btrfs_block_group *bg_ret;
4460 
4461 		ffe_ctl->hinted = false;
4462 		/* If the block group is read-only, we can skip it entirely. */
4463 		if (unlikely(block_group->ro)) {
4464 			if (ffe_ctl->for_treelog)
4465 				btrfs_clear_treelog_bg(block_group);
4466 			if (ffe_ctl->for_data_reloc)
4467 				btrfs_clear_data_reloc_bg(block_group);
4468 			continue;
4469 		}
4470 
4471 		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4472 		ffe_ctl->search_start = block_group->start;
4473 
4474 		/*
4475 		 * this can happen if we end up cycling through all the
4476 		 * raid types, but we want to make sure we only allocate
4477 		 * for the proper type.
4478 		 */
4479 		if (!block_group_bits(block_group, ffe_ctl->flags)) {
4480 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4481 				BTRFS_BLOCK_GROUP_RAID1_MASK |
4482 				BTRFS_BLOCK_GROUP_RAID56_MASK |
4483 				BTRFS_BLOCK_GROUP_RAID10;
4484 
4485 			/*
4486 			 * if they asked for extra copies and this block group
4487 			 * doesn't provide them, bail.  This does allow us to
4488 			 * fill raid0 from raid1.
4489 			 */
4490 			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4491 				goto loop;
4492 
4493 			/*
4494 			 * This block group has different flags than we want.
4495 			 * It's possible that we have MIXED_GROUP flag but no
4496 			 * block group is mixed.  Just skip such block group.
4497 			 */
4498 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4499 			continue;
4500 		}
4501 
4502 have_block_group:
4503 		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4504 		ffe_ctl->cached = btrfs_block_group_done(block_group);
4505 		if (unlikely(!ffe_ctl->cached)) {
4506 			ffe_ctl->have_caching_bg = true;
4507 			ret = btrfs_cache_block_group(block_group, false);
4508 
4509 			/*
4510 			 * If we get ENOMEM here or something else we want to
4511 			 * try other block groups, because it may not be fatal.
4512 			 * However if we can't find anything else we need to
4513 			 * save our return here so that we return the actual
4514 			 * error that caused problems, not ENOSPC.
4515 			 */
4516 			if (ret < 0) {
4517 				if (!cache_block_group_error)
4518 					cache_block_group_error = ret;
4519 				ret = 0;
4520 				goto loop;
4521 			}
4522 			ret = 0;
4523 		}
4524 
4525 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4526 			if (!cache_block_group_error)
4527 				cache_block_group_error = -EIO;
4528 			goto loop;
4529 		}
4530 
4531 		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4532 			goto loop;
4533 
4534 		bg_ret = NULL;
4535 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4536 		if (ret > 0)
4537 			goto loop;
4538 
4539 		if (bg_ret && bg_ret != block_group) {
4540 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4541 			block_group = bg_ret;
4542 		}
4543 
4544 		/* Checks */
4545 		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4546 						 fs_info->stripesize);
4547 
4548 		/* move on to the next group */
4549 		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4550 		    block_group->start + block_group->length) {
4551 			btrfs_add_free_space_unused(block_group,
4552 					    ffe_ctl->found_offset,
4553 					    ffe_ctl->num_bytes);
4554 			goto loop;
4555 		}
4556 
4557 		if (ffe_ctl->found_offset < ffe_ctl->search_start)
4558 			btrfs_add_free_space_unused(block_group,
4559 					ffe_ctl->found_offset,
4560 					ffe_ctl->search_start - ffe_ctl->found_offset);
4561 
4562 		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4563 					       ffe_ctl->num_bytes,
4564 					       ffe_ctl->delalloc,
4565 					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4566 		if (ret == -EAGAIN) {
4567 			btrfs_add_free_space_unused(block_group,
4568 					ffe_ctl->found_offset,
4569 					ffe_ctl->num_bytes);
4570 			goto loop;
4571 		}
4572 		btrfs_inc_block_group_reservations(block_group);
4573 
4574 		/* we are all good, lets return */
4575 		ins->objectid = ffe_ctl->search_start;
4576 		ins->offset = ffe_ctl->num_bytes;
4577 
4578 		trace_btrfs_reserve_extent(block_group, ffe_ctl);
4579 		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4580 		break;
4581 loop:
4582 		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4583 		    !ffe_ctl->retry_uncached) {
4584 			ffe_ctl->retry_uncached = true;
4585 			btrfs_wait_block_group_cache_progress(block_group,
4586 						ffe_ctl->num_bytes +
4587 						ffe_ctl->empty_cluster +
4588 						ffe_ctl->empty_size);
4589 			goto have_block_group;
4590 		}
4591 		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4592 		cond_resched();
4593 	}
4594 	up_read(&space_info->groups_sem);
4595 
4596 	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4597 	if (ret > 0)
4598 		goto search;
4599 
4600 	if (ret == -ENOSPC && !cache_block_group_error) {
4601 		/*
4602 		 * Use ffe_ctl->total_free_space as fallback if we can't find
4603 		 * any contiguous hole.
4604 		 */
4605 		if (!ffe_ctl->max_extent_size)
4606 			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4607 		spin_lock(&space_info->lock);
4608 		space_info->max_extent_size = ffe_ctl->max_extent_size;
4609 		spin_unlock(&space_info->lock);
4610 		ins->offset = ffe_ctl->max_extent_size;
4611 	} else if (ret == -ENOSPC) {
4612 		ret = cache_block_group_error;
4613 	}
4614 	return ret;
4615 }
4616 
4617 /*
4618  * Entry point to the extent allocator. Tries to find a hole that is at least
4619  * as big as @num_bytes.
4620  *
4621  * @root           -	The root that will contain this extent
4622  *
4623  * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
4624  *			is used for accounting purposes. This value differs
4625  *			from @num_bytes only in the case of compressed extents.
4626  *
4627  * @num_bytes      -	Number of bytes to allocate on-disk.
4628  *
4629  * @min_alloc_size -	Indicates the minimum amount of space that the
4630  *			allocator should try to satisfy. In some cases
4631  *			@num_bytes may be larger than what is required and if
4632  *			the filesystem is fragmented then allocation fails.
4633  *			However, the presence of @min_alloc_size gives a
4634  *			chance to try and satisfy the smaller allocation.
4635  *
4636  * @empty_size     -	A hint that you plan on doing more COW. This is the
4637  *			size in bytes the allocator should try to find free
4638  *			next to the block it returns.  This is just a hint and
4639  *			may be ignored by the allocator.
4640  *
4641  * @hint_byte      -	Hint to the allocator to start searching above the byte
4642  *			address passed. It might be ignored.
4643  *
4644  * @ins            -	This key is modified to record the found hole. It will
4645  *			have the following values:
4646  *			ins->objectid == start position
4647  *			ins->flags = BTRFS_EXTENT_ITEM_KEY
4648  *			ins->offset == the size of the hole.
4649  *
4650  * @is_data        -	Boolean flag indicating whether an extent is
4651  *			allocated for data (true) or metadata (false)
4652  *
4653  * @delalloc       -	Boolean flag indicating whether this allocation is for
4654  *			delalloc or not. If 'true' data_rwsem of block groups
4655  *			is going to be acquired.
4656  *
4657  *
4658  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4659  * case -ENOSPC is returned then @ins->offset will contain the size of the
4660  * largest available hole the allocator managed to find.
4661  */
4662 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4663 			 u64 num_bytes, u64 min_alloc_size,
4664 			 u64 empty_size, u64 hint_byte,
4665 			 struct btrfs_key *ins, int is_data, int delalloc)
4666 {
4667 	struct btrfs_fs_info *fs_info = root->fs_info;
4668 	struct find_free_extent_ctl ffe_ctl = {};
4669 	bool final_tried = num_bytes == min_alloc_size;
4670 	u64 flags;
4671 	int ret;
4672 	bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
4673 	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4674 
4675 	flags = get_alloc_profile_by_root(root, is_data);
4676 again:
4677 	WARN_ON(num_bytes < fs_info->sectorsize);
4678 
4679 	ffe_ctl.ram_bytes = ram_bytes;
4680 	ffe_ctl.num_bytes = num_bytes;
4681 	ffe_ctl.min_alloc_size = min_alloc_size;
4682 	ffe_ctl.empty_size = empty_size;
4683 	ffe_ctl.flags = flags;
4684 	ffe_ctl.delalloc = delalloc;
4685 	ffe_ctl.hint_byte = hint_byte;
4686 	ffe_ctl.for_treelog = for_treelog;
4687 	ffe_ctl.for_data_reloc = for_data_reloc;
4688 
4689 	ret = find_free_extent(root, ins, &ffe_ctl);
4690 	if (!ret && !is_data) {
4691 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4692 	} else if (ret == -ENOSPC) {
4693 		if (!final_tried && ins->offset) {
4694 			num_bytes = min(num_bytes >> 1, ins->offset);
4695 			num_bytes = round_down(num_bytes,
4696 					       fs_info->sectorsize);
4697 			num_bytes = max(num_bytes, min_alloc_size);
4698 			ram_bytes = num_bytes;
4699 			if (num_bytes == min_alloc_size)
4700 				final_tried = true;
4701 			goto again;
4702 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4703 			struct btrfs_space_info *sinfo;
4704 
4705 			sinfo = btrfs_find_space_info(fs_info, flags);
4706 			btrfs_err(fs_info,
4707 	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4708 				  flags, num_bytes, for_treelog, for_data_reloc);
4709 			if (sinfo)
4710 				btrfs_dump_space_info(fs_info, sinfo,
4711 						      num_bytes, 1);
4712 		}
4713 	}
4714 
4715 	return ret;
4716 }
4717 
4718 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4719 			       u64 start, u64 len, int delalloc)
4720 {
4721 	struct btrfs_block_group *cache;
4722 
4723 	cache = btrfs_lookup_block_group(fs_info, start);
4724 	if (!cache) {
4725 		btrfs_err(fs_info, "Unable to find block group for %llu",
4726 			  start);
4727 		return -ENOSPC;
4728 	}
4729 
4730 	btrfs_add_free_space(cache, start, len);
4731 	btrfs_free_reserved_bytes(cache, len, delalloc);
4732 	trace_btrfs_reserved_extent_free(fs_info, start, len);
4733 
4734 	btrfs_put_block_group(cache);
4735 	return 0;
4736 }
4737 
4738 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4739 			      const struct extent_buffer *eb)
4740 {
4741 	struct btrfs_block_group *cache;
4742 	int ret = 0;
4743 
4744 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4745 	if (!cache) {
4746 		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4747 			  eb->start);
4748 		return -ENOSPC;
4749 	}
4750 
4751 	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4752 	btrfs_put_block_group(cache);
4753 	return ret;
4754 }
4755 
4756 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4757 				 u64 num_bytes)
4758 {
4759 	struct btrfs_fs_info *fs_info = trans->fs_info;
4760 	int ret;
4761 
4762 	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4763 	if (ret)
4764 		return ret;
4765 
4766 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4767 	if (ret) {
4768 		ASSERT(!ret);
4769 		btrfs_err(fs_info, "update block group failed for %llu %llu",
4770 			  bytenr, num_bytes);
4771 		return ret;
4772 	}
4773 
4774 	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4775 	return 0;
4776 }
4777 
4778 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4779 				      u64 parent, u64 root_objectid,
4780 				      u64 flags, u64 owner, u64 offset,
4781 				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
4782 {
4783 	struct btrfs_fs_info *fs_info = trans->fs_info;
4784 	struct btrfs_root *extent_root;
4785 	int ret;
4786 	struct btrfs_extent_item *extent_item;
4787 	struct btrfs_extent_owner_ref *oref;
4788 	struct btrfs_extent_inline_ref *iref;
4789 	struct btrfs_path *path;
4790 	struct extent_buffer *leaf;
4791 	int type;
4792 	u32 size;
4793 	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
4794 
4795 	if (parent > 0)
4796 		type = BTRFS_SHARED_DATA_REF_KEY;
4797 	else
4798 		type = BTRFS_EXTENT_DATA_REF_KEY;
4799 
4800 	size = sizeof(*extent_item);
4801 	if (simple_quota)
4802 		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4803 	size += btrfs_extent_inline_ref_size(type);
4804 
4805 	path = btrfs_alloc_path();
4806 	if (!path)
4807 		return -ENOMEM;
4808 
4809 	extent_root = btrfs_extent_root(fs_info, ins->objectid);
4810 	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4811 	if (ret) {
4812 		btrfs_free_path(path);
4813 		return ret;
4814 	}
4815 
4816 	leaf = path->nodes[0];
4817 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4818 				     struct btrfs_extent_item);
4819 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4820 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4821 	btrfs_set_extent_flags(leaf, extent_item,
4822 			       flags | BTRFS_EXTENT_FLAG_DATA);
4823 
4824 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4825 	if (simple_quota) {
4826 		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4827 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
4828 		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
4829 		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4830 	}
4831 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4832 
4833 	if (parent > 0) {
4834 		struct btrfs_shared_data_ref *ref;
4835 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4836 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4837 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4838 	} else {
4839 		struct btrfs_extent_data_ref *ref;
4840 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4841 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4842 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4843 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4844 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4845 	}
4846 
4847 	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4848 	btrfs_free_path(path);
4849 
4850 	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4851 }
4852 
4853 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4854 				     struct btrfs_delayed_ref_node *node,
4855 				     struct btrfs_delayed_extent_op *extent_op)
4856 {
4857 	struct btrfs_fs_info *fs_info = trans->fs_info;
4858 	struct btrfs_root *extent_root;
4859 	int ret;
4860 	struct btrfs_extent_item *extent_item;
4861 	struct btrfs_key extent_key;
4862 	struct btrfs_tree_block_info *block_info;
4863 	struct btrfs_extent_inline_ref *iref;
4864 	struct btrfs_path *path;
4865 	struct extent_buffer *leaf;
4866 	u32 size = sizeof(*extent_item) + sizeof(*iref);
4867 	u64 flags = extent_op->flags_to_set;
4868 	/* The owner of a tree block is the level. */
4869 	int level = btrfs_delayed_ref_owner(node);
4870 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4871 
4872 	extent_key.objectid = node->bytenr;
4873 	if (skinny_metadata) {
4874 		/* The owner of a tree block is the level. */
4875 		extent_key.offset = level;
4876 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4877 	} else {
4878 		extent_key.offset = node->num_bytes;
4879 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4880 		size += sizeof(*block_info);
4881 	}
4882 
4883 	path = btrfs_alloc_path();
4884 	if (!path)
4885 		return -ENOMEM;
4886 
4887 	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4888 	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4889 				      size);
4890 	if (ret) {
4891 		btrfs_free_path(path);
4892 		return ret;
4893 	}
4894 
4895 	leaf = path->nodes[0];
4896 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4897 				     struct btrfs_extent_item);
4898 	btrfs_set_extent_refs(leaf, extent_item, 1);
4899 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4900 	btrfs_set_extent_flags(leaf, extent_item,
4901 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4902 
4903 	if (skinny_metadata) {
4904 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4905 	} else {
4906 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4907 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4908 		btrfs_set_tree_block_level(leaf, block_info, level);
4909 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4910 	}
4911 
4912 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4913 		btrfs_set_extent_inline_ref_type(leaf, iref,
4914 						 BTRFS_SHARED_BLOCK_REF_KEY);
4915 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
4916 	} else {
4917 		btrfs_set_extent_inline_ref_type(leaf, iref,
4918 						 BTRFS_TREE_BLOCK_REF_KEY);
4919 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
4920 	}
4921 
4922 	btrfs_mark_buffer_dirty(trans, leaf);
4923 	btrfs_free_path(path);
4924 
4925 	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4926 }
4927 
4928 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4929 				     struct btrfs_root *root, u64 owner,
4930 				     u64 offset, u64 ram_bytes,
4931 				     struct btrfs_key *ins)
4932 {
4933 	struct btrfs_ref generic_ref = {
4934 		.action = BTRFS_ADD_DELAYED_EXTENT,
4935 		.bytenr = ins->objectid,
4936 		.num_bytes = ins->offset,
4937 		.owning_root = btrfs_root_id(root),
4938 		.ref_root = btrfs_root_id(root),
4939 	};
4940 
4941 	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
4942 
4943 	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4944 		generic_ref.owning_root = root->relocation_src_root;
4945 
4946 	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
4947 	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4948 
4949 	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4950 }
4951 
4952 /*
4953  * this is used by the tree logging recovery code.  It records that
4954  * an extent has been allocated and makes sure to clear the free
4955  * space cache bits as well
4956  */
4957 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4958 				   u64 root_objectid, u64 owner, u64 offset,
4959 				   struct btrfs_key *ins)
4960 {
4961 	struct btrfs_fs_info *fs_info = trans->fs_info;
4962 	int ret;
4963 	struct btrfs_block_group *block_group;
4964 	struct btrfs_space_info *space_info;
4965 	struct btrfs_squota_delta delta = {
4966 		.root = root_objectid,
4967 		.num_bytes = ins->offset,
4968 		.generation = trans->transid,
4969 		.is_data = true,
4970 		.is_inc = true,
4971 	};
4972 
4973 	/*
4974 	 * Mixed block groups will exclude before processing the log so we only
4975 	 * need to do the exclude dance if this fs isn't mixed.
4976 	 */
4977 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4978 		ret = __exclude_logged_extent(fs_info, ins->objectid,
4979 					      ins->offset);
4980 		if (ret)
4981 			return ret;
4982 	}
4983 
4984 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4985 	if (!block_group)
4986 		return -EINVAL;
4987 
4988 	space_info = block_group->space_info;
4989 	spin_lock(&space_info->lock);
4990 	spin_lock(&block_group->lock);
4991 	space_info->bytes_reserved += ins->offset;
4992 	block_group->reserved += ins->offset;
4993 	spin_unlock(&block_group->lock);
4994 	spin_unlock(&space_info->lock);
4995 
4996 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4997 					 offset, ins, 1, root_objectid);
4998 	if (ret)
4999 		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
5000 	ret = btrfs_record_squota_delta(fs_info, &delta);
5001 	btrfs_put_block_group(block_group);
5002 	return ret;
5003 }
5004 
5005 #ifdef CONFIG_BTRFS_DEBUG
5006 /*
5007  * Extra safety check in case the extent tree is corrupted and extent allocator
5008  * chooses to use a tree block which is already used and locked.
5009  */
5010 static bool check_eb_lock_owner(const struct extent_buffer *eb)
5011 {
5012 	if (eb->lock_owner == current->pid) {
5013 		btrfs_err_rl(eb->fs_info,
5014 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5015 			     eb->start, btrfs_header_owner(eb), current->pid);
5016 		return true;
5017 	}
5018 	return false;
5019 }
5020 #else
5021 static bool check_eb_lock_owner(struct extent_buffer *eb)
5022 {
5023 	return false;
5024 }
5025 #endif
5026 
5027 static struct extent_buffer *
5028 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5029 		      u64 bytenr, int level, u64 owner,
5030 		      enum btrfs_lock_nesting nest)
5031 {
5032 	struct btrfs_fs_info *fs_info = root->fs_info;
5033 	struct extent_buffer *buf;
5034 	u64 lockdep_owner = owner;
5035 
5036 	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
5037 	if (IS_ERR(buf))
5038 		return buf;
5039 
5040 	if (check_eb_lock_owner(buf)) {
5041 		free_extent_buffer(buf);
5042 		return ERR_PTR(-EUCLEAN);
5043 	}
5044 
5045 	/*
5046 	 * The reloc trees are just snapshots, so we need them to appear to be
5047 	 * just like any other fs tree WRT lockdep.
5048 	 *
5049 	 * The exception however is in replace_path() in relocation, where we
5050 	 * hold the lock on the original fs root and then search for the reloc
5051 	 * root.  At that point we need to make sure any reloc root buffers are
5052 	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5053 	 * lockdep happy.
5054 	 */
5055 	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5056 	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5057 		lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5058 
5059 	/* btrfs_clear_buffer_dirty() accesses generation field. */
5060 	btrfs_set_header_generation(buf, trans->transid);
5061 
5062 	/*
5063 	 * This needs to stay, because we could allocate a freed block from an
5064 	 * old tree into a new tree, so we need to make sure this new block is
5065 	 * set to the appropriate level and owner.
5066 	 */
5067 	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5068 
5069 	btrfs_tree_lock_nested(buf, nest);
5070 	btrfs_clear_buffer_dirty(trans, buf);
5071 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
5072 	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
5073 
5074 	set_extent_buffer_uptodate(buf);
5075 
5076 	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5077 	btrfs_set_header_level(buf, level);
5078 	btrfs_set_header_bytenr(buf, buf->start);
5079 	btrfs_set_header_generation(buf, trans->transid);
5080 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5081 	btrfs_set_header_owner(buf, owner);
5082 	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
5083 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
5084 	if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
5085 		buf->log_index = root->log_transid % 2;
5086 		/*
5087 		 * we allow two log transactions at a time, use different
5088 		 * EXTENT bit to differentiate dirty pages.
5089 		 */
5090 		if (buf->log_index == 0)
5091 			set_extent_bit(&root->dirty_log_pages, buf->start,
5092 				       buf->start + buf->len - 1,
5093 				       EXTENT_DIRTY, NULL);
5094 		else
5095 			set_extent_bit(&root->dirty_log_pages, buf->start,
5096 				       buf->start + buf->len - 1,
5097 				       EXTENT_NEW, NULL);
5098 	} else {
5099 		buf->log_index = -1;
5100 		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
5101 			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
5102 	}
5103 	/* this returns a buffer locked for blocking */
5104 	return buf;
5105 }
5106 
5107 /*
5108  * finds a free extent and does all the dirty work required for allocation
5109  * returns the tree buffer or an ERR_PTR on error.
5110  */
5111 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
5112 					     struct btrfs_root *root,
5113 					     u64 parent, u64 root_objectid,
5114 					     const struct btrfs_disk_key *key,
5115 					     int level, u64 hint,
5116 					     u64 empty_size,
5117 					     u64 reloc_src_root,
5118 					     enum btrfs_lock_nesting nest)
5119 {
5120 	struct btrfs_fs_info *fs_info = root->fs_info;
5121 	struct btrfs_key ins;
5122 	struct btrfs_block_rsv *block_rsv;
5123 	struct extent_buffer *buf;
5124 	struct btrfs_delayed_extent_op *extent_op;
5125 	u64 flags = 0;
5126 	int ret;
5127 	u32 blocksize = fs_info->nodesize;
5128 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5129 	u64 owning_root;
5130 
5131 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5132 	if (btrfs_is_testing(fs_info)) {
5133 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
5134 					    level, root_objectid, nest);
5135 		if (!IS_ERR(buf))
5136 			root->alloc_bytenr += blocksize;
5137 		return buf;
5138 	}
5139 #endif
5140 
5141 	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
5142 	if (IS_ERR(block_rsv))
5143 		return ERR_CAST(block_rsv);
5144 
5145 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
5146 				   empty_size, hint, &ins, 0, 0);
5147 	if (ret)
5148 		goto out_unuse;
5149 
5150 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
5151 				    root_objectid, nest);
5152 	if (IS_ERR(buf)) {
5153 		ret = PTR_ERR(buf);
5154 		goto out_free_reserved;
5155 	}
5156 	owning_root = btrfs_header_owner(buf);
5157 
5158 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5159 		if (parent == 0)
5160 			parent = ins.objectid;
5161 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5162 		owning_root = reloc_src_root;
5163 	} else
5164 		BUG_ON(parent > 0);
5165 
5166 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5167 		struct btrfs_ref generic_ref = {
5168 			.action = BTRFS_ADD_DELAYED_EXTENT,
5169 			.bytenr = ins.objectid,
5170 			.num_bytes = ins.offset,
5171 			.parent = parent,
5172 			.owning_root = owning_root,
5173 			.ref_root = root_objectid,
5174 		};
5175 		extent_op = btrfs_alloc_delayed_extent_op();
5176 		if (!extent_op) {
5177 			ret = -ENOMEM;
5178 			goto out_free_buf;
5179 		}
5180 		if (key)
5181 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
5182 		else
5183 			memset(&extent_op->key, 0, sizeof(extent_op->key));
5184 		extent_op->flags_to_set = flags;
5185 		extent_op->update_key = skinny_metadata ? false : true;
5186 		extent_op->update_flags = true;
5187 		extent_op->level = level;
5188 
5189 		btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
5190 		btrfs_ref_tree_mod(fs_info, &generic_ref);
5191 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5192 		if (ret)
5193 			goto out_free_delayed;
5194 	}
5195 	return buf;
5196 
5197 out_free_delayed:
5198 	btrfs_free_delayed_extent_op(extent_op);
5199 out_free_buf:
5200 	btrfs_tree_unlock(buf);
5201 	free_extent_buffer(buf);
5202 out_free_reserved:
5203 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5204 out_unuse:
5205 	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5206 	return ERR_PTR(ret);
5207 }
5208 
5209 struct walk_control {
5210 	u64 refs[BTRFS_MAX_LEVEL];
5211 	u64 flags[BTRFS_MAX_LEVEL];
5212 	struct btrfs_key update_progress;
5213 	struct btrfs_key drop_progress;
5214 	int drop_level;
5215 	int stage;
5216 	int level;
5217 	int shared_level;
5218 	int update_ref;
5219 	int keep_locks;
5220 	int reada_slot;
5221 	int reada_count;
5222 	int restarted;
5223 };
5224 
5225 #define DROP_REFERENCE	1
5226 #define UPDATE_BACKREF	2
5227 
5228 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5229 				     struct btrfs_root *root,
5230 				     struct walk_control *wc,
5231 				     struct btrfs_path *path)
5232 {
5233 	struct btrfs_fs_info *fs_info = root->fs_info;
5234 	u64 bytenr;
5235 	u64 generation;
5236 	u64 refs;
5237 	u64 flags;
5238 	u32 nritems;
5239 	struct btrfs_key key;
5240 	struct extent_buffer *eb;
5241 	int ret;
5242 	int slot;
5243 	int nread = 0;
5244 
5245 	if (path->slots[wc->level] < wc->reada_slot) {
5246 		wc->reada_count = wc->reada_count * 2 / 3;
5247 		wc->reada_count = max(wc->reada_count, 2);
5248 	} else {
5249 		wc->reada_count = wc->reada_count * 3 / 2;
5250 		wc->reada_count = min_t(int, wc->reada_count,
5251 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5252 	}
5253 
5254 	eb = path->nodes[wc->level];
5255 	nritems = btrfs_header_nritems(eb);
5256 
5257 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5258 		if (nread >= wc->reada_count)
5259 			break;
5260 
5261 		cond_resched();
5262 		bytenr = btrfs_node_blockptr(eb, slot);
5263 		generation = btrfs_node_ptr_generation(eb, slot);
5264 
5265 		if (slot == path->slots[wc->level])
5266 			goto reada;
5267 
5268 		if (wc->stage == UPDATE_BACKREF &&
5269 		    generation <= root->root_key.offset)
5270 			continue;
5271 
5272 		/* We don't lock the tree block, it's OK to be racy here */
5273 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5274 					       wc->level - 1, 1, &refs,
5275 					       &flags, NULL);
5276 		/* We don't care about errors in readahead. */
5277 		if (ret < 0)
5278 			continue;
5279 		BUG_ON(refs == 0);
5280 
5281 		if (wc->stage == DROP_REFERENCE) {
5282 			if (refs == 1)
5283 				goto reada;
5284 
5285 			if (wc->level == 1 &&
5286 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5287 				continue;
5288 			if (!wc->update_ref ||
5289 			    generation <= root->root_key.offset)
5290 				continue;
5291 			btrfs_node_key_to_cpu(eb, &key, slot);
5292 			ret = btrfs_comp_cpu_keys(&key,
5293 						  &wc->update_progress);
5294 			if (ret < 0)
5295 				continue;
5296 		} else {
5297 			if (wc->level == 1 &&
5298 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5299 				continue;
5300 		}
5301 reada:
5302 		btrfs_readahead_node_child(eb, slot);
5303 		nread++;
5304 	}
5305 	wc->reada_slot = slot;
5306 }
5307 
5308 /*
5309  * helper to process tree block while walking down the tree.
5310  *
5311  * when wc->stage == UPDATE_BACKREF, this function updates
5312  * back refs for pointers in the block.
5313  *
5314  * NOTE: return value 1 means we should stop walking down.
5315  */
5316 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5317 				   struct btrfs_root *root,
5318 				   struct btrfs_path *path,
5319 				   struct walk_control *wc, int lookup_info)
5320 {
5321 	struct btrfs_fs_info *fs_info = root->fs_info;
5322 	int level = wc->level;
5323 	struct extent_buffer *eb = path->nodes[level];
5324 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5325 	int ret;
5326 
5327 	if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
5328 		return 1;
5329 
5330 	/*
5331 	 * when reference count of tree block is 1, it won't increase
5332 	 * again. once full backref flag is set, we never clear it.
5333 	 */
5334 	if (lookup_info &&
5335 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5336 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5337 		BUG_ON(!path->locks[level]);
5338 		ret = btrfs_lookup_extent_info(trans, fs_info,
5339 					       eb->start, level, 1,
5340 					       &wc->refs[level],
5341 					       &wc->flags[level],
5342 					       NULL);
5343 		BUG_ON(ret == -ENOMEM);
5344 		if (ret)
5345 			return ret;
5346 		BUG_ON(wc->refs[level] == 0);
5347 	}
5348 
5349 	if (wc->stage == DROP_REFERENCE) {
5350 		if (wc->refs[level] > 1)
5351 			return 1;
5352 
5353 		if (path->locks[level] && !wc->keep_locks) {
5354 			btrfs_tree_unlock_rw(eb, path->locks[level]);
5355 			path->locks[level] = 0;
5356 		}
5357 		return 0;
5358 	}
5359 
5360 	/* wc->stage == UPDATE_BACKREF */
5361 	if (!(wc->flags[level] & flag)) {
5362 		BUG_ON(!path->locks[level]);
5363 		ret = btrfs_inc_ref(trans, root, eb, 1);
5364 		BUG_ON(ret); /* -ENOMEM */
5365 		ret = btrfs_dec_ref(trans, root, eb, 0);
5366 		BUG_ON(ret); /* -ENOMEM */
5367 		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5368 		BUG_ON(ret); /* -ENOMEM */
5369 		wc->flags[level] |= flag;
5370 	}
5371 
5372 	/*
5373 	 * the block is shared by multiple trees, so it's not good to
5374 	 * keep the tree lock
5375 	 */
5376 	if (path->locks[level] && level > 0) {
5377 		btrfs_tree_unlock_rw(eb, path->locks[level]);
5378 		path->locks[level] = 0;
5379 	}
5380 	return 0;
5381 }
5382 
5383 /*
5384  * This is used to verify a ref exists for this root to deal with a bug where we
5385  * would have a drop_progress key that hadn't been updated properly.
5386  */
5387 static int check_ref_exists(struct btrfs_trans_handle *trans,
5388 			    struct btrfs_root *root, u64 bytenr, u64 parent,
5389 			    int level)
5390 {
5391 	struct btrfs_path *path;
5392 	struct btrfs_extent_inline_ref *iref;
5393 	int ret;
5394 
5395 	path = btrfs_alloc_path();
5396 	if (!path)
5397 		return -ENOMEM;
5398 
5399 	ret = lookup_extent_backref(trans, path, &iref, bytenr,
5400 				    root->fs_info->nodesize, parent,
5401 				    btrfs_root_id(root), level, 0);
5402 	btrfs_free_path(path);
5403 	if (ret == -ENOENT)
5404 		return 0;
5405 	if (ret < 0)
5406 		return ret;
5407 	return 1;
5408 }
5409 
5410 /*
5411  * helper to process tree block pointer.
5412  *
5413  * when wc->stage == DROP_REFERENCE, this function checks
5414  * reference count of the block pointed to. if the block
5415  * is shared and we need update back refs for the subtree
5416  * rooted at the block, this function changes wc->stage to
5417  * UPDATE_BACKREF. if the block is shared and there is no
5418  * need to update back, this function drops the reference
5419  * to the block.
5420  *
5421  * NOTE: return value 1 means we should stop walking down.
5422  */
5423 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5424 				 struct btrfs_root *root,
5425 				 struct btrfs_path *path,
5426 				 struct walk_control *wc, int *lookup_info)
5427 {
5428 	struct btrfs_fs_info *fs_info = root->fs_info;
5429 	u64 bytenr;
5430 	u64 generation;
5431 	u64 owner_root = 0;
5432 	struct btrfs_tree_parent_check check = { 0 };
5433 	struct btrfs_key key;
5434 	struct extent_buffer *next;
5435 	int level = wc->level;
5436 	int reada = 0;
5437 	int ret = 0;
5438 	bool need_account = false;
5439 
5440 	generation = btrfs_node_ptr_generation(path->nodes[level],
5441 					       path->slots[level]);
5442 	/*
5443 	 * if the lower level block was created before the snapshot
5444 	 * was created, we know there is no need to update back refs
5445 	 * for the subtree
5446 	 */
5447 	if (wc->stage == UPDATE_BACKREF &&
5448 	    generation <= root->root_key.offset) {
5449 		*lookup_info = 1;
5450 		return 1;
5451 	}
5452 
5453 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5454 
5455 	check.level = level - 1;
5456 	check.transid = generation;
5457 	check.owner_root = btrfs_root_id(root);
5458 	check.has_first_key = true;
5459 	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
5460 			      path->slots[level]);
5461 
5462 	next = find_extent_buffer(fs_info, bytenr);
5463 	if (!next) {
5464 		next = btrfs_find_create_tree_block(fs_info, bytenr,
5465 				btrfs_root_id(root), level - 1);
5466 		if (IS_ERR(next))
5467 			return PTR_ERR(next);
5468 		reada = 1;
5469 	}
5470 	btrfs_tree_lock(next);
5471 
5472 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5473 				       &wc->refs[level - 1],
5474 				       &wc->flags[level - 1],
5475 				       &owner_root);
5476 	if (ret < 0)
5477 		goto out_unlock;
5478 
5479 	if (unlikely(wc->refs[level - 1] == 0)) {
5480 		btrfs_err(fs_info, "Missing references.");
5481 		ret = -EIO;
5482 		goto out_unlock;
5483 	}
5484 	*lookup_info = 0;
5485 
5486 	if (wc->stage == DROP_REFERENCE) {
5487 		if (wc->refs[level - 1] > 1) {
5488 			need_account = true;
5489 			if (level == 1 &&
5490 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5491 				goto skip;
5492 
5493 			if (!wc->update_ref ||
5494 			    generation <= root->root_key.offset)
5495 				goto skip;
5496 
5497 			btrfs_node_key_to_cpu(path->nodes[level], &key,
5498 					      path->slots[level]);
5499 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5500 			if (ret < 0)
5501 				goto skip;
5502 
5503 			wc->stage = UPDATE_BACKREF;
5504 			wc->shared_level = level - 1;
5505 		}
5506 	} else {
5507 		if (level == 1 &&
5508 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5509 			goto skip;
5510 	}
5511 
5512 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
5513 		btrfs_tree_unlock(next);
5514 		free_extent_buffer(next);
5515 		next = NULL;
5516 		*lookup_info = 1;
5517 	}
5518 
5519 	if (!next) {
5520 		if (reada && level == 1)
5521 			reada_walk_down(trans, root, wc, path);
5522 		next = read_tree_block(fs_info, bytenr, &check);
5523 		if (IS_ERR(next)) {
5524 			return PTR_ERR(next);
5525 		} else if (!extent_buffer_uptodate(next)) {
5526 			free_extent_buffer(next);
5527 			return -EIO;
5528 		}
5529 		btrfs_tree_lock(next);
5530 	}
5531 
5532 	level--;
5533 	ASSERT(level == btrfs_header_level(next));
5534 	if (level != btrfs_header_level(next)) {
5535 		btrfs_err(root->fs_info, "mismatched level");
5536 		ret = -EIO;
5537 		goto out_unlock;
5538 	}
5539 	path->nodes[level] = next;
5540 	path->slots[level] = 0;
5541 	path->locks[level] = BTRFS_WRITE_LOCK;
5542 	wc->level = level;
5543 	if (wc->level == 1)
5544 		wc->reada_slot = 0;
5545 	return 0;
5546 skip:
5547 	wc->refs[level - 1] = 0;
5548 	wc->flags[level - 1] = 0;
5549 	if (wc->stage == DROP_REFERENCE) {
5550 		struct btrfs_ref ref = {
5551 			.action = BTRFS_DROP_DELAYED_REF,
5552 			.bytenr = bytenr,
5553 			.num_bytes = fs_info->nodesize,
5554 			.owning_root = owner_root,
5555 			.ref_root = btrfs_root_id(root),
5556 		};
5557 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5558 			ref.parent = path->nodes[level]->start;
5559 		} else {
5560 			ASSERT(btrfs_root_id(root) ==
5561 			       btrfs_header_owner(path->nodes[level]));
5562 			if (btrfs_root_id(root) !=
5563 			    btrfs_header_owner(path->nodes[level])) {
5564 				btrfs_err(root->fs_info,
5565 						"mismatched block owner");
5566 				ret = -EIO;
5567 				goto out_unlock;
5568 			}
5569 		}
5570 
5571 		/*
5572 		 * If we had a drop_progress we need to verify the refs are set
5573 		 * as expected.  If we find our ref then we know that from here
5574 		 * on out everything should be correct, and we can clear the
5575 		 * ->restarted flag.
5576 		 */
5577 		if (wc->restarted) {
5578 			ret = check_ref_exists(trans, root, bytenr, ref.parent,
5579 					       level - 1);
5580 			if (ret < 0)
5581 				goto out_unlock;
5582 			if (ret == 0)
5583 				goto no_delete;
5584 			ret = 0;
5585 			wc->restarted = 0;
5586 		}
5587 
5588 		/*
5589 		 * Reloc tree doesn't contribute to qgroup numbers, and we have
5590 		 * already accounted them at merge time (replace_path),
5591 		 * thus we could skip expensive subtree trace here.
5592 		 */
5593 		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
5594 			ret = btrfs_qgroup_trace_subtree(trans, next,
5595 							 generation, level - 1);
5596 			if (ret) {
5597 				btrfs_err_rl(fs_info,
5598 					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5599 					     ret);
5600 			}
5601 		}
5602 
5603 		/*
5604 		 * We need to update the next key in our walk control so we can
5605 		 * update the drop_progress key accordingly.  We don't care if
5606 		 * find_next_key doesn't find a key because that means we're at
5607 		 * the end and are going to clean up now.
5608 		 */
5609 		wc->drop_level = level;
5610 		find_next_key(path, level, &wc->drop_progress);
5611 
5612 		btrfs_init_tree_ref(&ref, level - 1, 0, false);
5613 		ret = btrfs_free_extent(trans, &ref);
5614 		if (ret)
5615 			goto out_unlock;
5616 	}
5617 no_delete:
5618 	*lookup_info = 1;
5619 	ret = 1;
5620 
5621 out_unlock:
5622 	btrfs_tree_unlock(next);
5623 	free_extent_buffer(next);
5624 
5625 	return ret;
5626 }
5627 
5628 /*
5629  * helper to process tree block while walking up the tree.
5630  *
5631  * when wc->stage == DROP_REFERENCE, this function drops
5632  * reference count on the block.
5633  *
5634  * when wc->stage == UPDATE_BACKREF, this function changes
5635  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5636  * to UPDATE_BACKREF previously while processing the block.
5637  *
5638  * NOTE: return value 1 means we should stop walking up.
5639  */
5640 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5641 				 struct btrfs_root *root,
5642 				 struct btrfs_path *path,
5643 				 struct walk_control *wc)
5644 {
5645 	struct btrfs_fs_info *fs_info = root->fs_info;
5646 	int ret;
5647 	int level = wc->level;
5648 	struct extent_buffer *eb = path->nodes[level];
5649 	u64 parent = 0;
5650 
5651 	if (wc->stage == UPDATE_BACKREF) {
5652 		BUG_ON(wc->shared_level < level);
5653 		if (level < wc->shared_level)
5654 			goto out;
5655 
5656 		ret = find_next_key(path, level + 1, &wc->update_progress);
5657 		if (ret > 0)
5658 			wc->update_ref = 0;
5659 
5660 		wc->stage = DROP_REFERENCE;
5661 		wc->shared_level = -1;
5662 		path->slots[level] = 0;
5663 
5664 		/*
5665 		 * check reference count again if the block isn't locked.
5666 		 * we should start walking down the tree again if reference
5667 		 * count is one.
5668 		 */
5669 		if (!path->locks[level]) {
5670 			BUG_ON(level == 0);
5671 			btrfs_tree_lock(eb);
5672 			path->locks[level] = BTRFS_WRITE_LOCK;
5673 
5674 			ret = btrfs_lookup_extent_info(trans, fs_info,
5675 						       eb->start, level, 1,
5676 						       &wc->refs[level],
5677 						       &wc->flags[level],
5678 						       NULL);
5679 			if (ret < 0) {
5680 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5681 				path->locks[level] = 0;
5682 				return ret;
5683 			}
5684 			BUG_ON(wc->refs[level] == 0);
5685 			if (wc->refs[level] == 1) {
5686 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5687 				path->locks[level] = 0;
5688 				return 1;
5689 			}
5690 		}
5691 	}
5692 
5693 	/* wc->stage == DROP_REFERENCE */
5694 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5695 
5696 	if (wc->refs[level] == 1) {
5697 		if (level == 0) {
5698 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5699 				ret = btrfs_dec_ref(trans, root, eb, 1);
5700 			else
5701 				ret = btrfs_dec_ref(trans, root, eb, 0);
5702 			BUG_ON(ret); /* -ENOMEM */
5703 			if (is_fstree(btrfs_root_id(root))) {
5704 				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5705 				if (ret) {
5706 					btrfs_err_rl(fs_info,
5707 	"error %d accounting leaf items, quota is out of sync, rescan required",
5708 					     ret);
5709 				}
5710 			}
5711 		}
5712 		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5713 		if (!path->locks[level]) {
5714 			btrfs_tree_lock(eb);
5715 			path->locks[level] = BTRFS_WRITE_LOCK;
5716 		}
5717 		btrfs_clear_buffer_dirty(trans, eb);
5718 	}
5719 
5720 	if (eb == root->node) {
5721 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5722 			parent = eb->start;
5723 		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
5724 			goto owner_mismatch;
5725 	} else {
5726 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5727 			parent = path->nodes[level + 1]->start;
5728 		else if (btrfs_root_id(root) !=
5729 			 btrfs_header_owner(path->nodes[level + 1]))
5730 			goto owner_mismatch;
5731 	}
5732 
5733 	btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5734 			      wc->refs[level] == 1);
5735 out:
5736 	wc->refs[level] = 0;
5737 	wc->flags[level] = 0;
5738 	return 0;
5739 
5740 owner_mismatch:
5741 	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5742 		     btrfs_header_owner(eb), btrfs_root_id(root));
5743 	return -EUCLEAN;
5744 }
5745 
5746 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5747 				   struct btrfs_root *root,
5748 				   struct btrfs_path *path,
5749 				   struct walk_control *wc)
5750 {
5751 	int level = wc->level;
5752 	int lookup_info = 1;
5753 	int ret = 0;
5754 
5755 	while (level >= 0) {
5756 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
5757 		if (ret)
5758 			break;
5759 
5760 		if (level == 0)
5761 			break;
5762 
5763 		if (path->slots[level] >=
5764 		    btrfs_header_nritems(path->nodes[level]))
5765 			break;
5766 
5767 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
5768 		if (ret > 0) {
5769 			path->slots[level]++;
5770 			continue;
5771 		} else if (ret < 0)
5772 			break;
5773 		level = wc->level;
5774 	}
5775 	return (ret == 1) ? 0 : ret;
5776 }
5777 
5778 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5779 				 struct btrfs_root *root,
5780 				 struct btrfs_path *path,
5781 				 struct walk_control *wc, int max_level)
5782 {
5783 	int level = wc->level;
5784 	int ret;
5785 
5786 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5787 	while (level < max_level && path->nodes[level]) {
5788 		wc->level = level;
5789 		if (path->slots[level] + 1 <
5790 		    btrfs_header_nritems(path->nodes[level])) {
5791 			path->slots[level]++;
5792 			return 0;
5793 		} else {
5794 			ret = walk_up_proc(trans, root, path, wc);
5795 			if (ret > 0)
5796 				return 0;
5797 			if (ret < 0)
5798 				return ret;
5799 
5800 			if (path->locks[level]) {
5801 				btrfs_tree_unlock_rw(path->nodes[level],
5802 						     path->locks[level]);
5803 				path->locks[level] = 0;
5804 			}
5805 			free_extent_buffer(path->nodes[level]);
5806 			path->nodes[level] = NULL;
5807 			level++;
5808 		}
5809 	}
5810 	return 1;
5811 }
5812 
5813 /*
5814  * drop a subvolume tree.
5815  *
5816  * this function traverses the tree freeing any blocks that only
5817  * referenced by the tree.
5818  *
5819  * when a shared tree block is found. this function decreases its
5820  * reference count by one. if update_ref is true, this function
5821  * also make sure backrefs for the shared block and all lower level
5822  * blocks are properly updated.
5823  *
5824  * If called with for_reloc == 0, may exit early with -EAGAIN
5825  */
5826 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5827 {
5828 	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
5829 	struct btrfs_fs_info *fs_info = root->fs_info;
5830 	struct btrfs_path *path;
5831 	struct btrfs_trans_handle *trans;
5832 	struct btrfs_root *tree_root = fs_info->tree_root;
5833 	struct btrfs_root_item *root_item = &root->root_item;
5834 	struct walk_control *wc;
5835 	struct btrfs_key key;
5836 	int err = 0;
5837 	int ret;
5838 	int level;
5839 	bool root_dropped = false;
5840 	bool unfinished_drop = false;
5841 
5842 	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
5843 
5844 	path = btrfs_alloc_path();
5845 	if (!path) {
5846 		err = -ENOMEM;
5847 		goto out;
5848 	}
5849 
5850 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5851 	if (!wc) {
5852 		btrfs_free_path(path);
5853 		err = -ENOMEM;
5854 		goto out;
5855 	}
5856 
5857 	/*
5858 	 * Use join to avoid potential EINTR from transaction start. See
5859 	 * wait_reserve_ticket and the whole reservation callchain.
5860 	 */
5861 	if (for_reloc)
5862 		trans = btrfs_join_transaction(tree_root);
5863 	else
5864 		trans = btrfs_start_transaction(tree_root, 0);
5865 	if (IS_ERR(trans)) {
5866 		err = PTR_ERR(trans);
5867 		goto out_free;
5868 	}
5869 
5870 	err = btrfs_run_delayed_items(trans);
5871 	if (err)
5872 		goto out_end_trans;
5873 
5874 	/*
5875 	 * This will help us catch people modifying the fs tree while we're
5876 	 * dropping it.  It is unsafe to mess with the fs tree while it's being
5877 	 * dropped as we unlock the root node and parent nodes as we walk down
5878 	 * the tree, assuming nothing will change.  If something does change
5879 	 * then we'll have stale information and drop references to blocks we've
5880 	 * already dropped.
5881 	 */
5882 	set_bit(BTRFS_ROOT_DELETING, &root->state);
5883 	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5884 
5885 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5886 		level = btrfs_header_level(root->node);
5887 		path->nodes[level] = btrfs_lock_root_node(root);
5888 		path->slots[level] = 0;
5889 		path->locks[level] = BTRFS_WRITE_LOCK;
5890 		memset(&wc->update_progress, 0,
5891 		       sizeof(wc->update_progress));
5892 	} else {
5893 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5894 		memcpy(&wc->update_progress, &key,
5895 		       sizeof(wc->update_progress));
5896 
5897 		level = btrfs_root_drop_level(root_item);
5898 		BUG_ON(level == 0);
5899 		path->lowest_level = level;
5900 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5901 		path->lowest_level = 0;
5902 		if (ret < 0) {
5903 			err = ret;
5904 			goto out_end_trans;
5905 		}
5906 		WARN_ON(ret > 0);
5907 
5908 		/*
5909 		 * unlock our path, this is safe because only this
5910 		 * function is allowed to delete this snapshot
5911 		 */
5912 		btrfs_unlock_up_safe(path, 0);
5913 
5914 		level = btrfs_header_level(root->node);
5915 		while (1) {
5916 			btrfs_tree_lock(path->nodes[level]);
5917 			path->locks[level] = BTRFS_WRITE_LOCK;
5918 
5919 			ret = btrfs_lookup_extent_info(trans, fs_info,
5920 						path->nodes[level]->start,
5921 						level, 1, &wc->refs[level],
5922 						&wc->flags[level], NULL);
5923 			if (ret < 0) {
5924 				err = ret;
5925 				goto out_end_trans;
5926 			}
5927 			BUG_ON(wc->refs[level] == 0);
5928 
5929 			if (level == btrfs_root_drop_level(root_item))
5930 				break;
5931 
5932 			btrfs_tree_unlock(path->nodes[level]);
5933 			path->locks[level] = 0;
5934 			WARN_ON(wc->refs[level] != 1);
5935 			level--;
5936 		}
5937 	}
5938 
5939 	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5940 	wc->level = level;
5941 	wc->shared_level = -1;
5942 	wc->stage = DROP_REFERENCE;
5943 	wc->update_ref = update_ref;
5944 	wc->keep_locks = 0;
5945 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5946 
5947 	while (1) {
5948 
5949 		ret = walk_down_tree(trans, root, path, wc);
5950 		if (ret < 0) {
5951 			btrfs_abort_transaction(trans, ret);
5952 			err = ret;
5953 			break;
5954 		}
5955 
5956 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5957 		if (ret < 0) {
5958 			btrfs_abort_transaction(trans, ret);
5959 			err = ret;
5960 			break;
5961 		}
5962 
5963 		if (ret > 0) {
5964 			BUG_ON(wc->stage != DROP_REFERENCE);
5965 			break;
5966 		}
5967 
5968 		if (wc->stage == DROP_REFERENCE) {
5969 			wc->drop_level = wc->level;
5970 			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5971 					      &wc->drop_progress,
5972 					      path->slots[wc->drop_level]);
5973 		}
5974 		btrfs_cpu_key_to_disk(&root_item->drop_progress,
5975 				      &wc->drop_progress);
5976 		btrfs_set_root_drop_level(root_item, wc->drop_level);
5977 
5978 		BUG_ON(wc->level == 0);
5979 		if (btrfs_should_end_transaction(trans) ||
5980 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5981 			ret = btrfs_update_root(trans, tree_root,
5982 						&root->root_key,
5983 						root_item);
5984 			if (ret) {
5985 				btrfs_abort_transaction(trans, ret);
5986 				err = ret;
5987 				goto out_end_trans;
5988 			}
5989 
5990 			if (!is_reloc_root)
5991 				btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5992 
5993 			btrfs_end_transaction_throttle(trans);
5994 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5995 				btrfs_debug(fs_info,
5996 					    "drop snapshot early exit");
5997 				err = -EAGAIN;
5998 				goto out_free;
5999 			}
6000 
6001 		       /*
6002 			* Use join to avoid potential EINTR from transaction
6003 			* start. See wait_reserve_ticket and the whole
6004 			* reservation callchain.
6005 			*/
6006 			if (for_reloc)
6007 				trans = btrfs_join_transaction(tree_root);
6008 			else
6009 				trans = btrfs_start_transaction(tree_root, 0);
6010 			if (IS_ERR(trans)) {
6011 				err = PTR_ERR(trans);
6012 				goto out_free;
6013 			}
6014 		}
6015 	}
6016 	btrfs_release_path(path);
6017 	if (err)
6018 		goto out_end_trans;
6019 
6020 	ret = btrfs_del_root(trans, &root->root_key);
6021 	if (ret) {
6022 		btrfs_abort_transaction(trans, ret);
6023 		err = ret;
6024 		goto out_end_trans;
6025 	}
6026 
6027 	if (!is_reloc_root) {
6028 		ret = btrfs_find_root(tree_root, &root->root_key, path,
6029 				      NULL, NULL);
6030 		if (ret < 0) {
6031 			btrfs_abort_transaction(trans, ret);
6032 			err = ret;
6033 			goto out_end_trans;
6034 		} else if (ret > 0) {
6035 			/* if we fail to delete the orphan item this time
6036 			 * around, it'll get picked up the next time.
6037 			 *
6038 			 * The most common failure here is just -ENOENT.
6039 			 */
6040 			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
6041 		}
6042 	}
6043 
6044 	/*
6045 	 * This subvolume is going to be completely dropped, and won't be
6046 	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
6047 	 * commit transaction time.  So free it here manually.
6048 	 */
6049 	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6050 	btrfs_qgroup_free_meta_all_pertrans(root);
6051 
6052 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
6053 		btrfs_add_dropped_root(trans, root);
6054 	else
6055 		btrfs_put_root(root);
6056 	root_dropped = true;
6057 out_end_trans:
6058 	if (!is_reloc_root)
6059 		btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6060 
6061 	btrfs_end_transaction_throttle(trans);
6062 out_free:
6063 	kfree(wc);
6064 	btrfs_free_path(path);
6065 out:
6066 	/*
6067 	 * We were an unfinished drop root, check to see if there are any
6068 	 * pending, and if not clear and wake up any waiters.
6069 	 */
6070 	if (!err && unfinished_drop)
6071 		btrfs_maybe_wake_unfinished_drop(fs_info);
6072 
6073 	/*
6074 	 * So if we need to stop dropping the snapshot for whatever reason we
6075 	 * need to make sure to add it back to the dead root list so that we
6076 	 * keep trying to do the work later.  This also cleans up roots if we
6077 	 * don't have it in the radix (like when we recover after a power fail
6078 	 * or unmount) so we don't leak memory.
6079 	 */
6080 	if (!for_reloc && !root_dropped)
6081 		btrfs_add_dead_root(root);
6082 	return err;
6083 }
6084 
6085 /*
6086  * drop subtree rooted at tree block 'node'.
6087  *
6088  * NOTE: this function will unlock and release tree block 'node'
6089  * only used by relocation code
6090  */
6091 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6092 			struct btrfs_root *root,
6093 			struct extent_buffer *node,
6094 			struct extent_buffer *parent)
6095 {
6096 	struct btrfs_fs_info *fs_info = root->fs_info;
6097 	struct btrfs_path *path;
6098 	struct walk_control *wc;
6099 	int level;
6100 	int parent_level;
6101 	int ret = 0;
6102 
6103 	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
6104 
6105 	path = btrfs_alloc_path();
6106 	if (!path)
6107 		return -ENOMEM;
6108 
6109 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6110 	if (!wc) {
6111 		btrfs_free_path(path);
6112 		return -ENOMEM;
6113 	}
6114 
6115 	btrfs_assert_tree_write_locked(parent);
6116 	parent_level = btrfs_header_level(parent);
6117 	atomic_inc(&parent->refs);
6118 	path->nodes[parent_level] = parent;
6119 	path->slots[parent_level] = btrfs_header_nritems(parent);
6120 
6121 	btrfs_assert_tree_write_locked(node);
6122 	level = btrfs_header_level(node);
6123 	path->nodes[level] = node;
6124 	path->slots[level] = 0;
6125 	path->locks[level] = BTRFS_WRITE_LOCK;
6126 
6127 	wc->refs[parent_level] = 1;
6128 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6129 	wc->level = level;
6130 	wc->shared_level = -1;
6131 	wc->stage = DROP_REFERENCE;
6132 	wc->update_ref = 0;
6133 	wc->keep_locks = 1;
6134 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6135 
6136 	while (1) {
6137 		ret = walk_down_tree(trans, root, path, wc);
6138 		if (ret < 0)
6139 			break;
6140 
6141 		ret = walk_up_tree(trans, root, path, wc, parent_level);
6142 		if (ret) {
6143 			if (ret > 0)
6144 				ret = 0;
6145 			break;
6146 		}
6147 	}
6148 
6149 	kfree(wc);
6150 	btrfs_free_path(path);
6151 	return ret;
6152 }
6153 
6154 /*
6155  * Unpin the extent range in an error context and don't add the space back.
6156  * Errors are not propagated further.
6157  */
6158 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
6159 {
6160 	unpin_extent_range(fs_info, start, end, false);
6161 }
6162 
6163 /*
6164  * It used to be that old block groups would be left around forever.
6165  * Iterating over them would be enough to trim unused space.  Since we
6166  * now automatically remove them, we also need to iterate over unallocated
6167  * space.
6168  *
6169  * We don't want a transaction for this since the discard may take a
6170  * substantial amount of time.  We don't require that a transaction be
6171  * running, but we do need to take a running transaction into account
6172  * to ensure that we're not discarding chunks that were released or
6173  * allocated in the current transaction.
6174  *
6175  * Holding the chunks lock will prevent other threads from allocating
6176  * or releasing chunks, but it won't prevent a running transaction
6177  * from committing and releasing the memory that the pending chunks
6178  * list head uses.  For that, we need to take a reference to the
6179  * transaction and hold the commit root sem.  We only need to hold
6180  * it while performing the free space search since we have already
6181  * held back allocations.
6182  */
6183 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6184 {
6185 	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6186 	int ret;
6187 
6188 	*trimmed = 0;
6189 
6190 	/* Discard not supported = nothing to do. */
6191 	if (!bdev_max_discard_sectors(device->bdev))
6192 		return 0;
6193 
6194 	/* Not writable = nothing to do. */
6195 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6196 		return 0;
6197 
6198 	/* No free space = nothing to do. */
6199 	if (device->total_bytes <= device->bytes_used)
6200 		return 0;
6201 
6202 	ret = 0;
6203 
6204 	while (1) {
6205 		struct btrfs_fs_info *fs_info = device->fs_info;
6206 		u64 bytes;
6207 
6208 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6209 		if (ret)
6210 			break;
6211 
6212 		find_first_clear_extent_bit(&device->alloc_state, start,
6213 					    &start, &end,
6214 					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
6215 
6216 		/* Check if there are any CHUNK_* bits left */
6217 		if (start > device->total_bytes) {
6218 			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6219 			btrfs_warn_in_rcu(fs_info,
6220 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6221 					  start, end - start + 1,
6222 					  btrfs_dev_name(device),
6223 					  device->total_bytes);
6224 			mutex_unlock(&fs_info->chunk_mutex);
6225 			ret = 0;
6226 			break;
6227 		}
6228 
6229 		/* Ensure we skip the reserved space on each device. */
6230 		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6231 
6232 		/*
6233 		 * If find_first_clear_extent_bit find a range that spans the
6234 		 * end of the device it will set end to -1, in this case it's up
6235 		 * to the caller to trim the value to the size of the device.
6236 		 */
6237 		end = min(end, device->total_bytes - 1);
6238 
6239 		len = end - start + 1;
6240 
6241 		/* We didn't find any extents */
6242 		if (!len) {
6243 			mutex_unlock(&fs_info->chunk_mutex);
6244 			ret = 0;
6245 			break;
6246 		}
6247 
6248 		ret = btrfs_issue_discard(device->bdev, start, len,
6249 					  &bytes);
6250 		if (!ret)
6251 			set_extent_bit(&device->alloc_state, start,
6252 				       start + bytes - 1, CHUNK_TRIMMED, NULL);
6253 		mutex_unlock(&fs_info->chunk_mutex);
6254 
6255 		if (ret)
6256 			break;
6257 
6258 		start += len;
6259 		*trimmed += bytes;
6260 
6261 		if (fatal_signal_pending(current)) {
6262 			ret = -ERESTARTSYS;
6263 			break;
6264 		}
6265 
6266 		cond_resched();
6267 	}
6268 
6269 	return ret;
6270 }
6271 
6272 /*
6273  * Trim the whole filesystem by:
6274  * 1) trimming the free space in each block group
6275  * 2) trimming the unallocated space on each device
6276  *
6277  * This will also continue trimming even if a block group or device encounters
6278  * an error.  The return value will be the last error, or 0 if nothing bad
6279  * happens.
6280  */
6281 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6282 {
6283 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6284 	struct btrfs_block_group *cache = NULL;
6285 	struct btrfs_device *device;
6286 	u64 group_trimmed;
6287 	u64 range_end = U64_MAX;
6288 	u64 start;
6289 	u64 end;
6290 	u64 trimmed = 0;
6291 	u64 bg_failed = 0;
6292 	u64 dev_failed = 0;
6293 	int bg_ret = 0;
6294 	int dev_ret = 0;
6295 	int ret = 0;
6296 
6297 	if (range->start == U64_MAX)
6298 		return -EINVAL;
6299 
6300 	/*
6301 	 * Check range overflow if range->len is set.
6302 	 * The default range->len is U64_MAX.
6303 	 */
6304 	if (range->len != U64_MAX &&
6305 	    check_add_overflow(range->start, range->len, &range_end))
6306 		return -EINVAL;
6307 
6308 	cache = btrfs_lookup_first_block_group(fs_info, range->start);
6309 	for (; cache; cache = btrfs_next_block_group(cache)) {
6310 		if (cache->start >= range_end) {
6311 			btrfs_put_block_group(cache);
6312 			break;
6313 		}
6314 
6315 		start = max(range->start, cache->start);
6316 		end = min(range_end, cache->start + cache->length);
6317 
6318 		if (end - start >= range->minlen) {
6319 			if (!btrfs_block_group_done(cache)) {
6320 				ret = btrfs_cache_block_group(cache, true);
6321 				if (ret) {
6322 					bg_failed++;
6323 					bg_ret = ret;
6324 					continue;
6325 				}
6326 			}
6327 			ret = btrfs_trim_block_group(cache,
6328 						     &group_trimmed,
6329 						     start,
6330 						     end,
6331 						     range->minlen);
6332 
6333 			trimmed += group_trimmed;
6334 			if (ret) {
6335 				bg_failed++;
6336 				bg_ret = ret;
6337 				continue;
6338 			}
6339 		}
6340 	}
6341 
6342 	if (bg_failed)
6343 		btrfs_warn(fs_info,
6344 			"failed to trim %llu block group(s), last error %d",
6345 			bg_failed, bg_ret);
6346 
6347 	mutex_lock(&fs_devices->device_list_mutex);
6348 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6349 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6350 			continue;
6351 
6352 		ret = btrfs_trim_free_extents(device, &group_trimmed);
6353 		if (ret) {
6354 			dev_failed++;
6355 			dev_ret = ret;
6356 			break;
6357 		}
6358 
6359 		trimmed += group_trimmed;
6360 	}
6361 	mutex_unlock(&fs_devices->device_list_mutex);
6362 
6363 	if (dev_failed)
6364 		btrfs_warn(fs_info,
6365 			"failed to trim %llu device(s), last error %d",
6366 			dev_failed, dev_ret);
6367 	range->len = trimmed;
6368 	if (bg_ret)
6369 		return bg_ret;
6370 	return dev_ret;
6371 }
6372