xref: /linux/fs/btrfs/extent-tree.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
33 #include "discard.h"
34 #include "zoned.h"
35 #include "dev-replace.h"
36 #include "fs.h"
37 #include "accessors.h"
38 #include "root-tree.h"
39 #include "file-item.h"
40 #include "orphan.h"
41 #include "tree-checker.h"
42 #include "raid-stripe-tree.h"
43 
44 #undef SCRAMBLE_DELAYED_REFS
45 
46 
47 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
48 			       struct btrfs_delayed_ref_head *href,
49 			       struct btrfs_delayed_ref_node *node,
50 			       struct btrfs_delayed_extent_op *extra_op);
51 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
52 				    struct extent_buffer *leaf,
53 				    struct btrfs_extent_item *ei);
54 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
55 				      u64 parent, u64 root_objectid,
56 				      u64 flags, u64 owner, u64 offset,
57 				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
58 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
59 				     struct btrfs_delayed_ref_node *node,
60 				     struct btrfs_delayed_extent_op *extent_op);
61 static int find_next_key(struct btrfs_path *path, int level,
62 			 struct btrfs_key *key);
63 
64 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
65 {
66 	return (cache->flags & bits) == bits;
67 }
68 
69 /* simple helper to search for an existing data extent at a given offset */
70 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
71 {
72 	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
73 	int ret;
74 	struct btrfs_key key;
75 	struct btrfs_path *path;
76 
77 	path = btrfs_alloc_path();
78 	if (!path)
79 		return -ENOMEM;
80 
81 	key.objectid = start;
82 	key.offset = len;
83 	key.type = BTRFS_EXTENT_ITEM_KEY;
84 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
85 	btrfs_free_path(path);
86 	return ret;
87 }
88 
89 /*
90  * helper function to lookup reference count and flags of a tree block.
91  *
92  * the head node for delayed ref is used to store the sum of all the
93  * reference count modifications queued up in the rbtree. the head
94  * node may also store the extent flags to set. This way you can check
95  * to see what the reference count and extent flags would be if all of
96  * the delayed refs are not processed.
97  */
98 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
99 			     struct btrfs_fs_info *fs_info, u64 bytenr,
100 			     u64 offset, int metadata, u64 *refs, u64 *flags,
101 			     u64 *owning_root)
102 {
103 	struct btrfs_root *extent_root;
104 	struct btrfs_delayed_ref_head *head;
105 	struct btrfs_delayed_ref_root *delayed_refs;
106 	struct btrfs_path *path;
107 	struct btrfs_key key;
108 	u64 num_refs;
109 	u64 extent_flags;
110 	u64 owner = 0;
111 	int ret;
112 
113 	/*
114 	 * If we don't have skinny metadata, don't bother doing anything
115 	 * different
116 	 */
117 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
118 		offset = fs_info->nodesize;
119 		metadata = 0;
120 	}
121 
122 	path = btrfs_alloc_path();
123 	if (!path)
124 		return -ENOMEM;
125 
126 search_again:
127 	key.objectid = bytenr;
128 	key.offset = offset;
129 	if (metadata)
130 		key.type = BTRFS_METADATA_ITEM_KEY;
131 	else
132 		key.type = BTRFS_EXTENT_ITEM_KEY;
133 
134 	extent_root = btrfs_extent_root(fs_info, bytenr);
135 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
136 	if (ret < 0)
137 		goto out_free;
138 
139 	if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
140 		if (path->slots[0]) {
141 			path->slots[0]--;
142 			btrfs_item_key_to_cpu(path->nodes[0], &key,
143 					      path->slots[0]);
144 			if (key.objectid == bytenr &&
145 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
146 			    key.offset == fs_info->nodesize)
147 				ret = 0;
148 		}
149 	}
150 
151 	if (ret == 0) {
152 		struct extent_buffer *leaf = path->nodes[0];
153 		struct btrfs_extent_item *ei;
154 		const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
155 
156 		if (unlikely(item_size < sizeof(*ei))) {
157 			ret = -EUCLEAN;
158 			btrfs_err(fs_info,
159 			"unexpected extent item size, has %u expect >= %zu",
160 				  item_size, sizeof(*ei));
161 			btrfs_abort_transaction(trans, ret);
162 			goto out_free;
163 		}
164 
165 		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
166 		num_refs = btrfs_extent_refs(leaf, ei);
167 		if (unlikely(num_refs == 0)) {
168 			ret = -EUCLEAN;
169 			btrfs_err(fs_info,
170 		"unexpected zero reference count for extent item (%llu %u %llu)",
171 				  key.objectid, key.type, key.offset);
172 			btrfs_abort_transaction(trans, ret);
173 			goto out_free;
174 		}
175 		extent_flags = btrfs_extent_flags(leaf, ei);
176 		owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
177 	} else {
178 		num_refs = 0;
179 		extent_flags = 0;
180 		ret = 0;
181 	}
182 
183 	delayed_refs = &trans->transaction->delayed_refs;
184 	spin_lock(&delayed_refs->lock);
185 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
186 	if (head) {
187 		if (!mutex_trylock(&head->mutex)) {
188 			refcount_inc(&head->refs);
189 			spin_unlock(&delayed_refs->lock);
190 
191 			btrfs_release_path(path);
192 
193 			/*
194 			 * Mutex was contended, block until it's released and try
195 			 * again
196 			 */
197 			mutex_lock(&head->mutex);
198 			mutex_unlock(&head->mutex);
199 			btrfs_put_delayed_ref_head(head);
200 			goto search_again;
201 		}
202 		spin_lock(&head->lock);
203 		if (head->extent_op && head->extent_op->update_flags)
204 			extent_flags |= head->extent_op->flags_to_set;
205 
206 		num_refs += head->ref_mod;
207 		spin_unlock(&head->lock);
208 		mutex_unlock(&head->mutex);
209 	}
210 	spin_unlock(&delayed_refs->lock);
211 
212 	WARN_ON(num_refs == 0);
213 	if (refs)
214 		*refs = num_refs;
215 	if (flags)
216 		*flags = extent_flags;
217 	if (owning_root)
218 		*owning_root = owner;
219 out_free:
220 	btrfs_free_path(path);
221 	return ret;
222 }
223 
224 /*
225  * Back reference rules.  Back refs have three main goals:
226  *
227  * 1) differentiate between all holders of references to an extent so that
228  *    when a reference is dropped we can make sure it was a valid reference
229  *    before freeing the extent.
230  *
231  * 2) Provide enough information to quickly find the holders of an extent
232  *    if we notice a given block is corrupted or bad.
233  *
234  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
235  *    maintenance.  This is actually the same as #2, but with a slightly
236  *    different use case.
237  *
238  * There are two kinds of back refs. The implicit back refs is optimized
239  * for pointers in non-shared tree blocks. For a given pointer in a block,
240  * back refs of this kind provide information about the block's owner tree
241  * and the pointer's key. These information allow us to find the block by
242  * b-tree searching. The full back refs is for pointers in tree blocks not
243  * referenced by their owner trees. The location of tree block is recorded
244  * in the back refs. Actually the full back refs is generic, and can be
245  * used in all cases the implicit back refs is used. The major shortcoming
246  * of the full back refs is its overhead. Every time a tree block gets
247  * COWed, we have to update back refs entry for all pointers in it.
248  *
249  * For a newly allocated tree block, we use implicit back refs for
250  * pointers in it. This means most tree related operations only involve
251  * implicit back refs. For a tree block created in old transaction, the
252  * only way to drop a reference to it is COW it. So we can detect the
253  * event that tree block loses its owner tree's reference and do the
254  * back refs conversion.
255  *
256  * When a tree block is COWed through a tree, there are four cases:
257  *
258  * The reference count of the block is one and the tree is the block's
259  * owner tree. Nothing to do in this case.
260  *
261  * The reference count of the block is one and the tree is not the
262  * block's owner tree. In this case, full back refs is used for pointers
263  * in the block. Remove these full back refs, add implicit back refs for
264  * every pointers in the new block.
265  *
266  * The reference count of the block is greater than one and the tree is
267  * the block's owner tree. In this case, implicit back refs is used for
268  * pointers in the block. Add full back refs for every pointers in the
269  * block, increase lower level extents' reference counts. The original
270  * implicit back refs are entailed to the new block.
271  *
272  * The reference count of the block is greater than one and the tree is
273  * not the block's owner tree. Add implicit back refs for every pointer in
274  * the new block, increase lower level extents' reference count.
275  *
276  * Back Reference Key composing:
277  *
278  * The key objectid corresponds to the first byte in the extent,
279  * The key type is used to differentiate between types of back refs.
280  * There are different meanings of the key offset for different types
281  * of back refs.
282  *
283  * File extents can be referenced by:
284  *
285  * - multiple snapshots, subvolumes, or different generations in one subvol
286  * - different files inside a single subvolume
287  * - different offsets inside a file (bookend extents in file.c)
288  *
289  * The extent ref structure for the implicit back refs has fields for:
290  *
291  * - Objectid of the subvolume root
292  * - objectid of the file holding the reference
293  * - original offset in the file
294  * - how many bookend extents
295  *
296  * The key offset for the implicit back refs is hash of the first
297  * three fields.
298  *
299  * The extent ref structure for the full back refs has field for:
300  *
301  * - number of pointers in the tree leaf
302  *
303  * The key offset for the implicit back refs is the first byte of
304  * the tree leaf
305  *
306  * When a file extent is allocated, The implicit back refs is used.
307  * the fields are filled in:
308  *
309  *     (root_key.objectid, inode objectid, offset in file, 1)
310  *
311  * When a file extent is removed file truncation, we find the
312  * corresponding implicit back refs and check the following fields:
313  *
314  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
315  *
316  * Btree extents can be referenced by:
317  *
318  * - Different subvolumes
319  *
320  * Both the implicit back refs and the full back refs for tree blocks
321  * only consist of key. The key offset for the implicit back refs is
322  * objectid of block's owner tree. The key offset for the full back refs
323  * is the first byte of parent block.
324  *
325  * When implicit back refs is used, information about the lowest key and
326  * level of the tree block are required. These information are stored in
327  * tree block info structure.
328  */
329 
330 /*
331  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
332  * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
333  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
334  */
335 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
336 				     struct btrfs_extent_inline_ref *iref,
337 				     enum btrfs_inline_ref_type is_data)
338 {
339 	struct btrfs_fs_info *fs_info = eb->fs_info;
340 	int type = btrfs_extent_inline_ref_type(eb, iref);
341 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
342 
343 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
344 		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
345 		return type;
346 	}
347 
348 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
349 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
350 	    type == BTRFS_SHARED_DATA_REF_KEY ||
351 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
352 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
353 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
354 				return type;
355 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
356 				ASSERT(fs_info);
357 				/*
358 				 * Every shared one has parent tree block,
359 				 * which must be aligned to sector size.
360 				 */
361 				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
362 					return type;
363 			}
364 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
365 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
366 				return type;
367 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
368 				ASSERT(fs_info);
369 				/*
370 				 * Every shared one has parent tree block,
371 				 * which must be aligned to sector size.
372 				 */
373 				if (offset &&
374 				    IS_ALIGNED(offset, fs_info->sectorsize))
375 					return type;
376 			}
377 		} else {
378 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
379 			return type;
380 		}
381 	}
382 
383 	WARN_ON(1);
384 	btrfs_print_leaf(eb);
385 	btrfs_err(fs_info,
386 		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
387 		  eb->start, (unsigned long)iref, type);
388 
389 	return BTRFS_REF_TYPE_INVALID;
390 }
391 
392 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
393 {
394 	u32 high_crc = ~(u32)0;
395 	u32 low_crc = ~(u32)0;
396 	__le64 lenum;
397 
398 	lenum = cpu_to_le64(root_objectid);
399 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
400 	lenum = cpu_to_le64(owner);
401 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
402 	lenum = cpu_to_le64(offset);
403 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
404 
405 	return ((u64)high_crc << 31) ^ (u64)low_crc;
406 }
407 
408 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
409 				     struct btrfs_extent_data_ref *ref)
410 {
411 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
412 				    btrfs_extent_data_ref_objectid(leaf, ref),
413 				    btrfs_extent_data_ref_offset(leaf, ref));
414 }
415 
416 static int match_extent_data_ref(struct extent_buffer *leaf,
417 				 struct btrfs_extent_data_ref *ref,
418 				 u64 root_objectid, u64 owner, u64 offset)
419 {
420 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
421 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
422 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
423 		return 0;
424 	return 1;
425 }
426 
427 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
428 					   struct btrfs_path *path,
429 					   u64 bytenr, u64 parent,
430 					   u64 root_objectid,
431 					   u64 owner, u64 offset)
432 {
433 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
434 	struct btrfs_key key;
435 	struct btrfs_extent_data_ref *ref;
436 	struct extent_buffer *leaf;
437 	u32 nritems;
438 	int recow;
439 	int ret;
440 
441 	key.objectid = bytenr;
442 	if (parent) {
443 		key.type = BTRFS_SHARED_DATA_REF_KEY;
444 		key.offset = parent;
445 	} else {
446 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
447 		key.offset = hash_extent_data_ref(root_objectid,
448 						  owner, offset);
449 	}
450 again:
451 	recow = 0;
452 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
453 	if (ret < 0)
454 		return ret;
455 
456 	if (parent) {
457 		if (ret)
458 			return -ENOENT;
459 		return 0;
460 	}
461 
462 	ret = -ENOENT;
463 	leaf = path->nodes[0];
464 	nritems = btrfs_header_nritems(leaf);
465 	while (1) {
466 		if (path->slots[0] >= nritems) {
467 			ret = btrfs_next_leaf(root, path);
468 			if (ret) {
469 				if (ret > 0)
470 					return -ENOENT;
471 				return ret;
472 			}
473 
474 			leaf = path->nodes[0];
475 			nritems = btrfs_header_nritems(leaf);
476 			recow = 1;
477 		}
478 
479 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
480 		if (key.objectid != bytenr ||
481 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
482 			goto fail;
483 
484 		ref = btrfs_item_ptr(leaf, path->slots[0],
485 				     struct btrfs_extent_data_ref);
486 
487 		if (match_extent_data_ref(leaf, ref, root_objectid,
488 					  owner, offset)) {
489 			if (recow) {
490 				btrfs_release_path(path);
491 				goto again;
492 			}
493 			ret = 0;
494 			break;
495 		}
496 		path->slots[0]++;
497 	}
498 fail:
499 	return ret;
500 }
501 
502 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
503 					   struct btrfs_path *path,
504 					   struct btrfs_delayed_ref_node *node,
505 					   u64 bytenr)
506 {
507 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
508 	struct btrfs_key key;
509 	struct extent_buffer *leaf;
510 	u64 owner = btrfs_delayed_ref_owner(node);
511 	u64 offset = btrfs_delayed_ref_offset(node);
512 	u32 size;
513 	u32 num_refs;
514 	int ret;
515 
516 	key.objectid = bytenr;
517 	if (node->parent) {
518 		key.type = BTRFS_SHARED_DATA_REF_KEY;
519 		key.offset = node->parent;
520 		size = sizeof(struct btrfs_shared_data_ref);
521 	} else {
522 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
523 		key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
524 		size = sizeof(struct btrfs_extent_data_ref);
525 	}
526 
527 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
528 	if (ret && ret != -EEXIST)
529 		goto fail;
530 
531 	leaf = path->nodes[0];
532 	if (node->parent) {
533 		struct btrfs_shared_data_ref *ref;
534 		ref = btrfs_item_ptr(leaf, path->slots[0],
535 				     struct btrfs_shared_data_ref);
536 		if (ret == 0) {
537 			btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod);
538 		} else {
539 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
540 			num_refs += node->ref_mod;
541 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
542 		}
543 	} else {
544 		struct btrfs_extent_data_ref *ref;
545 		while (ret == -EEXIST) {
546 			ref = btrfs_item_ptr(leaf, path->slots[0],
547 					     struct btrfs_extent_data_ref);
548 			if (match_extent_data_ref(leaf, ref, node->ref_root,
549 						  owner, offset))
550 				break;
551 			btrfs_release_path(path);
552 			key.offset++;
553 			ret = btrfs_insert_empty_item(trans, root, path, &key,
554 						      size);
555 			if (ret && ret != -EEXIST)
556 				goto fail;
557 
558 			leaf = path->nodes[0];
559 		}
560 		ref = btrfs_item_ptr(leaf, path->slots[0],
561 				     struct btrfs_extent_data_ref);
562 		if (ret == 0) {
563 			btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root);
564 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
565 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
566 			btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod);
567 		} else {
568 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
569 			num_refs += node->ref_mod;
570 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
571 		}
572 	}
573 	btrfs_mark_buffer_dirty(trans, leaf);
574 	ret = 0;
575 fail:
576 	btrfs_release_path(path);
577 	return ret;
578 }
579 
580 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
581 					   struct btrfs_root *root,
582 					   struct btrfs_path *path,
583 					   int refs_to_drop)
584 {
585 	struct btrfs_key key;
586 	struct btrfs_extent_data_ref *ref1 = NULL;
587 	struct btrfs_shared_data_ref *ref2 = NULL;
588 	struct extent_buffer *leaf;
589 	u32 num_refs = 0;
590 	int ret = 0;
591 
592 	leaf = path->nodes[0];
593 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
594 
595 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
596 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
597 				      struct btrfs_extent_data_ref);
598 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
599 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
600 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
601 				      struct btrfs_shared_data_ref);
602 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
603 	} else {
604 		btrfs_err(trans->fs_info,
605 			  "unrecognized backref key (%llu %u %llu)",
606 			  key.objectid, key.type, key.offset);
607 		btrfs_abort_transaction(trans, -EUCLEAN);
608 		return -EUCLEAN;
609 	}
610 
611 	BUG_ON(num_refs < refs_to_drop);
612 	num_refs -= refs_to_drop;
613 
614 	if (num_refs == 0) {
615 		ret = btrfs_del_item(trans, root, path);
616 	} else {
617 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
618 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
619 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
620 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
621 		btrfs_mark_buffer_dirty(trans, leaf);
622 	}
623 	return ret;
624 }
625 
626 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
627 					  struct btrfs_extent_inline_ref *iref)
628 {
629 	struct btrfs_key key;
630 	struct extent_buffer *leaf;
631 	struct btrfs_extent_data_ref *ref1;
632 	struct btrfs_shared_data_ref *ref2;
633 	u32 num_refs = 0;
634 	int type;
635 
636 	leaf = path->nodes[0];
637 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
638 
639 	if (iref) {
640 		/*
641 		 * If type is invalid, we should have bailed out earlier than
642 		 * this call.
643 		 */
644 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
645 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
646 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
647 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
648 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649 		} else {
650 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
651 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
652 		}
653 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
654 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
655 				      struct btrfs_extent_data_ref);
656 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
657 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
658 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
659 				      struct btrfs_shared_data_ref);
660 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
661 	} else {
662 		WARN_ON(1);
663 	}
664 	return num_refs;
665 }
666 
667 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
668 					  struct btrfs_path *path,
669 					  u64 bytenr, u64 parent,
670 					  u64 root_objectid)
671 {
672 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
673 	struct btrfs_key key;
674 	int ret;
675 
676 	key.objectid = bytenr;
677 	if (parent) {
678 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
679 		key.offset = parent;
680 	} else {
681 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
682 		key.offset = root_objectid;
683 	}
684 
685 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
686 	if (ret > 0)
687 		ret = -ENOENT;
688 	return ret;
689 }
690 
691 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
692 					  struct btrfs_path *path,
693 					  struct btrfs_delayed_ref_node *node,
694 					  u64 bytenr)
695 {
696 	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
697 	struct btrfs_key key;
698 	int ret;
699 
700 	key.objectid = bytenr;
701 	if (node->parent) {
702 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
703 		key.offset = node->parent;
704 	} else {
705 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
706 		key.offset = node->ref_root;
707 	}
708 
709 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
710 	btrfs_release_path(path);
711 	return ret;
712 }
713 
714 static inline int extent_ref_type(u64 parent, u64 owner)
715 {
716 	int type;
717 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
718 		if (parent > 0)
719 			type = BTRFS_SHARED_BLOCK_REF_KEY;
720 		else
721 			type = BTRFS_TREE_BLOCK_REF_KEY;
722 	} else {
723 		if (parent > 0)
724 			type = BTRFS_SHARED_DATA_REF_KEY;
725 		else
726 			type = BTRFS_EXTENT_DATA_REF_KEY;
727 	}
728 	return type;
729 }
730 
731 static int find_next_key(struct btrfs_path *path, int level,
732 			 struct btrfs_key *key)
733 
734 {
735 	for (; level < BTRFS_MAX_LEVEL; level++) {
736 		if (!path->nodes[level])
737 			break;
738 		if (path->slots[level] + 1 >=
739 		    btrfs_header_nritems(path->nodes[level]))
740 			continue;
741 		if (level == 0)
742 			btrfs_item_key_to_cpu(path->nodes[level], key,
743 					      path->slots[level] + 1);
744 		else
745 			btrfs_node_key_to_cpu(path->nodes[level], key,
746 					      path->slots[level] + 1);
747 		return 0;
748 	}
749 	return 1;
750 }
751 
752 /*
753  * look for inline back ref. if back ref is found, *ref_ret is set
754  * to the address of inline back ref, and 0 is returned.
755  *
756  * if back ref isn't found, *ref_ret is set to the address where it
757  * should be inserted, and -ENOENT is returned.
758  *
759  * if insert is true and there are too many inline back refs, the path
760  * points to the extent item, and -EAGAIN is returned.
761  *
762  * NOTE: inline back refs are ordered in the same way that back ref
763  *	 items in the tree are ordered.
764  */
765 static noinline_for_stack
766 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
767 				 struct btrfs_path *path,
768 				 struct btrfs_extent_inline_ref **ref_ret,
769 				 u64 bytenr, u64 num_bytes,
770 				 u64 parent, u64 root_objectid,
771 				 u64 owner, u64 offset, int insert)
772 {
773 	struct btrfs_fs_info *fs_info = trans->fs_info;
774 	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
775 	struct btrfs_key key;
776 	struct extent_buffer *leaf;
777 	struct btrfs_extent_item *ei;
778 	struct btrfs_extent_inline_ref *iref;
779 	u64 flags;
780 	u64 item_size;
781 	unsigned long ptr;
782 	unsigned long end;
783 	int extra_size;
784 	int type;
785 	int want;
786 	int ret;
787 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
788 	int needed;
789 
790 	key.objectid = bytenr;
791 	key.type = BTRFS_EXTENT_ITEM_KEY;
792 	key.offset = num_bytes;
793 
794 	want = extent_ref_type(parent, owner);
795 	if (insert) {
796 		extra_size = btrfs_extent_inline_ref_size(want);
797 		path->search_for_extension = 1;
798 		path->keep_locks = 1;
799 	} else
800 		extra_size = -1;
801 
802 	/*
803 	 * Owner is our level, so we can just add one to get the level for the
804 	 * block we are interested in.
805 	 */
806 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
807 		key.type = BTRFS_METADATA_ITEM_KEY;
808 		key.offset = owner;
809 	}
810 
811 again:
812 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
813 	if (ret < 0)
814 		goto out;
815 
816 	/*
817 	 * We may be a newly converted file system which still has the old fat
818 	 * extent entries for metadata, so try and see if we have one of those.
819 	 */
820 	if (ret > 0 && skinny_metadata) {
821 		skinny_metadata = false;
822 		if (path->slots[0]) {
823 			path->slots[0]--;
824 			btrfs_item_key_to_cpu(path->nodes[0], &key,
825 					      path->slots[0]);
826 			if (key.objectid == bytenr &&
827 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
828 			    key.offset == num_bytes)
829 				ret = 0;
830 		}
831 		if (ret) {
832 			key.objectid = bytenr;
833 			key.type = BTRFS_EXTENT_ITEM_KEY;
834 			key.offset = num_bytes;
835 			btrfs_release_path(path);
836 			goto again;
837 		}
838 	}
839 
840 	if (ret && !insert) {
841 		ret = -ENOENT;
842 		goto out;
843 	} else if (WARN_ON(ret)) {
844 		btrfs_print_leaf(path->nodes[0]);
845 		btrfs_err(fs_info,
846 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
847 			  bytenr, num_bytes, parent, root_objectid, owner,
848 			  offset);
849 		ret = -EUCLEAN;
850 		goto out;
851 	}
852 
853 	leaf = path->nodes[0];
854 	item_size = btrfs_item_size(leaf, path->slots[0]);
855 	if (unlikely(item_size < sizeof(*ei))) {
856 		ret = -EUCLEAN;
857 		btrfs_err(fs_info,
858 			  "unexpected extent item size, has %llu expect >= %zu",
859 			  item_size, sizeof(*ei));
860 		btrfs_abort_transaction(trans, ret);
861 		goto out;
862 	}
863 
864 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
865 	flags = btrfs_extent_flags(leaf, ei);
866 
867 	ptr = (unsigned long)(ei + 1);
868 	end = (unsigned long)ei + item_size;
869 
870 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
871 		ptr += sizeof(struct btrfs_tree_block_info);
872 		BUG_ON(ptr > end);
873 	}
874 
875 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
876 		needed = BTRFS_REF_TYPE_DATA;
877 	else
878 		needed = BTRFS_REF_TYPE_BLOCK;
879 
880 	ret = -ENOENT;
881 	while (ptr < end) {
882 		iref = (struct btrfs_extent_inline_ref *)ptr;
883 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
884 		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
885 			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
886 			ptr += btrfs_extent_inline_ref_size(type);
887 			continue;
888 		}
889 		if (type == BTRFS_REF_TYPE_INVALID) {
890 			ret = -EUCLEAN;
891 			goto out;
892 		}
893 
894 		if (want < type)
895 			break;
896 		if (want > type) {
897 			ptr += btrfs_extent_inline_ref_size(type);
898 			continue;
899 		}
900 
901 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
902 			struct btrfs_extent_data_ref *dref;
903 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
904 			if (match_extent_data_ref(leaf, dref, root_objectid,
905 						  owner, offset)) {
906 				ret = 0;
907 				break;
908 			}
909 			if (hash_extent_data_ref_item(leaf, dref) <
910 			    hash_extent_data_ref(root_objectid, owner, offset))
911 				break;
912 		} else {
913 			u64 ref_offset;
914 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
915 			if (parent > 0) {
916 				if (parent == ref_offset) {
917 					ret = 0;
918 					break;
919 				}
920 				if (ref_offset < parent)
921 					break;
922 			} else {
923 				if (root_objectid == ref_offset) {
924 					ret = 0;
925 					break;
926 				}
927 				if (ref_offset < root_objectid)
928 					break;
929 			}
930 		}
931 		ptr += btrfs_extent_inline_ref_size(type);
932 	}
933 
934 	if (unlikely(ptr > end)) {
935 		ret = -EUCLEAN;
936 		btrfs_print_leaf(path->nodes[0]);
937 		btrfs_crit(fs_info,
938 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
939 			   path->slots[0], root_objectid, owner, offset, parent);
940 		goto out;
941 	}
942 
943 	if (ret == -ENOENT && insert) {
944 		if (item_size + extra_size >=
945 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
946 			ret = -EAGAIN;
947 			goto out;
948 		}
949 		/*
950 		 * To add new inline back ref, we have to make sure
951 		 * there is no corresponding back ref item.
952 		 * For simplicity, we just do not add new inline back
953 		 * ref if there is any kind of item for this block
954 		 */
955 		if (find_next_key(path, 0, &key) == 0 &&
956 		    key.objectid == bytenr &&
957 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
958 			ret = -EAGAIN;
959 			goto out;
960 		}
961 	}
962 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963 out:
964 	if (insert) {
965 		path->keep_locks = 0;
966 		path->search_for_extension = 0;
967 		btrfs_unlock_up_safe(path, 1);
968 	}
969 	return ret;
970 }
971 
972 /*
973  * helper to add new inline back ref
974  */
975 static noinline_for_stack
976 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
977 				 struct btrfs_path *path,
978 				 struct btrfs_extent_inline_ref *iref,
979 				 u64 parent, u64 root_objectid,
980 				 u64 owner, u64 offset, int refs_to_add,
981 				 struct btrfs_delayed_extent_op *extent_op)
982 {
983 	struct extent_buffer *leaf;
984 	struct btrfs_extent_item *ei;
985 	unsigned long ptr;
986 	unsigned long end;
987 	unsigned long item_offset;
988 	u64 refs;
989 	int size;
990 	int type;
991 
992 	leaf = path->nodes[0];
993 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994 	item_offset = (unsigned long)iref - (unsigned long)ei;
995 
996 	type = extent_ref_type(parent, owner);
997 	size = btrfs_extent_inline_ref_size(type);
998 
999 	btrfs_extend_item(trans, path, size);
1000 
1001 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002 	refs = btrfs_extent_refs(leaf, ei);
1003 	refs += refs_to_add;
1004 	btrfs_set_extent_refs(leaf, ei, refs);
1005 	if (extent_op)
1006 		__run_delayed_extent_op(extent_op, leaf, ei);
1007 
1008 	ptr = (unsigned long)ei + item_offset;
1009 	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1010 	if (ptr < end - size)
1011 		memmove_extent_buffer(leaf, ptr + size, ptr,
1012 				      end - size - ptr);
1013 
1014 	iref = (struct btrfs_extent_inline_ref *)ptr;
1015 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017 		struct btrfs_extent_data_ref *dref;
1018 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024 		struct btrfs_shared_data_ref *sref;
1025 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030 	} else {
1031 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032 	}
1033 	btrfs_mark_buffer_dirty(trans, leaf);
1034 }
1035 
1036 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1037 				 struct btrfs_path *path,
1038 				 struct btrfs_extent_inline_ref **ref_ret,
1039 				 u64 bytenr, u64 num_bytes, u64 parent,
1040 				 u64 root_objectid, u64 owner, u64 offset)
1041 {
1042 	int ret;
1043 
1044 	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045 					   num_bytes, parent, root_objectid,
1046 					   owner, offset, 0);
1047 	if (ret != -ENOENT)
1048 		return ret;
1049 
1050 	btrfs_release_path(path);
1051 	*ref_ret = NULL;
1052 
1053 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054 		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055 					    root_objectid);
1056 	} else {
1057 		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058 					     root_objectid, owner, offset);
1059 	}
1060 	return ret;
1061 }
1062 
1063 /*
1064  * helper to update/remove inline back ref
1065  */
1066 static noinline_for_stack int update_inline_extent_backref(
1067 				  struct btrfs_trans_handle *trans,
1068 				  struct btrfs_path *path,
1069 				  struct btrfs_extent_inline_ref *iref,
1070 				  int refs_to_mod,
1071 				  struct btrfs_delayed_extent_op *extent_op)
1072 {
1073 	struct extent_buffer *leaf = path->nodes[0];
1074 	struct btrfs_fs_info *fs_info = leaf->fs_info;
1075 	struct btrfs_extent_item *ei;
1076 	struct btrfs_extent_data_ref *dref = NULL;
1077 	struct btrfs_shared_data_ref *sref = NULL;
1078 	unsigned long ptr;
1079 	unsigned long end;
1080 	u32 item_size;
1081 	int size;
1082 	int type;
1083 	u64 refs;
1084 
1085 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1086 	refs = btrfs_extent_refs(leaf, ei);
1087 	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1088 		struct btrfs_key key;
1089 		u32 extent_size;
1090 
1091 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1092 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1093 			extent_size = fs_info->nodesize;
1094 		else
1095 			extent_size = key.offset;
1096 		btrfs_print_leaf(leaf);
1097 		btrfs_err(fs_info,
1098 	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1099 			  key.objectid, extent_size, refs_to_mod, refs);
1100 		return -EUCLEAN;
1101 	}
1102 	refs += refs_to_mod;
1103 	btrfs_set_extent_refs(leaf, ei, refs);
1104 	if (extent_op)
1105 		__run_delayed_extent_op(extent_op, leaf, ei);
1106 
1107 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1108 	/*
1109 	 * Function btrfs_get_extent_inline_ref_type() has already printed
1110 	 * error messages.
1111 	 */
1112 	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1113 		return -EUCLEAN;
1114 
1115 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1116 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1117 		refs = btrfs_extent_data_ref_count(leaf, dref);
1118 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1119 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1120 		refs = btrfs_shared_data_ref_count(leaf, sref);
1121 	} else {
1122 		refs = 1;
1123 		/*
1124 		 * For tree blocks we can only drop one ref for it, and tree
1125 		 * blocks should not have refs > 1.
1126 		 *
1127 		 * Furthermore if we're inserting a new inline backref, we
1128 		 * won't reach this path either. That would be
1129 		 * setup_inline_extent_backref().
1130 		 */
1131 		if (unlikely(refs_to_mod != -1)) {
1132 			struct btrfs_key key;
1133 
1134 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 
1136 			btrfs_print_leaf(leaf);
1137 			btrfs_err(fs_info,
1138 			"invalid refs_to_mod for tree block %llu, has %d expect -1",
1139 				  key.objectid, refs_to_mod);
1140 			return -EUCLEAN;
1141 		}
1142 	}
1143 
1144 	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1145 		struct btrfs_key key;
1146 		u32 extent_size;
1147 
1148 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1149 		if (key.type == BTRFS_METADATA_ITEM_KEY)
1150 			extent_size = fs_info->nodesize;
1151 		else
1152 			extent_size = key.offset;
1153 		btrfs_print_leaf(leaf);
1154 		btrfs_err(fs_info,
1155 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1156 			  (unsigned long)iref, key.objectid, extent_size,
1157 			  refs_to_mod, refs);
1158 		return -EUCLEAN;
1159 	}
1160 	refs += refs_to_mod;
1161 
1162 	if (refs > 0) {
1163 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1164 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1165 		else
1166 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1167 	} else {
1168 		size =  btrfs_extent_inline_ref_size(type);
1169 		item_size = btrfs_item_size(leaf, path->slots[0]);
1170 		ptr = (unsigned long)iref;
1171 		end = (unsigned long)ei + item_size;
1172 		if (ptr + size < end)
1173 			memmove_extent_buffer(leaf, ptr, ptr + size,
1174 					      end - ptr - size);
1175 		item_size -= size;
1176 		btrfs_truncate_item(trans, path, item_size, 1);
1177 	}
1178 	btrfs_mark_buffer_dirty(trans, leaf);
1179 	return 0;
1180 }
1181 
1182 static noinline_for_stack
1183 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1184 				 struct btrfs_path *path,
1185 				 u64 bytenr, u64 num_bytes, u64 parent,
1186 				 u64 root_objectid, u64 owner,
1187 				 u64 offset, int refs_to_add,
1188 				 struct btrfs_delayed_extent_op *extent_op)
1189 {
1190 	struct btrfs_extent_inline_ref *iref;
1191 	int ret;
1192 
1193 	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1194 					   num_bytes, parent, root_objectid,
1195 					   owner, offset, 1);
1196 	if (ret == 0) {
1197 		/*
1198 		 * We're adding refs to a tree block we already own, this
1199 		 * should not happen at all.
1200 		 */
1201 		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1202 			btrfs_print_leaf(path->nodes[0]);
1203 			btrfs_crit(trans->fs_info,
1204 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1205 				   bytenr, num_bytes, root_objectid, path->slots[0]);
1206 			return -EUCLEAN;
1207 		}
1208 		ret = update_inline_extent_backref(trans, path, iref,
1209 						   refs_to_add, extent_op);
1210 	} else if (ret == -ENOENT) {
1211 		setup_inline_extent_backref(trans, path, iref, parent,
1212 					    root_objectid, owner, offset,
1213 					    refs_to_add, extent_op);
1214 		ret = 0;
1215 	}
1216 	return ret;
1217 }
1218 
1219 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1220 				 struct btrfs_root *root,
1221 				 struct btrfs_path *path,
1222 				 struct btrfs_extent_inline_ref *iref,
1223 				 int refs_to_drop, int is_data)
1224 {
1225 	int ret = 0;
1226 
1227 	BUG_ON(!is_data && refs_to_drop != 1);
1228 	if (iref)
1229 		ret = update_inline_extent_backref(trans, path, iref,
1230 						   -refs_to_drop, NULL);
1231 	else if (is_data)
1232 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1233 	else
1234 		ret = btrfs_del_item(trans, root, path);
1235 	return ret;
1236 }
1237 
1238 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1239 			       u64 *discarded_bytes)
1240 {
1241 	int j, ret = 0;
1242 	u64 bytes_left, end;
1243 	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1244 
1245 	/* Adjust the range to be aligned to 512B sectors if necessary. */
1246 	if (start != aligned_start) {
1247 		len -= aligned_start - start;
1248 		len = round_down(len, 1 << SECTOR_SHIFT);
1249 		start = aligned_start;
1250 	}
1251 
1252 	*discarded_bytes = 0;
1253 
1254 	if (!len)
1255 		return 0;
1256 
1257 	end = start + len;
1258 	bytes_left = len;
1259 
1260 	/* Skip any superblocks on this device. */
1261 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262 		u64 sb_start = btrfs_sb_offset(j);
1263 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264 		u64 size = sb_start - start;
1265 
1266 		if (!in_range(sb_start, start, bytes_left) &&
1267 		    !in_range(sb_end, start, bytes_left) &&
1268 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269 			continue;
1270 
1271 		/*
1272 		 * Superblock spans beginning of range.  Adjust start and
1273 		 * try again.
1274 		 */
1275 		if (sb_start <= start) {
1276 			start += sb_end - start;
1277 			if (start > end) {
1278 				bytes_left = 0;
1279 				break;
1280 			}
1281 			bytes_left = end - start;
1282 			continue;
1283 		}
1284 
1285 		if (size) {
1286 			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1287 						   size >> SECTOR_SHIFT,
1288 						   GFP_NOFS);
1289 			if (!ret)
1290 				*discarded_bytes += size;
1291 			else if (ret != -EOPNOTSUPP)
1292 				return ret;
1293 		}
1294 
1295 		start = sb_end;
1296 		if (start > end) {
1297 			bytes_left = 0;
1298 			break;
1299 		}
1300 		bytes_left = end - start;
1301 	}
1302 
1303 	while (bytes_left) {
1304 		u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left);
1305 
1306 		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1307 					   bytes_to_discard >> SECTOR_SHIFT,
1308 					   GFP_NOFS);
1309 
1310 		if (ret) {
1311 			if (ret != -EOPNOTSUPP)
1312 				break;
1313 			continue;
1314 		}
1315 
1316 		start += bytes_to_discard;
1317 		bytes_left -= bytes_to_discard;
1318 		*discarded_bytes += bytes_to_discard;
1319 
1320 		if (btrfs_trim_interrupted()) {
1321 			ret = -ERESTARTSYS;
1322 			break;
1323 		}
1324 	}
1325 
1326 	return ret;
1327 }
1328 
1329 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1330 {
1331 	struct btrfs_device *dev = stripe->dev;
1332 	struct btrfs_fs_info *fs_info = dev->fs_info;
1333 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1334 	u64 phys = stripe->physical;
1335 	u64 len = stripe->length;
1336 	u64 discarded = 0;
1337 	int ret = 0;
1338 
1339 	/* Zone reset on a zoned filesystem */
1340 	if (btrfs_can_zone_reset(dev, phys, len)) {
1341 		u64 src_disc;
1342 
1343 		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1344 		if (ret)
1345 			goto out;
1346 
1347 		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1348 		    dev != dev_replace->srcdev)
1349 			goto out;
1350 
1351 		src_disc = discarded;
1352 
1353 		/* Send to replace target as well */
1354 		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1355 					      &discarded);
1356 		discarded += src_disc;
1357 	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1358 		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1359 	} else {
1360 		ret = 0;
1361 		*bytes = 0;
1362 	}
1363 
1364 out:
1365 	*bytes = discarded;
1366 	return ret;
1367 }
1368 
1369 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1370 			 u64 num_bytes, u64 *actual_bytes)
1371 {
1372 	int ret = 0;
1373 	u64 discarded_bytes = 0;
1374 	u64 end = bytenr + num_bytes;
1375 	u64 cur = bytenr;
1376 
1377 	/*
1378 	 * Avoid races with device replace and make sure the devices in the
1379 	 * stripes don't go away while we are discarding.
1380 	 */
1381 	btrfs_bio_counter_inc_blocked(fs_info);
1382 	while (cur < end) {
1383 		struct btrfs_discard_stripe *stripes;
1384 		unsigned int num_stripes;
1385 		int i;
1386 
1387 		num_bytes = end - cur;
1388 		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1389 		if (IS_ERR(stripes)) {
1390 			ret = PTR_ERR(stripes);
1391 			if (ret == -EOPNOTSUPP)
1392 				ret = 0;
1393 			break;
1394 		}
1395 
1396 		for (i = 0; i < num_stripes; i++) {
1397 			struct btrfs_discard_stripe *stripe = stripes + i;
1398 			u64 bytes;
1399 
1400 			if (!stripe->dev->bdev) {
1401 				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1402 				continue;
1403 			}
1404 
1405 			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1406 					&stripe->dev->dev_state))
1407 				continue;
1408 
1409 			ret = do_discard_extent(stripe, &bytes);
1410 			if (ret) {
1411 				/*
1412 				 * Keep going if discard is not supported by the
1413 				 * device.
1414 				 */
1415 				if (ret != -EOPNOTSUPP)
1416 					break;
1417 				ret = 0;
1418 			} else {
1419 				discarded_bytes += bytes;
1420 			}
1421 		}
1422 		kfree(stripes);
1423 		if (ret)
1424 			break;
1425 		cur += num_bytes;
1426 	}
1427 	btrfs_bio_counter_dec(fs_info);
1428 	if (actual_bytes)
1429 		*actual_bytes = discarded_bytes;
1430 	return ret;
1431 }
1432 
1433 /* Can return -ENOMEM */
1434 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1435 			 struct btrfs_ref *generic_ref)
1436 {
1437 	struct btrfs_fs_info *fs_info = trans->fs_info;
1438 	int ret;
1439 
1440 	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1441 	       generic_ref->action);
1442 	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1443 	       generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);
1444 
1445 	if (generic_ref->type == BTRFS_REF_METADATA)
1446 		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1447 	else
1448 		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1449 
1450 	btrfs_ref_tree_mod(fs_info, generic_ref);
1451 
1452 	return ret;
1453 }
1454 
1455 /*
1456  * Insert backreference for a given extent.
1457  *
1458  * The counterpart is in __btrfs_free_extent(), with examples and more details
1459  * how it works.
1460  *
1461  * @trans:	    Handle of transaction
1462  *
1463  * @node:	    The delayed ref node used to get the bytenr/length for
1464  *		    extent whose references are incremented.
1465  *
1466  * @extent_op       Pointer to a structure, holding information necessary when
1467  *                  updating a tree block's flags
1468  *
1469  */
1470 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1471 				  struct btrfs_delayed_ref_node *node,
1472 				  struct btrfs_delayed_extent_op *extent_op)
1473 {
1474 	struct btrfs_path *path;
1475 	struct extent_buffer *leaf;
1476 	struct btrfs_extent_item *item;
1477 	struct btrfs_key key;
1478 	u64 bytenr = node->bytenr;
1479 	u64 num_bytes = node->num_bytes;
1480 	u64 owner = btrfs_delayed_ref_owner(node);
1481 	u64 offset = btrfs_delayed_ref_offset(node);
1482 	u64 refs;
1483 	int refs_to_add = node->ref_mod;
1484 	int ret;
1485 
1486 	path = btrfs_alloc_path();
1487 	if (!path)
1488 		return -ENOMEM;
1489 
1490 	/* this will setup the path even if it fails to insert the back ref */
1491 	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1492 					   node->parent, node->ref_root, owner,
1493 					   offset, refs_to_add, extent_op);
1494 	if ((ret < 0 && ret != -EAGAIN) || !ret)
1495 		goto out;
1496 
1497 	/*
1498 	 * Ok we had -EAGAIN which means we didn't have space to insert and
1499 	 * inline extent ref, so just update the reference count and add a
1500 	 * normal backref.
1501 	 */
1502 	leaf = path->nodes[0];
1503 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1504 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1505 	refs = btrfs_extent_refs(leaf, item);
1506 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1507 	if (extent_op)
1508 		__run_delayed_extent_op(extent_op, leaf, item);
1509 
1510 	btrfs_mark_buffer_dirty(trans, leaf);
1511 	btrfs_release_path(path);
1512 
1513 	/* now insert the actual backref */
1514 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1515 		ret = insert_tree_block_ref(trans, path, node, bytenr);
1516 	else
1517 		ret = insert_extent_data_ref(trans, path, node, bytenr);
1518 
1519 	if (ret)
1520 		btrfs_abort_transaction(trans, ret);
1521 out:
1522 	btrfs_free_path(path);
1523 	return ret;
1524 }
1525 
1526 static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
1527 				     struct btrfs_delayed_ref_head *href)
1528 {
1529 	u64 root = href->owning_root;
1530 
1531 	/*
1532 	 * Don't check must_insert_reserved, as this is called from contexts
1533 	 * where it has already been unset.
1534 	 */
1535 	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
1536 	    !href->is_data || !is_fstree(root))
1537 		return;
1538 
1539 	btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
1540 				  BTRFS_QGROUP_RSV_DATA);
1541 }
1542 
1543 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1544 				struct btrfs_delayed_ref_head *href,
1545 				struct btrfs_delayed_ref_node *node,
1546 				struct btrfs_delayed_extent_op *extent_op,
1547 				bool insert_reserved)
1548 {
1549 	int ret = 0;
1550 	u64 parent = 0;
1551 	u64 flags = 0;
1552 
1553 	trace_run_delayed_data_ref(trans->fs_info, node);
1554 
1555 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1556 		parent = node->parent;
1557 
1558 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1559 		struct btrfs_key key;
1560 		struct btrfs_squota_delta delta = {
1561 			.root = href->owning_root,
1562 			.num_bytes = node->num_bytes,
1563 			.is_data = true,
1564 			.is_inc	= true,
1565 			.generation = trans->transid,
1566 		};
1567 		u64 owner = btrfs_delayed_ref_owner(node);
1568 		u64 offset = btrfs_delayed_ref_offset(node);
1569 
1570 		if (extent_op)
1571 			flags |= extent_op->flags_to_set;
1572 
1573 		key.objectid = node->bytenr;
1574 		key.type = BTRFS_EXTENT_ITEM_KEY;
1575 		key.offset = node->num_bytes;
1576 
1577 		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
1578 						 flags, owner, offset, &key,
1579 						 node->ref_mod,
1580 						 href->owning_root);
1581 		free_head_ref_squota_rsv(trans->fs_info, href);
1582 		if (!ret)
1583 			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
1584 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1585 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1586 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1587 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1588 	} else {
1589 		BUG();
1590 	}
1591 	return ret;
1592 }
1593 
1594 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1595 				    struct extent_buffer *leaf,
1596 				    struct btrfs_extent_item *ei)
1597 {
1598 	u64 flags = btrfs_extent_flags(leaf, ei);
1599 	if (extent_op->update_flags) {
1600 		flags |= extent_op->flags_to_set;
1601 		btrfs_set_extent_flags(leaf, ei, flags);
1602 	}
1603 
1604 	if (extent_op->update_key) {
1605 		struct btrfs_tree_block_info *bi;
1606 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1607 		bi = (struct btrfs_tree_block_info *)(ei + 1);
1608 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1609 	}
1610 }
1611 
1612 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1613 				 struct btrfs_delayed_ref_head *head,
1614 				 struct btrfs_delayed_extent_op *extent_op)
1615 {
1616 	struct btrfs_fs_info *fs_info = trans->fs_info;
1617 	struct btrfs_root *root;
1618 	struct btrfs_key key;
1619 	struct btrfs_path *path;
1620 	struct btrfs_extent_item *ei;
1621 	struct extent_buffer *leaf;
1622 	u32 item_size;
1623 	int ret;
1624 	int metadata = 1;
1625 
1626 	if (TRANS_ABORTED(trans))
1627 		return 0;
1628 
1629 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1630 		metadata = 0;
1631 
1632 	path = btrfs_alloc_path();
1633 	if (!path)
1634 		return -ENOMEM;
1635 
1636 	key.objectid = head->bytenr;
1637 
1638 	if (metadata) {
1639 		key.type = BTRFS_METADATA_ITEM_KEY;
1640 		key.offset = head->level;
1641 	} else {
1642 		key.type = BTRFS_EXTENT_ITEM_KEY;
1643 		key.offset = head->num_bytes;
1644 	}
1645 
1646 	root = btrfs_extent_root(fs_info, key.objectid);
1647 again:
1648 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1649 	if (ret < 0) {
1650 		goto out;
1651 	} else if (ret > 0) {
1652 		if (metadata) {
1653 			if (path->slots[0] > 0) {
1654 				path->slots[0]--;
1655 				btrfs_item_key_to_cpu(path->nodes[0], &key,
1656 						      path->slots[0]);
1657 				if (key.objectid == head->bytenr &&
1658 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1659 				    key.offset == head->num_bytes)
1660 					ret = 0;
1661 			}
1662 			if (ret > 0) {
1663 				btrfs_release_path(path);
1664 				metadata = 0;
1665 
1666 				key.objectid = head->bytenr;
1667 				key.offset = head->num_bytes;
1668 				key.type = BTRFS_EXTENT_ITEM_KEY;
1669 				goto again;
1670 			}
1671 		} else {
1672 			ret = -EUCLEAN;
1673 			btrfs_err(fs_info,
1674 		  "missing extent item for extent %llu num_bytes %llu level %d",
1675 				  head->bytenr, head->num_bytes, head->level);
1676 			goto out;
1677 		}
1678 	}
1679 
1680 	leaf = path->nodes[0];
1681 	item_size = btrfs_item_size(leaf, path->slots[0]);
1682 
1683 	if (unlikely(item_size < sizeof(*ei))) {
1684 		ret = -EUCLEAN;
1685 		btrfs_err(fs_info,
1686 			  "unexpected extent item size, has %u expect >= %zu",
1687 			  item_size, sizeof(*ei));
1688 		btrfs_abort_transaction(trans, ret);
1689 		goto out;
1690 	}
1691 
1692 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1693 	__run_delayed_extent_op(extent_op, leaf, ei);
1694 
1695 	btrfs_mark_buffer_dirty(trans, leaf);
1696 out:
1697 	btrfs_free_path(path);
1698 	return ret;
1699 }
1700 
1701 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1702 				struct btrfs_delayed_ref_head *href,
1703 				struct btrfs_delayed_ref_node *node,
1704 				struct btrfs_delayed_extent_op *extent_op,
1705 				bool insert_reserved)
1706 {
1707 	int ret = 0;
1708 	struct btrfs_fs_info *fs_info = trans->fs_info;
1709 	u64 parent = 0;
1710 	u64 ref_root = 0;
1711 
1712 	trace_run_delayed_tree_ref(trans->fs_info, node);
1713 
1714 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1715 		parent = node->parent;
1716 	ref_root = node->ref_root;
1717 
1718 	if (unlikely(node->ref_mod != 1)) {
1719 		btrfs_err(trans->fs_info,
1720 	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1721 			  node->bytenr, node->ref_mod, node->action, ref_root,
1722 			  parent);
1723 		return -EUCLEAN;
1724 	}
1725 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1726 		struct btrfs_squota_delta delta = {
1727 			.root = href->owning_root,
1728 			.num_bytes = fs_info->nodesize,
1729 			.is_data = false,
1730 			.is_inc = true,
1731 			.generation = trans->transid,
1732 		};
1733 
1734 		ret = alloc_reserved_tree_block(trans, node, extent_op);
1735 		if (!ret)
1736 			btrfs_record_squota_delta(fs_info, &delta);
1737 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1738 		ret = __btrfs_inc_extent_ref(trans, node, extent_op);
1739 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1740 		ret = __btrfs_free_extent(trans, href, node, extent_op);
1741 	} else {
1742 		BUG();
1743 	}
1744 	return ret;
1745 }
1746 
1747 /* helper function to actually process a single delayed ref entry */
1748 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1749 			       struct btrfs_delayed_ref_head *href,
1750 			       struct btrfs_delayed_ref_node *node,
1751 			       struct btrfs_delayed_extent_op *extent_op,
1752 			       bool insert_reserved)
1753 {
1754 	int ret = 0;
1755 
1756 	if (TRANS_ABORTED(trans)) {
1757 		if (insert_reserved) {
1758 			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1759 			free_head_ref_squota_rsv(trans->fs_info, href);
1760 		}
1761 		return 0;
1762 	}
1763 
1764 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1765 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1766 		ret = run_delayed_tree_ref(trans, href, node, extent_op,
1767 					   insert_reserved);
1768 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1769 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1770 		ret = run_delayed_data_ref(trans, href, node, extent_op,
1771 					   insert_reserved);
1772 	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
1773 		ret = 0;
1774 	else
1775 		BUG();
1776 	if (ret && insert_reserved)
1777 		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1778 	if (ret < 0)
1779 		btrfs_err(trans->fs_info,
1780 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1781 			  node->bytenr, node->num_bytes, node->type,
1782 			  node->action, node->ref_mod, ret);
1783 	return ret;
1784 }
1785 
1786 static inline struct btrfs_delayed_ref_node *
1787 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1788 {
1789 	struct btrfs_delayed_ref_node *ref;
1790 
1791 	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1792 		return NULL;
1793 
1794 	/*
1795 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1796 	 * This is to prevent a ref count from going down to zero, which deletes
1797 	 * the extent item from the extent tree, when there still are references
1798 	 * to add, which would fail because they would not find the extent item.
1799 	 */
1800 	if (!list_empty(&head->ref_add_list))
1801 		return list_first_entry(&head->ref_add_list,
1802 				struct btrfs_delayed_ref_node, add_list);
1803 
1804 	ref = rb_entry(rb_first_cached(&head->ref_tree),
1805 		       struct btrfs_delayed_ref_node, ref_node);
1806 	ASSERT(list_empty(&ref->add_list));
1807 	return ref;
1808 }
1809 
1810 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1811 				      struct btrfs_delayed_ref_head *head)
1812 {
1813 	spin_lock(&delayed_refs->lock);
1814 	head->processing = false;
1815 	delayed_refs->num_heads_ready++;
1816 	spin_unlock(&delayed_refs->lock);
1817 	btrfs_delayed_ref_unlock(head);
1818 }
1819 
1820 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1821 				struct btrfs_delayed_ref_head *head)
1822 {
1823 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1824 
1825 	if (!extent_op)
1826 		return NULL;
1827 
1828 	if (head->must_insert_reserved) {
1829 		head->extent_op = NULL;
1830 		btrfs_free_delayed_extent_op(extent_op);
1831 		return NULL;
1832 	}
1833 	return extent_op;
1834 }
1835 
1836 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1837 				     struct btrfs_delayed_ref_head *head)
1838 {
1839 	struct btrfs_delayed_extent_op *extent_op;
1840 	int ret;
1841 
1842 	extent_op = cleanup_extent_op(head);
1843 	if (!extent_op)
1844 		return 0;
1845 	head->extent_op = NULL;
1846 	spin_unlock(&head->lock);
1847 	ret = run_delayed_extent_op(trans, head, extent_op);
1848 	btrfs_free_delayed_extent_op(extent_op);
1849 	return ret ? ret : 1;
1850 }
1851 
1852 u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1853 				  struct btrfs_delayed_ref_root *delayed_refs,
1854 				  struct btrfs_delayed_ref_head *head)
1855 {
1856 	u64 ret = 0;
1857 
1858 	/*
1859 	 * We had csum deletions accounted for in our delayed refs rsv, we need
1860 	 * to drop the csum leaves for this update from our delayed_refs_rsv.
1861 	 */
1862 	if (head->total_ref_mod < 0 && head->is_data) {
1863 		int nr_csums;
1864 
1865 		spin_lock(&delayed_refs->lock);
1866 		delayed_refs->pending_csums -= head->num_bytes;
1867 		spin_unlock(&delayed_refs->lock);
1868 		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1869 
1870 		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
1871 
1872 		ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
1873 	}
1874 	/* must_insert_reserved can be set only if we didn't run the head ref. */
1875 	if (head->must_insert_reserved)
1876 		free_head_ref_squota_rsv(fs_info, head);
1877 
1878 	return ret;
1879 }
1880 
1881 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1882 			    struct btrfs_delayed_ref_head *head,
1883 			    u64 *bytes_released)
1884 {
1885 
1886 	struct btrfs_fs_info *fs_info = trans->fs_info;
1887 	struct btrfs_delayed_ref_root *delayed_refs;
1888 	int ret;
1889 
1890 	delayed_refs = &trans->transaction->delayed_refs;
1891 
1892 	ret = run_and_cleanup_extent_op(trans, head);
1893 	if (ret < 0) {
1894 		unselect_delayed_ref_head(delayed_refs, head);
1895 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1896 		return ret;
1897 	} else if (ret) {
1898 		return ret;
1899 	}
1900 
1901 	/*
1902 	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1903 	 * and then re-check to make sure nobody got added.
1904 	 */
1905 	spin_unlock(&head->lock);
1906 	spin_lock(&delayed_refs->lock);
1907 	spin_lock(&head->lock);
1908 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1909 		spin_unlock(&head->lock);
1910 		spin_unlock(&delayed_refs->lock);
1911 		return 1;
1912 	}
1913 	btrfs_delete_ref_head(delayed_refs, head);
1914 	spin_unlock(&head->lock);
1915 	spin_unlock(&delayed_refs->lock);
1916 
1917 	if (head->must_insert_reserved) {
1918 		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1919 		if (head->is_data) {
1920 			struct btrfs_root *csum_root;
1921 
1922 			csum_root = btrfs_csum_root(fs_info, head->bytenr);
1923 			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1924 					      head->num_bytes);
1925 		}
1926 	}
1927 
1928 	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1929 
1930 	trace_run_delayed_ref_head(fs_info, head, 0);
1931 	btrfs_delayed_ref_unlock(head);
1932 	btrfs_put_delayed_ref_head(head);
1933 	return ret;
1934 }
1935 
1936 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1937 					struct btrfs_trans_handle *trans)
1938 {
1939 	struct btrfs_delayed_ref_root *delayed_refs =
1940 		&trans->transaction->delayed_refs;
1941 	struct btrfs_delayed_ref_head *head = NULL;
1942 	int ret;
1943 
1944 	spin_lock(&delayed_refs->lock);
1945 	head = btrfs_select_ref_head(delayed_refs);
1946 	if (!head) {
1947 		spin_unlock(&delayed_refs->lock);
1948 		return head;
1949 	}
1950 
1951 	/*
1952 	 * Grab the lock that says we are going to process all the refs for
1953 	 * this head
1954 	 */
1955 	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1956 	spin_unlock(&delayed_refs->lock);
1957 
1958 	/*
1959 	 * We may have dropped the spin lock to get the head mutex lock, and
1960 	 * that might have given someone else time to free the head.  If that's
1961 	 * true, it has been removed from our list and we can move on.
1962 	 */
1963 	if (ret == -EAGAIN)
1964 		head = ERR_PTR(-EAGAIN);
1965 
1966 	return head;
1967 }
1968 
1969 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1970 					   struct btrfs_delayed_ref_head *locked_ref,
1971 					   u64 *bytes_released)
1972 {
1973 	struct btrfs_fs_info *fs_info = trans->fs_info;
1974 	struct btrfs_delayed_ref_root *delayed_refs;
1975 	struct btrfs_delayed_extent_op *extent_op;
1976 	struct btrfs_delayed_ref_node *ref;
1977 	bool must_insert_reserved;
1978 	int ret;
1979 
1980 	delayed_refs = &trans->transaction->delayed_refs;
1981 
1982 	lockdep_assert_held(&locked_ref->mutex);
1983 	lockdep_assert_held(&locked_ref->lock);
1984 
1985 	while ((ref = select_delayed_ref(locked_ref))) {
1986 		if (ref->seq &&
1987 		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1988 			spin_unlock(&locked_ref->lock);
1989 			unselect_delayed_ref_head(delayed_refs, locked_ref);
1990 			return -EAGAIN;
1991 		}
1992 
1993 		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1994 		RB_CLEAR_NODE(&ref->ref_node);
1995 		if (!list_empty(&ref->add_list))
1996 			list_del(&ref->add_list);
1997 		/*
1998 		 * When we play the delayed ref, also correct the ref_mod on
1999 		 * head
2000 		 */
2001 		switch (ref->action) {
2002 		case BTRFS_ADD_DELAYED_REF:
2003 		case BTRFS_ADD_DELAYED_EXTENT:
2004 			locked_ref->ref_mod -= ref->ref_mod;
2005 			break;
2006 		case BTRFS_DROP_DELAYED_REF:
2007 			locked_ref->ref_mod += ref->ref_mod;
2008 			break;
2009 		default:
2010 			WARN_ON(1);
2011 		}
2012 		atomic_dec(&delayed_refs->num_entries);
2013 
2014 		/*
2015 		 * Record the must_insert_reserved flag before we drop the
2016 		 * spin lock.
2017 		 */
2018 		must_insert_reserved = locked_ref->must_insert_reserved;
2019 		/*
2020 		 * Unsetting this on the head ref relinquishes ownership of
2021 		 * the rsv_bytes, so it is critical that every possible code
2022 		 * path from here forward frees all reserves including qgroup
2023 		 * reserve.
2024 		 */
2025 		locked_ref->must_insert_reserved = false;
2026 
2027 		extent_op = locked_ref->extent_op;
2028 		locked_ref->extent_op = NULL;
2029 		spin_unlock(&locked_ref->lock);
2030 
2031 		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
2032 					  must_insert_reserved);
2033 		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
2034 		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
2035 
2036 		btrfs_free_delayed_extent_op(extent_op);
2037 		if (ret) {
2038 			unselect_delayed_ref_head(delayed_refs, locked_ref);
2039 			btrfs_put_delayed_ref(ref);
2040 			return ret;
2041 		}
2042 
2043 		btrfs_put_delayed_ref(ref);
2044 		cond_resched();
2045 
2046 		spin_lock(&locked_ref->lock);
2047 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2048 	}
2049 
2050 	return 0;
2051 }
2052 
2053 /*
2054  * Returns 0 on success or if called with an already aborted transaction.
2055  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2056  */
2057 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2058 					     u64 min_bytes)
2059 {
2060 	struct btrfs_fs_info *fs_info = trans->fs_info;
2061 	struct btrfs_delayed_ref_root *delayed_refs;
2062 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2063 	int ret;
2064 	unsigned long count = 0;
2065 	unsigned long max_count = 0;
2066 	u64 bytes_processed = 0;
2067 
2068 	delayed_refs = &trans->transaction->delayed_refs;
2069 	if (min_bytes == 0) {
2070 		max_count = delayed_refs->num_heads_ready;
2071 		min_bytes = U64_MAX;
2072 	}
2073 
2074 	do {
2075 		if (!locked_ref) {
2076 			locked_ref = btrfs_obtain_ref_head(trans);
2077 			if (IS_ERR_OR_NULL(locked_ref)) {
2078 				if (PTR_ERR(locked_ref) == -EAGAIN) {
2079 					continue;
2080 				} else {
2081 					break;
2082 				}
2083 			}
2084 			count++;
2085 		}
2086 		/*
2087 		 * We need to try and merge add/drops of the same ref since we
2088 		 * can run into issues with relocate dropping the implicit ref
2089 		 * and then it being added back again before the drop can
2090 		 * finish.  If we merged anything we need to re-loop so we can
2091 		 * get a good ref.
2092 		 * Or we can get node references of the same type that weren't
2093 		 * merged when created due to bumps in the tree mod seq, and
2094 		 * we need to merge them to prevent adding an inline extent
2095 		 * backref before dropping it (triggering a BUG_ON at
2096 		 * insert_inline_extent_backref()).
2097 		 */
2098 		spin_lock(&locked_ref->lock);
2099 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2100 
2101 		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
2102 		if (ret < 0 && ret != -EAGAIN) {
2103 			/*
2104 			 * Error, btrfs_run_delayed_refs_for_head already
2105 			 * unlocked everything so just bail out
2106 			 */
2107 			return ret;
2108 		} else if (!ret) {
2109 			/*
2110 			 * Success, perform the usual cleanup of a processed
2111 			 * head
2112 			 */
2113 			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
2114 			if (ret > 0 ) {
2115 				/* We dropped our lock, we need to loop. */
2116 				ret = 0;
2117 				continue;
2118 			} else if (ret) {
2119 				return ret;
2120 			}
2121 		}
2122 
2123 		/*
2124 		 * Either success case or btrfs_run_delayed_refs_for_head
2125 		 * returned -EAGAIN, meaning we need to select another head
2126 		 */
2127 
2128 		locked_ref = NULL;
2129 		cond_resched();
2130 	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
2131 		 (max_count > 0 && count < max_count) ||
2132 		 locked_ref);
2133 
2134 	return 0;
2135 }
2136 
2137 #ifdef SCRAMBLE_DELAYED_REFS
2138 /*
2139  * Normally delayed refs get processed in ascending bytenr order. This
2140  * correlates in most cases to the order added. To expose dependencies on this
2141  * order, we start to process the tree in the middle instead of the beginning
2142  */
2143 static u64 find_middle(struct rb_root *root)
2144 {
2145 	struct rb_node *n = root->rb_node;
2146 	struct btrfs_delayed_ref_node *entry;
2147 	int alt = 1;
2148 	u64 middle;
2149 	u64 first = 0, last = 0;
2150 
2151 	n = rb_first(root);
2152 	if (n) {
2153 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2154 		first = entry->bytenr;
2155 	}
2156 	n = rb_last(root);
2157 	if (n) {
2158 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2159 		last = entry->bytenr;
2160 	}
2161 	n = root->rb_node;
2162 
2163 	while (n) {
2164 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2165 		WARN_ON(!entry->in_tree);
2166 
2167 		middle = entry->bytenr;
2168 
2169 		if (alt)
2170 			n = n->rb_left;
2171 		else
2172 			n = n->rb_right;
2173 
2174 		alt = 1 - alt;
2175 	}
2176 	return middle;
2177 }
2178 #endif
2179 
2180 /*
2181  * Start processing the delayed reference count updates and extent insertions
2182  * we have queued up so far.
2183  *
2184  * @trans:	Transaction handle.
2185  * @min_bytes:	How many bytes of delayed references to process. After this
2186  *		many bytes we stop processing delayed references if there are
2187  *		any more. If 0 it means to run all existing delayed references,
2188  *		but not new ones added after running all existing ones.
2189  *		Use (u64)-1 (U64_MAX) to run all existing delayed references
2190  *		plus any new ones that are added.
2191  *
2192  * Returns 0 on success or if called with an aborted transaction
2193  * Returns <0 on error and aborts the transaction
2194  */
2195 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
2196 {
2197 	struct btrfs_fs_info *fs_info = trans->fs_info;
2198 	struct btrfs_delayed_ref_root *delayed_refs;
2199 	int ret;
2200 
2201 	/* We'll clean this up in btrfs_cleanup_transaction */
2202 	if (TRANS_ABORTED(trans))
2203 		return 0;
2204 
2205 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2206 		return 0;
2207 
2208 	delayed_refs = &trans->transaction->delayed_refs;
2209 again:
2210 #ifdef SCRAMBLE_DELAYED_REFS
2211 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2212 #endif
2213 	ret = __btrfs_run_delayed_refs(trans, min_bytes);
2214 	if (ret < 0) {
2215 		btrfs_abort_transaction(trans, ret);
2216 		return ret;
2217 	}
2218 
2219 	if (min_bytes == U64_MAX) {
2220 		btrfs_create_pending_block_groups(trans);
2221 
2222 		spin_lock(&delayed_refs->lock);
2223 		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2224 			spin_unlock(&delayed_refs->lock);
2225 			return 0;
2226 		}
2227 		spin_unlock(&delayed_refs->lock);
2228 
2229 		cond_resched();
2230 		goto again;
2231 	}
2232 
2233 	return 0;
2234 }
2235 
2236 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2237 				struct extent_buffer *eb, u64 flags)
2238 {
2239 	struct btrfs_delayed_extent_op *extent_op;
2240 	int ret;
2241 
2242 	extent_op = btrfs_alloc_delayed_extent_op();
2243 	if (!extent_op)
2244 		return -ENOMEM;
2245 
2246 	extent_op->flags_to_set = flags;
2247 	extent_op->update_flags = true;
2248 	extent_op->update_key = false;
2249 
2250 	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
2251 					  btrfs_header_level(eb), extent_op);
2252 	if (ret)
2253 		btrfs_free_delayed_extent_op(extent_op);
2254 	return ret;
2255 }
2256 
2257 static noinline int check_delayed_ref(struct btrfs_root *root,
2258 				      struct btrfs_path *path,
2259 				      u64 objectid, u64 offset, u64 bytenr)
2260 {
2261 	struct btrfs_delayed_ref_head *head;
2262 	struct btrfs_delayed_ref_node *ref;
2263 	struct btrfs_delayed_ref_root *delayed_refs;
2264 	struct btrfs_transaction *cur_trans;
2265 	struct rb_node *node;
2266 	int ret = 0;
2267 
2268 	spin_lock(&root->fs_info->trans_lock);
2269 	cur_trans = root->fs_info->running_transaction;
2270 	if (cur_trans)
2271 		refcount_inc(&cur_trans->use_count);
2272 	spin_unlock(&root->fs_info->trans_lock);
2273 	if (!cur_trans)
2274 		return 0;
2275 
2276 	delayed_refs = &cur_trans->delayed_refs;
2277 	spin_lock(&delayed_refs->lock);
2278 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2279 	if (!head) {
2280 		spin_unlock(&delayed_refs->lock);
2281 		btrfs_put_transaction(cur_trans);
2282 		return 0;
2283 	}
2284 
2285 	if (!mutex_trylock(&head->mutex)) {
2286 		if (path->nowait) {
2287 			spin_unlock(&delayed_refs->lock);
2288 			btrfs_put_transaction(cur_trans);
2289 			return -EAGAIN;
2290 		}
2291 
2292 		refcount_inc(&head->refs);
2293 		spin_unlock(&delayed_refs->lock);
2294 
2295 		btrfs_release_path(path);
2296 
2297 		/*
2298 		 * Mutex was contended, block until it's released and let
2299 		 * caller try again
2300 		 */
2301 		mutex_lock(&head->mutex);
2302 		mutex_unlock(&head->mutex);
2303 		btrfs_put_delayed_ref_head(head);
2304 		btrfs_put_transaction(cur_trans);
2305 		return -EAGAIN;
2306 	}
2307 	spin_unlock(&delayed_refs->lock);
2308 
2309 	spin_lock(&head->lock);
2310 	/*
2311 	 * XXX: We should replace this with a proper search function in the
2312 	 * future.
2313 	 */
2314 	for (node = rb_first_cached(&head->ref_tree); node;
2315 	     node = rb_next(node)) {
2316 		u64 ref_owner;
2317 		u64 ref_offset;
2318 
2319 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2320 		/* If it's a shared ref we know a cross reference exists */
2321 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2322 			ret = 1;
2323 			break;
2324 		}
2325 
2326 		ref_owner = btrfs_delayed_ref_owner(ref);
2327 		ref_offset = btrfs_delayed_ref_offset(ref);
2328 
2329 		/*
2330 		 * If our ref doesn't match the one we're currently looking at
2331 		 * then we have a cross reference.
2332 		 */
2333 		if (ref->ref_root != btrfs_root_id(root) ||
2334 		    ref_owner != objectid || ref_offset != offset) {
2335 			ret = 1;
2336 			break;
2337 		}
2338 	}
2339 	spin_unlock(&head->lock);
2340 	mutex_unlock(&head->mutex);
2341 	btrfs_put_transaction(cur_trans);
2342 	return ret;
2343 }
2344 
2345 static noinline int check_committed_ref(struct btrfs_root *root,
2346 					struct btrfs_path *path,
2347 					u64 objectid, u64 offset, u64 bytenr,
2348 					bool strict)
2349 {
2350 	struct btrfs_fs_info *fs_info = root->fs_info;
2351 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2352 	struct extent_buffer *leaf;
2353 	struct btrfs_extent_data_ref *ref;
2354 	struct btrfs_extent_inline_ref *iref;
2355 	struct btrfs_extent_item *ei;
2356 	struct btrfs_key key;
2357 	u32 item_size;
2358 	u32 expected_size;
2359 	int type;
2360 	int ret;
2361 
2362 	key.objectid = bytenr;
2363 	key.offset = (u64)-1;
2364 	key.type = BTRFS_EXTENT_ITEM_KEY;
2365 
2366 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2367 	if (ret < 0)
2368 		goto out;
2369 	if (ret == 0) {
2370 		/*
2371 		 * Key with offset -1 found, there would have to exist an extent
2372 		 * item with such offset, but this is out of the valid range.
2373 		 */
2374 		ret = -EUCLEAN;
2375 		goto out;
2376 	}
2377 
2378 	ret = -ENOENT;
2379 	if (path->slots[0] == 0)
2380 		goto out;
2381 
2382 	path->slots[0]--;
2383 	leaf = path->nodes[0];
2384 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2385 
2386 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2387 		goto out;
2388 
2389 	ret = 1;
2390 	item_size = btrfs_item_size(leaf, path->slots[0]);
2391 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2392 	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
2393 
2394 	/* No inline refs; we need to bail before checking for owner ref. */
2395 	if (item_size == sizeof(*ei))
2396 		goto out;
2397 
2398 	/* Check for an owner ref; skip over it to the real inline refs. */
2399 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2400 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2401 	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
2402 		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
2403 		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
2404 	}
2405 
2406 	/* If extent item has more than 1 inline ref then it's shared */
2407 	if (item_size != expected_size)
2408 		goto out;
2409 
2410 	/*
2411 	 * If extent created before last snapshot => it's shared unless the
2412 	 * snapshot has been deleted. Use the heuristic if strict is false.
2413 	 */
2414 	if (!strict &&
2415 	    (btrfs_extent_generation(leaf, ei) <=
2416 	     btrfs_root_last_snapshot(&root->root_item)))
2417 		goto out;
2418 
2419 	/* If this extent has SHARED_DATA_REF then it's shared */
2420 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2421 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2422 		goto out;
2423 
2424 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2425 	if (btrfs_extent_refs(leaf, ei) !=
2426 	    btrfs_extent_data_ref_count(leaf, ref) ||
2427 	    btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
2428 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2429 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2430 		goto out;
2431 
2432 	ret = 0;
2433 out:
2434 	return ret;
2435 }
2436 
2437 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2438 			  u64 bytenr, bool strict, struct btrfs_path *path)
2439 {
2440 	int ret;
2441 
2442 	do {
2443 		ret = check_committed_ref(root, path, objectid,
2444 					  offset, bytenr, strict);
2445 		if (ret && ret != -ENOENT)
2446 			goto out;
2447 
2448 		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2449 	} while (ret == -EAGAIN);
2450 
2451 out:
2452 	btrfs_release_path(path);
2453 	if (btrfs_is_data_reloc_root(root))
2454 		WARN_ON(ret > 0);
2455 	return ret;
2456 }
2457 
2458 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2459 			   struct btrfs_root *root,
2460 			   struct extent_buffer *buf,
2461 			   int full_backref, int inc)
2462 {
2463 	struct btrfs_fs_info *fs_info = root->fs_info;
2464 	u64 parent;
2465 	u64 ref_root;
2466 	u32 nritems;
2467 	struct btrfs_key key;
2468 	struct btrfs_file_extent_item *fi;
2469 	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2470 	int i;
2471 	int action;
2472 	int level;
2473 	int ret = 0;
2474 
2475 	if (btrfs_is_testing(fs_info))
2476 		return 0;
2477 
2478 	ref_root = btrfs_header_owner(buf);
2479 	nritems = btrfs_header_nritems(buf);
2480 	level = btrfs_header_level(buf);
2481 
2482 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2483 		return 0;
2484 
2485 	if (full_backref)
2486 		parent = buf->start;
2487 	else
2488 		parent = 0;
2489 	if (inc)
2490 		action = BTRFS_ADD_DELAYED_REF;
2491 	else
2492 		action = BTRFS_DROP_DELAYED_REF;
2493 
2494 	for (i = 0; i < nritems; i++) {
2495 		struct btrfs_ref ref = {
2496 			.action = action,
2497 			.parent = parent,
2498 			.ref_root = ref_root,
2499 		};
2500 
2501 		if (level == 0) {
2502 			btrfs_item_key_to_cpu(buf, &key, i);
2503 			if (key.type != BTRFS_EXTENT_DATA_KEY)
2504 				continue;
2505 			fi = btrfs_item_ptr(buf, i,
2506 					    struct btrfs_file_extent_item);
2507 			if (btrfs_file_extent_type(buf, fi) ==
2508 			    BTRFS_FILE_EXTENT_INLINE)
2509 				continue;
2510 			ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2511 			if (ref.bytenr == 0)
2512 				continue;
2513 
2514 			ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2515 			ref.owning_root = ref_root;
2516 
2517 			key.offset -= btrfs_file_extent_offset(buf, fi);
2518 			btrfs_init_data_ref(&ref, key.objectid, key.offset,
2519 					    btrfs_root_id(root), for_reloc);
2520 			if (inc)
2521 				ret = btrfs_inc_extent_ref(trans, &ref);
2522 			else
2523 				ret = btrfs_free_extent(trans, &ref);
2524 			if (ret)
2525 				goto fail;
2526 		} else {
2527 			/* We don't know the owning_root, leave as 0. */
2528 			ref.bytenr = btrfs_node_blockptr(buf, i);
2529 			ref.num_bytes = fs_info->nodesize;
2530 
2531 			btrfs_init_tree_ref(&ref, level - 1,
2532 					    btrfs_root_id(root), for_reloc);
2533 			if (inc)
2534 				ret = btrfs_inc_extent_ref(trans, &ref);
2535 			else
2536 				ret = btrfs_free_extent(trans, &ref);
2537 			if (ret)
2538 				goto fail;
2539 		}
2540 	}
2541 	return 0;
2542 fail:
2543 	return ret;
2544 }
2545 
2546 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2547 		  struct extent_buffer *buf, int full_backref)
2548 {
2549 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2550 }
2551 
2552 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2553 		  struct extent_buffer *buf, int full_backref)
2554 {
2555 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2556 }
2557 
2558 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2559 {
2560 	struct btrfs_fs_info *fs_info = root->fs_info;
2561 	u64 flags;
2562 	u64 ret;
2563 
2564 	if (data)
2565 		flags = BTRFS_BLOCK_GROUP_DATA;
2566 	else if (root == fs_info->chunk_root)
2567 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2568 	else
2569 		flags = BTRFS_BLOCK_GROUP_METADATA;
2570 
2571 	ret = btrfs_get_alloc_profile(fs_info, flags);
2572 	return ret;
2573 }
2574 
2575 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2576 {
2577 	struct rb_node *leftmost;
2578 	u64 bytenr = 0;
2579 
2580 	read_lock(&fs_info->block_group_cache_lock);
2581 	/* Get the block group with the lowest logical start address. */
2582 	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2583 	if (leftmost) {
2584 		struct btrfs_block_group *bg;
2585 
2586 		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2587 		bytenr = bg->start;
2588 	}
2589 	read_unlock(&fs_info->block_group_cache_lock);
2590 
2591 	return bytenr;
2592 }
2593 
2594 static int pin_down_extent(struct btrfs_trans_handle *trans,
2595 			   struct btrfs_block_group *cache,
2596 			   u64 bytenr, u64 num_bytes, int reserved)
2597 {
2598 	struct btrfs_fs_info *fs_info = cache->fs_info;
2599 
2600 	spin_lock(&cache->space_info->lock);
2601 	spin_lock(&cache->lock);
2602 	cache->pinned += num_bytes;
2603 	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2604 					     num_bytes);
2605 	if (reserved) {
2606 		cache->reserved -= num_bytes;
2607 		cache->space_info->bytes_reserved -= num_bytes;
2608 	}
2609 	spin_unlock(&cache->lock);
2610 	spin_unlock(&cache->space_info->lock);
2611 
2612 	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2613 		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2614 	return 0;
2615 }
2616 
2617 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2618 		     u64 bytenr, u64 num_bytes, int reserved)
2619 {
2620 	struct btrfs_block_group *cache;
2621 
2622 	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2623 	BUG_ON(!cache); /* Logic error */
2624 
2625 	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2626 
2627 	btrfs_put_block_group(cache);
2628 	return 0;
2629 }
2630 
2631 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2632 				    const struct extent_buffer *eb)
2633 {
2634 	struct btrfs_block_group *cache;
2635 	int ret;
2636 
2637 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2638 	if (!cache)
2639 		return -EINVAL;
2640 
2641 	/*
2642 	 * Fully cache the free space first so that our pin removes the free space
2643 	 * from the cache.
2644 	 */
2645 	ret = btrfs_cache_block_group(cache, true);
2646 	if (ret)
2647 		goto out;
2648 
2649 	pin_down_extent(trans, cache, eb->start, eb->len, 0);
2650 
2651 	/* remove us from the free space cache (if we're there at all) */
2652 	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2653 out:
2654 	btrfs_put_block_group(cache);
2655 	return ret;
2656 }
2657 
2658 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2659 				   u64 start, u64 num_bytes)
2660 {
2661 	int ret;
2662 	struct btrfs_block_group *block_group;
2663 
2664 	block_group = btrfs_lookup_block_group(fs_info, start);
2665 	if (!block_group)
2666 		return -EINVAL;
2667 
2668 	ret = btrfs_cache_block_group(block_group, true);
2669 	if (ret)
2670 		goto out;
2671 
2672 	ret = btrfs_remove_free_space(block_group, start, num_bytes);
2673 out:
2674 	btrfs_put_block_group(block_group);
2675 	return ret;
2676 }
2677 
2678 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2679 {
2680 	struct btrfs_fs_info *fs_info = eb->fs_info;
2681 	struct btrfs_file_extent_item *item;
2682 	struct btrfs_key key;
2683 	int found_type;
2684 	int i;
2685 	int ret = 0;
2686 
2687 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2688 		return 0;
2689 
2690 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2691 		btrfs_item_key_to_cpu(eb, &key, i);
2692 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2693 			continue;
2694 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2695 		found_type = btrfs_file_extent_type(eb, item);
2696 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2697 			continue;
2698 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2699 			continue;
2700 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2701 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2702 		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2703 		if (ret)
2704 			break;
2705 	}
2706 
2707 	return ret;
2708 }
2709 
2710 static void
2711 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2712 {
2713 	atomic_inc(&bg->reservations);
2714 }
2715 
2716 /*
2717  * Returns the free cluster for the given space info and sets empty_cluster to
2718  * what it should be based on the mount options.
2719  */
2720 static struct btrfs_free_cluster *
2721 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2722 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2723 {
2724 	struct btrfs_free_cluster *ret = NULL;
2725 
2726 	*empty_cluster = 0;
2727 	if (btrfs_mixed_space_info(space_info))
2728 		return ret;
2729 
2730 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2731 		ret = &fs_info->meta_alloc_cluster;
2732 		if (btrfs_test_opt(fs_info, SSD))
2733 			*empty_cluster = SZ_2M;
2734 		else
2735 			*empty_cluster = SZ_64K;
2736 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2737 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2738 		*empty_cluster = SZ_2M;
2739 		ret = &fs_info->data_alloc_cluster;
2740 	}
2741 
2742 	return ret;
2743 }
2744 
2745 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2746 			      u64 start, u64 end,
2747 			      const bool return_free_space)
2748 {
2749 	struct btrfs_block_group *cache = NULL;
2750 	struct btrfs_space_info *space_info;
2751 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2752 	struct btrfs_free_cluster *cluster = NULL;
2753 	u64 len;
2754 	u64 total_unpinned = 0;
2755 	u64 empty_cluster = 0;
2756 	bool readonly;
2757 	int ret = 0;
2758 
2759 	while (start <= end) {
2760 		readonly = false;
2761 		if (!cache ||
2762 		    start >= cache->start + cache->length) {
2763 			if (cache)
2764 				btrfs_put_block_group(cache);
2765 			total_unpinned = 0;
2766 			cache = btrfs_lookup_block_group(fs_info, start);
2767 			if (cache == NULL) {
2768 				/* Logic error, something removed the block group. */
2769 				ret = -EUCLEAN;
2770 				goto out;
2771 			}
2772 
2773 			cluster = fetch_cluster_info(fs_info,
2774 						     cache->space_info,
2775 						     &empty_cluster);
2776 			empty_cluster <<= 1;
2777 		}
2778 
2779 		len = cache->start + cache->length - start;
2780 		len = min(len, end + 1 - start);
2781 
2782 		if (return_free_space)
2783 			btrfs_add_free_space(cache, start, len);
2784 
2785 		start += len;
2786 		total_unpinned += len;
2787 		space_info = cache->space_info;
2788 
2789 		/*
2790 		 * If this space cluster has been marked as fragmented and we've
2791 		 * unpinned enough in this block group to potentially allow a
2792 		 * cluster to be created inside of it go ahead and clear the
2793 		 * fragmented check.
2794 		 */
2795 		if (cluster && cluster->fragmented &&
2796 		    total_unpinned > empty_cluster) {
2797 			spin_lock(&cluster->lock);
2798 			cluster->fragmented = 0;
2799 			spin_unlock(&cluster->lock);
2800 		}
2801 
2802 		spin_lock(&space_info->lock);
2803 		spin_lock(&cache->lock);
2804 		cache->pinned -= len;
2805 		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2806 		space_info->max_extent_size = 0;
2807 		if (cache->ro) {
2808 			space_info->bytes_readonly += len;
2809 			readonly = true;
2810 		} else if (btrfs_is_zoned(fs_info)) {
2811 			/* Need reset before reusing in a zoned block group */
2812 			btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2813 								    len);
2814 			readonly = true;
2815 		}
2816 		spin_unlock(&cache->lock);
2817 		if (!readonly && return_free_space &&
2818 		    global_rsv->space_info == space_info) {
2819 			spin_lock(&global_rsv->lock);
2820 			if (!global_rsv->full) {
2821 				u64 to_add = min(len, global_rsv->size -
2822 						      global_rsv->reserved);
2823 
2824 				global_rsv->reserved += to_add;
2825 				btrfs_space_info_update_bytes_may_use(fs_info,
2826 						space_info, to_add);
2827 				if (global_rsv->reserved >= global_rsv->size)
2828 					global_rsv->full = 1;
2829 				len -= to_add;
2830 			}
2831 			spin_unlock(&global_rsv->lock);
2832 		}
2833 		/* Add to any tickets we may have */
2834 		if (!readonly && return_free_space && len)
2835 			btrfs_try_granting_tickets(fs_info, space_info);
2836 		spin_unlock(&space_info->lock);
2837 	}
2838 
2839 	if (cache)
2840 		btrfs_put_block_group(cache);
2841 out:
2842 	return ret;
2843 }
2844 
2845 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2846 {
2847 	struct btrfs_fs_info *fs_info = trans->fs_info;
2848 	struct btrfs_block_group *block_group, *tmp;
2849 	struct list_head *deleted_bgs;
2850 	struct extent_io_tree *unpin;
2851 	u64 start;
2852 	u64 end;
2853 	int ret;
2854 
2855 	unpin = &trans->transaction->pinned_extents;
2856 
2857 	while (!TRANS_ABORTED(trans)) {
2858 		struct extent_state *cached_state = NULL;
2859 
2860 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2861 		if (!find_first_extent_bit(unpin, 0, &start, &end,
2862 					   EXTENT_DIRTY, &cached_state)) {
2863 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2864 			break;
2865 		}
2866 
2867 		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2868 			ret = btrfs_discard_extent(fs_info, start,
2869 						   end + 1 - start, NULL);
2870 
2871 		clear_extent_dirty(unpin, start, end, &cached_state);
2872 		ret = unpin_extent_range(fs_info, start, end, true);
2873 		BUG_ON(ret);
2874 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2875 		free_extent_state(cached_state);
2876 		cond_resched();
2877 	}
2878 
2879 	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2880 		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2881 		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2882 	}
2883 
2884 	/*
2885 	 * Transaction is finished.  We don't need the lock anymore.  We
2886 	 * do need to clean up the block groups in case of a transaction
2887 	 * abort.
2888 	 */
2889 	deleted_bgs = &trans->transaction->deleted_bgs;
2890 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2891 		u64 trimmed = 0;
2892 
2893 		ret = -EROFS;
2894 		if (!TRANS_ABORTED(trans))
2895 			ret = btrfs_discard_extent(fs_info,
2896 						   block_group->start,
2897 						   block_group->length,
2898 						   &trimmed);
2899 
2900 		list_del_init(&block_group->bg_list);
2901 		btrfs_unfreeze_block_group(block_group);
2902 		btrfs_put_block_group(block_group);
2903 
2904 		if (ret) {
2905 			const char *errstr = btrfs_decode_error(ret);
2906 			btrfs_warn(fs_info,
2907 			   "discard failed while removing blockgroup: errno=%d %s",
2908 				   ret, errstr);
2909 		}
2910 	}
2911 
2912 	return 0;
2913 }
2914 
2915 /*
2916  * Parse an extent item's inline extents looking for a simple quotas owner ref.
2917  *
2918  * @fs_info:	the btrfs_fs_info for this mount
2919  * @leaf:	a leaf in the extent tree containing the extent item
2920  * @slot:	the slot in the leaf where the extent item is found
2921  *
2922  * Returns the objectid of the root that originally allocated the extent item
2923  * if the inline owner ref is expected and present, otherwise 0.
2924  *
2925  * If an extent item has an owner ref item, it will be the first inline ref
2926  * item. Therefore the logic is to check whether there are any inline ref
2927  * items, then check the type of the first one.
2928  */
2929 u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
2930 				struct extent_buffer *leaf, int slot)
2931 {
2932 	struct btrfs_extent_item *ei;
2933 	struct btrfs_extent_inline_ref *iref;
2934 	struct btrfs_extent_owner_ref *oref;
2935 	unsigned long ptr;
2936 	unsigned long end;
2937 	int type;
2938 
2939 	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
2940 		return 0;
2941 
2942 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
2943 	ptr = (unsigned long)(ei + 1);
2944 	end = (unsigned long)ei + btrfs_item_size(leaf, slot);
2945 
2946 	/* No inline ref items of any kind, can't check type. */
2947 	if (ptr == end)
2948 		return 0;
2949 
2950 	iref = (struct btrfs_extent_inline_ref *)ptr;
2951 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
2952 
2953 	/* We found an owner ref, get the root out of it. */
2954 	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
2955 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
2956 		return btrfs_extent_owner_ref_root_id(leaf, oref);
2957 	}
2958 
2959 	/* We have inline refs, but not an owner ref. */
2960 	return 0;
2961 }
2962 
2963 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2964 				     u64 bytenr, struct btrfs_squota_delta *delta)
2965 {
2966 	int ret;
2967 	u64 num_bytes = delta->num_bytes;
2968 
2969 	if (delta->is_data) {
2970 		struct btrfs_root *csum_root;
2971 
2972 		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2973 		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2974 		if (ret) {
2975 			btrfs_abort_transaction(trans, ret);
2976 			return ret;
2977 		}
2978 
2979 		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
2980 		if (ret) {
2981 			btrfs_abort_transaction(trans, ret);
2982 			return ret;
2983 		}
2984 	}
2985 
2986 	ret = btrfs_record_squota_delta(trans->fs_info, delta);
2987 	if (ret) {
2988 		btrfs_abort_transaction(trans, ret);
2989 		return ret;
2990 	}
2991 
2992 	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2993 	if (ret) {
2994 		btrfs_abort_transaction(trans, ret);
2995 		return ret;
2996 	}
2997 
2998 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2999 	if (ret)
3000 		btrfs_abort_transaction(trans, ret);
3001 
3002 	return ret;
3003 }
3004 
3005 #define abort_and_dump(trans, path, fmt, args...)	\
3006 ({							\
3007 	btrfs_abort_transaction(trans, -EUCLEAN);	\
3008 	btrfs_print_leaf(path->nodes[0]);		\
3009 	btrfs_crit(trans->fs_info, fmt, ##args);	\
3010 })
3011 
3012 /*
3013  * Drop one or more refs of @node.
3014  *
3015  * 1. Locate the extent refs.
3016  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
3017  *    Locate it, then reduce the refs number or remove the ref line completely.
3018  *
3019  * 2. Update the refs count in EXTENT/METADATA_ITEM
3020  *
3021  * Inline backref case:
3022  *
3023  * in extent tree we have:
3024  *
3025  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3026  *		refs 2 gen 6 flags DATA
3027  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3028  *		extent data backref root FS_TREE objectid 257 offset 0 count 1
3029  *
3030  * This function gets called with:
3031  *
3032  *    node->bytenr = 13631488
3033  *    node->num_bytes = 1048576
3034  *    root_objectid = FS_TREE
3035  *    owner_objectid = 257
3036  *    owner_offset = 0
3037  *    refs_to_drop = 1
3038  *
3039  * Then we should get some like:
3040  *
3041  * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
3042  *		refs 1 gen 6 flags DATA
3043  *		extent data backref root FS_TREE objectid 258 offset 0 count 1
3044  *
3045  * Keyed backref case:
3046  *
3047  * in extent tree we have:
3048  *
3049  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3050  *		refs 754 gen 6 flags DATA
3051  *	[...]
3052  *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
3053  *		extent data backref root FS_TREE objectid 866 offset 0 count 1
3054  *
3055  * This function get called with:
3056  *
3057  *    node->bytenr = 13631488
3058  *    node->num_bytes = 1048576
3059  *    root_objectid = FS_TREE
3060  *    owner_objectid = 866
3061  *    owner_offset = 0
3062  *    refs_to_drop = 1
3063  *
3064  * Then we should get some like:
3065  *
3066  *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
3067  *		refs 753 gen 6 flags DATA
3068  *
3069  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
3070  */
3071 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3072 			       struct btrfs_delayed_ref_head *href,
3073 			       struct btrfs_delayed_ref_node *node,
3074 			       struct btrfs_delayed_extent_op *extent_op)
3075 {
3076 	struct btrfs_fs_info *info = trans->fs_info;
3077 	struct btrfs_key key;
3078 	struct btrfs_path *path;
3079 	struct btrfs_root *extent_root;
3080 	struct extent_buffer *leaf;
3081 	struct btrfs_extent_item *ei;
3082 	struct btrfs_extent_inline_ref *iref;
3083 	int ret;
3084 	int is_data;
3085 	int extent_slot = 0;
3086 	int found_extent = 0;
3087 	int num_to_del = 1;
3088 	int refs_to_drop = node->ref_mod;
3089 	u32 item_size;
3090 	u64 refs;
3091 	u64 bytenr = node->bytenr;
3092 	u64 num_bytes = node->num_bytes;
3093 	u64 owner_objectid = btrfs_delayed_ref_owner(node);
3094 	u64 owner_offset = btrfs_delayed_ref_offset(node);
3095 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3096 	u64 delayed_ref_root = href->owning_root;
3097 
3098 	extent_root = btrfs_extent_root(info, bytenr);
3099 	ASSERT(extent_root);
3100 
3101 	path = btrfs_alloc_path();
3102 	if (!path)
3103 		return -ENOMEM;
3104 
3105 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3106 
3107 	if (!is_data && refs_to_drop != 1) {
3108 		btrfs_crit(info,
3109 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3110 			   node->bytenr, refs_to_drop);
3111 		ret = -EINVAL;
3112 		btrfs_abort_transaction(trans, ret);
3113 		goto out;
3114 	}
3115 
3116 	if (is_data)
3117 		skinny_metadata = false;
3118 
3119 	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3120 				    node->parent, node->ref_root, owner_objectid,
3121 				    owner_offset);
3122 	if (ret == 0) {
3123 		/*
3124 		 * Either the inline backref or the SHARED_DATA_REF/
3125 		 * SHARED_BLOCK_REF is found
3126 		 *
3127 		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3128 		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3129 		 */
3130 		extent_slot = path->slots[0];
3131 		while (extent_slot >= 0) {
3132 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3133 					      extent_slot);
3134 			if (key.objectid != bytenr)
3135 				break;
3136 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3137 			    key.offset == num_bytes) {
3138 				found_extent = 1;
3139 				break;
3140 			}
3141 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3142 			    key.offset == owner_objectid) {
3143 				found_extent = 1;
3144 				break;
3145 			}
3146 
3147 			/* Quick path didn't find the EXTEMT/METADATA_ITEM */
3148 			if (path->slots[0] - extent_slot > 5)
3149 				break;
3150 			extent_slot--;
3151 		}
3152 
3153 		if (!found_extent) {
3154 			if (iref) {
3155 				abort_and_dump(trans, path,
3156 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3157 					   path->slots[0]);
3158 				ret = -EUCLEAN;
3159 				goto out;
3160 			}
3161 			/* Must be SHARED_* item, remove the backref first */
3162 			ret = remove_extent_backref(trans, extent_root, path,
3163 						    NULL, refs_to_drop, is_data);
3164 			if (ret) {
3165 				btrfs_abort_transaction(trans, ret);
3166 				goto out;
3167 			}
3168 			btrfs_release_path(path);
3169 
3170 			/* Slow path to locate EXTENT/METADATA_ITEM */
3171 			key.objectid = bytenr;
3172 			key.type = BTRFS_EXTENT_ITEM_KEY;
3173 			key.offset = num_bytes;
3174 
3175 			if (!is_data && skinny_metadata) {
3176 				key.type = BTRFS_METADATA_ITEM_KEY;
3177 				key.offset = owner_objectid;
3178 			}
3179 
3180 			ret = btrfs_search_slot(trans, extent_root,
3181 						&key, path, -1, 1);
3182 			if (ret > 0 && skinny_metadata && path->slots[0]) {
3183 				/*
3184 				 * Couldn't find our skinny metadata item,
3185 				 * see if we have ye olde extent item.
3186 				 */
3187 				path->slots[0]--;
3188 				btrfs_item_key_to_cpu(path->nodes[0], &key,
3189 						      path->slots[0]);
3190 				if (key.objectid == bytenr &&
3191 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3192 				    key.offset == num_bytes)
3193 					ret = 0;
3194 			}
3195 
3196 			if (ret > 0 && skinny_metadata) {
3197 				skinny_metadata = false;
3198 				key.objectid = bytenr;
3199 				key.type = BTRFS_EXTENT_ITEM_KEY;
3200 				key.offset = num_bytes;
3201 				btrfs_release_path(path);
3202 				ret = btrfs_search_slot(trans, extent_root,
3203 							&key, path, -1, 1);
3204 			}
3205 
3206 			if (ret) {
3207 				if (ret > 0)
3208 					btrfs_print_leaf(path->nodes[0]);
3209 				btrfs_err(info,
3210 			"umm, got %d back from search, was looking for %llu, slot %d",
3211 					  ret, bytenr, path->slots[0]);
3212 			}
3213 			if (ret < 0) {
3214 				btrfs_abort_transaction(trans, ret);
3215 				goto out;
3216 			}
3217 			extent_slot = path->slots[0];
3218 		}
3219 	} else if (WARN_ON(ret == -ENOENT)) {
3220 		abort_and_dump(trans, path,
3221 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3222 			       bytenr, node->parent, node->ref_root, owner_objectid,
3223 			       owner_offset, path->slots[0]);
3224 		goto out;
3225 	} else {
3226 		btrfs_abort_transaction(trans, ret);
3227 		goto out;
3228 	}
3229 
3230 	leaf = path->nodes[0];
3231 	item_size = btrfs_item_size(leaf, extent_slot);
3232 	if (unlikely(item_size < sizeof(*ei))) {
3233 		ret = -EUCLEAN;
3234 		btrfs_err(trans->fs_info,
3235 			  "unexpected extent item size, has %u expect >= %zu",
3236 			  item_size, sizeof(*ei));
3237 		btrfs_abort_transaction(trans, ret);
3238 		goto out;
3239 	}
3240 	ei = btrfs_item_ptr(leaf, extent_slot,
3241 			    struct btrfs_extent_item);
3242 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3243 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3244 		struct btrfs_tree_block_info *bi;
3245 
3246 		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3247 			abort_and_dump(trans, path,
3248 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3249 				       key.objectid, key.type, key.offset,
3250 				       path->slots[0], owner_objectid, item_size,
3251 				       sizeof(*ei) + sizeof(*bi));
3252 			ret = -EUCLEAN;
3253 			goto out;
3254 		}
3255 		bi = (struct btrfs_tree_block_info *)(ei + 1);
3256 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3257 	}
3258 
3259 	refs = btrfs_extent_refs(leaf, ei);
3260 	if (refs < refs_to_drop) {
3261 		abort_and_dump(trans, path,
3262 		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3263 			       refs_to_drop, refs, bytenr, path->slots[0]);
3264 		ret = -EUCLEAN;
3265 		goto out;
3266 	}
3267 	refs -= refs_to_drop;
3268 
3269 	if (refs > 0) {
3270 		if (extent_op)
3271 			__run_delayed_extent_op(extent_op, leaf, ei);
3272 		/*
3273 		 * In the case of inline back ref, reference count will
3274 		 * be updated by remove_extent_backref
3275 		 */
3276 		if (iref) {
3277 			if (!found_extent) {
3278 				abort_and_dump(trans, path,
3279 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3280 					       path->slots[0]);
3281 				ret = -EUCLEAN;
3282 				goto out;
3283 			}
3284 		} else {
3285 			btrfs_set_extent_refs(leaf, ei, refs);
3286 			btrfs_mark_buffer_dirty(trans, leaf);
3287 		}
3288 		if (found_extent) {
3289 			ret = remove_extent_backref(trans, extent_root, path,
3290 						    iref, refs_to_drop, is_data);
3291 			if (ret) {
3292 				btrfs_abort_transaction(trans, ret);
3293 				goto out;
3294 			}
3295 		}
3296 	} else {
3297 		struct btrfs_squota_delta delta = {
3298 			.root = delayed_ref_root,
3299 			.num_bytes = num_bytes,
3300 			.is_data = is_data,
3301 			.is_inc = false,
3302 			.generation = btrfs_extent_generation(leaf, ei),
3303 		};
3304 
3305 		/* In this branch refs == 1 */
3306 		if (found_extent) {
3307 			if (is_data && refs_to_drop !=
3308 			    extent_data_ref_count(path, iref)) {
3309 				abort_and_dump(trans, path,
3310 		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3311 					       extent_data_ref_count(path, iref),
3312 					       refs_to_drop, path->slots[0]);
3313 				ret = -EUCLEAN;
3314 				goto out;
3315 			}
3316 			if (iref) {
3317 				if (path->slots[0] != extent_slot) {
3318 					abort_and_dump(trans, path,
3319 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3320 						       key.objectid, key.type,
3321 						       key.offset, path->slots[0]);
3322 					ret = -EUCLEAN;
3323 					goto out;
3324 				}
3325 			} else {
3326 				/*
3327 				 * No inline ref, we must be at SHARED_* item,
3328 				 * And it's single ref, it must be:
3329 				 * |	extent_slot	  ||extent_slot + 1|
3330 				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3331 				 */
3332 				if (path->slots[0] != extent_slot + 1) {
3333 					abort_and_dump(trans, path,
3334 	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3335 						       path->slots[0]);
3336 					ret = -EUCLEAN;
3337 					goto out;
3338 				}
3339 				path->slots[0] = extent_slot;
3340 				num_to_del = 2;
3341 			}
3342 		}
3343 		/*
3344 		 * We can't infer the data owner from the delayed ref, so we need
3345 		 * to try to get it from the owning ref item.
3346 		 *
3347 		 * If it is not present, then that extent was not written under
3348 		 * simple quotas mode, so we don't need to account for its deletion.
3349 		 */
3350 		if (is_data)
3351 			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
3352 								 leaf, extent_slot);
3353 
3354 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3355 				      num_to_del);
3356 		if (ret) {
3357 			btrfs_abort_transaction(trans, ret);
3358 			goto out;
3359 		}
3360 		btrfs_release_path(path);
3361 
3362 		ret = do_free_extent_accounting(trans, bytenr, &delta);
3363 	}
3364 	btrfs_release_path(path);
3365 
3366 out:
3367 	btrfs_free_path(path);
3368 	return ret;
3369 }
3370 
3371 /*
3372  * when we free an block, it is possible (and likely) that we free the last
3373  * delayed ref for that extent as well.  This searches the delayed ref tree for
3374  * a given extent, and if there are no other delayed refs to be processed, it
3375  * removes it from the tree.
3376  */
3377 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3378 				      u64 bytenr)
3379 {
3380 	struct btrfs_delayed_ref_head *head;
3381 	struct btrfs_delayed_ref_root *delayed_refs;
3382 	int ret = 0;
3383 
3384 	delayed_refs = &trans->transaction->delayed_refs;
3385 	spin_lock(&delayed_refs->lock);
3386 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3387 	if (!head)
3388 		goto out_delayed_unlock;
3389 
3390 	spin_lock(&head->lock);
3391 	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3392 		goto out;
3393 
3394 	if (cleanup_extent_op(head) != NULL)
3395 		goto out;
3396 
3397 	/*
3398 	 * waiting for the lock here would deadlock.  If someone else has it
3399 	 * locked they are already in the process of dropping it anyway
3400 	 */
3401 	if (!mutex_trylock(&head->mutex))
3402 		goto out;
3403 
3404 	btrfs_delete_ref_head(delayed_refs, head);
3405 	head->processing = false;
3406 
3407 	spin_unlock(&head->lock);
3408 	spin_unlock(&delayed_refs->lock);
3409 
3410 	BUG_ON(head->extent_op);
3411 	if (head->must_insert_reserved)
3412 		ret = 1;
3413 
3414 	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3415 	mutex_unlock(&head->mutex);
3416 	btrfs_put_delayed_ref_head(head);
3417 	return ret;
3418 out:
3419 	spin_unlock(&head->lock);
3420 
3421 out_delayed_unlock:
3422 	spin_unlock(&delayed_refs->lock);
3423 	return 0;
3424 }
3425 
3426 int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3427 			  u64 root_id,
3428 			  struct extent_buffer *buf,
3429 			  u64 parent, int last_ref)
3430 {
3431 	struct btrfs_fs_info *fs_info = trans->fs_info;
3432 	struct btrfs_block_group *bg;
3433 	int ret;
3434 
3435 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3436 		struct btrfs_ref generic_ref = {
3437 			.action = BTRFS_DROP_DELAYED_REF,
3438 			.bytenr = buf->start,
3439 			.num_bytes = buf->len,
3440 			.parent = parent,
3441 			.owning_root = btrfs_header_owner(buf),
3442 			.ref_root = root_id,
3443 		};
3444 
3445 		/*
3446 		 * Assert that the extent buffer is not cleared due to
3447 		 * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer
3448 		 * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for
3449 		 * detail.
3450 		 */
3451 		ASSERT(btrfs_header_bytenr(buf) != 0);
3452 
3453 		btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
3454 		btrfs_ref_tree_mod(fs_info, &generic_ref);
3455 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3456 		if (ret < 0)
3457 			return ret;
3458 	}
3459 
3460 	if (!last_ref)
3461 		return 0;
3462 
3463 	if (btrfs_header_generation(buf) != trans->transid)
3464 		goto out;
3465 
3466 	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3467 		ret = check_ref_cleanup(trans, buf->start);
3468 		if (!ret)
3469 			goto out;
3470 	}
3471 
3472 	bg = btrfs_lookup_block_group(fs_info, buf->start);
3473 
3474 	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3475 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3476 		btrfs_put_block_group(bg);
3477 		goto out;
3478 	}
3479 
3480 	/*
3481 	 * If there are tree mod log users we may have recorded mod log
3482 	 * operations for this node.  If we re-allocate this node we
3483 	 * could replay operations on this node that happened when it
3484 	 * existed in a completely different root.  For example if it
3485 	 * was part of root A, then was reallocated to root B, and we
3486 	 * are doing a btrfs_old_search_slot(root b), we could replay
3487 	 * operations that happened when the block was part of root A,
3488 	 * giving us an inconsistent view of the btree.
3489 	 *
3490 	 * We are safe from races here because at this point no other
3491 	 * node or root points to this extent buffer, so if after this
3492 	 * check a new tree mod log user joins we will not have an
3493 	 * existing log of operations on this node that we have to
3494 	 * contend with.
3495 	 */
3496 
3497 	if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
3498 		     || btrfs_is_zoned(fs_info)) {
3499 		pin_down_extent(trans, bg, buf->start, buf->len, 1);
3500 		btrfs_put_block_group(bg);
3501 		goto out;
3502 	}
3503 
3504 	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3505 
3506 	btrfs_add_free_space(bg, buf->start, buf->len);
3507 	btrfs_free_reserved_bytes(bg, buf->len, 0);
3508 	btrfs_put_block_group(bg);
3509 	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3510 
3511 out:
3512 
3513 	/*
3514 	 * Deleting the buffer, clear the corrupt flag since it doesn't
3515 	 * matter anymore.
3516 	 */
3517 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3518 	return 0;
3519 }
3520 
3521 /* Can return -ENOMEM */
3522 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3523 {
3524 	struct btrfs_fs_info *fs_info = trans->fs_info;
3525 	int ret;
3526 
3527 	if (btrfs_is_testing(fs_info))
3528 		return 0;
3529 
3530 	/*
3531 	 * tree log blocks never actually go into the extent allocation
3532 	 * tree, just update pinning info and exit early.
3533 	 */
3534 	if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
3535 		btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
3536 		ret = 0;
3537 	} else if (ref->type == BTRFS_REF_METADATA) {
3538 		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3539 	} else {
3540 		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3541 	}
3542 
3543 	if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID)
3544 		btrfs_ref_tree_mod(fs_info, ref);
3545 
3546 	return ret;
3547 }
3548 
3549 enum btrfs_loop_type {
3550 	/*
3551 	 * Start caching block groups but do not wait for progress or for them
3552 	 * to be done.
3553 	 */
3554 	LOOP_CACHING_NOWAIT,
3555 
3556 	/*
3557 	 * Wait for the block group free_space >= the space we're waiting for if
3558 	 * the block group isn't cached.
3559 	 */
3560 	LOOP_CACHING_WAIT,
3561 
3562 	/*
3563 	 * Allow allocations to happen from block groups that do not yet have a
3564 	 * size classification.
3565 	 */
3566 	LOOP_UNSET_SIZE_CLASS,
3567 
3568 	/*
3569 	 * Allocate a chunk and then retry the allocation.
3570 	 */
3571 	LOOP_ALLOC_CHUNK,
3572 
3573 	/*
3574 	 * Ignore the size class restrictions for this allocation.
3575 	 */
3576 	LOOP_WRONG_SIZE_CLASS,
3577 
3578 	/*
3579 	 * Ignore the empty size, only try to allocate the number of bytes
3580 	 * needed for this allocation.
3581 	 */
3582 	LOOP_NO_EMPTY_SIZE,
3583 };
3584 
3585 static inline void
3586 btrfs_lock_block_group(struct btrfs_block_group *cache,
3587 		       int delalloc)
3588 {
3589 	if (delalloc)
3590 		down_read(&cache->data_rwsem);
3591 }
3592 
3593 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3594 		       int delalloc)
3595 {
3596 	btrfs_get_block_group(cache);
3597 	if (delalloc)
3598 		down_read(&cache->data_rwsem);
3599 }
3600 
3601 static struct btrfs_block_group *btrfs_lock_cluster(
3602 		   struct btrfs_block_group *block_group,
3603 		   struct btrfs_free_cluster *cluster,
3604 		   int delalloc)
3605 	__acquires(&cluster->refill_lock)
3606 {
3607 	struct btrfs_block_group *used_bg = NULL;
3608 
3609 	spin_lock(&cluster->refill_lock);
3610 	while (1) {
3611 		used_bg = cluster->block_group;
3612 		if (!used_bg)
3613 			return NULL;
3614 
3615 		if (used_bg == block_group)
3616 			return used_bg;
3617 
3618 		btrfs_get_block_group(used_bg);
3619 
3620 		if (!delalloc)
3621 			return used_bg;
3622 
3623 		if (down_read_trylock(&used_bg->data_rwsem))
3624 			return used_bg;
3625 
3626 		spin_unlock(&cluster->refill_lock);
3627 
3628 		/* We should only have one-level nested. */
3629 		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3630 
3631 		spin_lock(&cluster->refill_lock);
3632 		if (used_bg == cluster->block_group)
3633 			return used_bg;
3634 
3635 		up_read(&used_bg->data_rwsem);
3636 		btrfs_put_block_group(used_bg);
3637 	}
3638 }
3639 
3640 static inline void
3641 btrfs_release_block_group(struct btrfs_block_group *cache,
3642 			 int delalloc)
3643 {
3644 	if (delalloc)
3645 		up_read(&cache->data_rwsem);
3646 	btrfs_put_block_group(cache);
3647 }
3648 
3649 /*
3650  * Helper function for find_free_extent().
3651  *
3652  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3653  * Return >0 to inform caller that we find nothing
3654  * Return 0 means we have found a location and set ffe_ctl->found_offset.
3655  */
3656 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3657 				      struct find_free_extent_ctl *ffe_ctl,
3658 				      struct btrfs_block_group **cluster_bg_ret)
3659 {
3660 	struct btrfs_block_group *cluster_bg;
3661 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3662 	u64 aligned_cluster;
3663 	u64 offset;
3664 	int ret;
3665 
3666 	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3667 	if (!cluster_bg)
3668 		goto refill_cluster;
3669 	if (cluster_bg != bg && (cluster_bg->ro ||
3670 	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
3671 		goto release_cluster;
3672 
3673 	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3674 			ffe_ctl->num_bytes, cluster_bg->start,
3675 			&ffe_ctl->max_extent_size);
3676 	if (offset) {
3677 		/* We have a block, we're done */
3678 		spin_unlock(&last_ptr->refill_lock);
3679 		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3680 		*cluster_bg_ret = cluster_bg;
3681 		ffe_ctl->found_offset = offset;
3682 		return 0;
3683 	}
3684 	WARN_ON(last_ptr->block_group != cluster_bg);
3685 
3686 release_cluster:
3687 	/*
3688 	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
3689 	 * lets just skip it and let the allocator find whatever block it can
3690 	 * find. If we reach this point, we will have tried the cluster
3691 	 * allocator plenty of times and not have found anything, so we are
3692 	 * likely way too fragmented for the clustering stuff to find anything.
3693 	 *
3694 	 * However, if the cluster is taken from the current block group,
3695 	 * release the cluster first, so that we stand a better chance of
3696 	 * succeeding in the unclustered allocation.
3697 	 */
3698 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3699 		spin_unlock(&last_ptr->refill_lock);
3700 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3701 		return -ENOENT;
3702 	}
3703 
3704 	/* This cluster didn't work out, free it and start over */
3705 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3706 
3707 	if (cluster_bg != bg)
3708 		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3709 
3710 refill_cluster:
3711 	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3712 		spin_unlock(&last_ptr->refill_lock);
3713 		return -ENOENT;
3714 	}
3715 
3716 	aligned_cluster = max_t(u64,
3717 			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3718 			bg->full_stripe_len);
3719 	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3720 			ffe_ctl->num_bytes, aligned_cluster);
3721 	if (ret == 0) {
3722 		/* Now pull our allocation out of this cluster */
3723 		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3724 				ffe_ctl->num_bytes, ffe_ctl->search_start,
3725 				&ffe_ctl->max_extent_size);
3726 		if (offset) {
3727 			/* We found one, proceed */
3728 			spin_unlock(&last_ptr->refill_lock);
3729 			ffe_ctl->found_offset = offset;
3730 			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3731 			return 0;
3732 		}
3733 	}
3734 	/*
3735 	 * At this point we either didn't find a cluster or we weren't able to
3736 	 * allocate a block from our cluster.  Free the cluster we've been
3737 	 * trying to use, and go to the next block group.
3738 	 */
3739 	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3740 	spin_unlock(&last_ptr->refill_lock);
3741 	return 1;
3742 }
3743 
3744 /*
3745  * Return >0 to inform caller that we find nothing
3746  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
3747  */
3748 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3749 					struct find_free_extent_ctl *ffe_ctl)
3750 {
3751 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3752 	u64 offset;
3753 
3754 	/*
3755 	 * We are doing an unclustered allocation, set the fragmented flag so
3756 	 * we don't bother trying to setup a cluster again until we get more
3757 	 * space.
3758 	 */
3759 	if (unlikely(last_ptr)) {
3760 		spin_lock(&last_ptr->lock);
3761 		last_ptr->fragmented = 1;
3762 		spin_unlock(&last_ptr->lock);
3763 	}
3764 	if (ffe_ctl->cached) {
3765 		struct btrfs_free_space_ctl *free_space_ctl;
3766 
3767 		free_space_ctl = bg->free_space_ctl;
3768 		spin_lock(&free_space_ctl->tree_lock);
3769 		if (free_space_ctl->free_space <
3770 		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3771 		    ffe_ctl->empty_size) {
3772 			ffe_ctl->total_free_space = max_t(u64,
3773 					ffe_ctl->total_free_space,
3774 					free_space_ctl->free_space);
3775 			spin_unlock(&free_space_ctl->tree_lock);
3776 			return 1;
3777 		}
3778 		spin_unlock(&free_space_ctl->tree_lock);
3779 	}
3780 
3781 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3782 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3783 			&ffe_ctl->max_extent_size);
3784 	if (!offset)
3785 		return 1;
3786 	ffe_ctl->found_offset = offset;
3787 	return 0;
3788 }
3789 
3790 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3791 				   struct find_free_extent_ctl *ffe_ctl,
3792 				   struct btrfs_block_group **bg_ret)
3793 {
3794 	int ret;
3795 
3796 	/* We want to try and use the cluster allocator, so lets look there */
3797 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3798 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3799 		if (ret >= 0)
3800 			return ret;
3801 		/* ret == -ENOENT case falls through */
3802 	}
3803 
3804 	return find_free_extent_unclustered(block_group, ffe_ctl);
3805 }
3806 
3807 /*
3808  * Tree-log block group locking
3809  * ============================
3810  *
3811  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3812  * indicates the starting address of a block group, which is reserved only
3813  * for tree-log metadata.
3814  *
3815  * Lock nesting
3816  * ============
3817  *
3818  * space_info::lock
3819  *   block_group::lock
3820  *     fs_info::treelog_bg_lock
3821  */
3822 
3823 /*
3824  * Simple allocator for sequential-only block group. It only allows sequential
3825  * allocation. No need to play with trees. This function also reserves the
3826  * bytes as in btrfs_add_reserved_bytes.
3827  */
3828 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3829 			       struct find_free_extent_ctl *ffe_ctl,
3830 			       struct btrfs_block_group **bg_ret)
3831 {
3832 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3833 	struct btrfs_space_info *space_info = block_group->space_info;
3834 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3835 	u64 start = block_group->start;
3836 	u64 num_bytes = ffe_ctl->num_bytes;
3837 	u64 avail;
3838 	u64 bytenr = block_group->start;
3839 	u64 log_bytenr;
3840 	u64 data_reloc_bytenr;
3841 	int ret = 0;
3842 	bool skip = false;
3843 
3844 	ASSERT(btrfs_is_zoned(block_group->fs_info));
3845 
3846 	/*
3847 	 * Do not allow non-tree-log blocks in the dedicated tree-log block
3848 	 * group, and vice versa.
3849 	 */
3850 	spin_lock(&fs_info->treelog_bg_lock);
3851 	log_bytenr = fs_info->treelog_bg;
3852 	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3853 			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3854 		skip = true;
3855 	spin_unlock(&fs_info->treelog_bg_lock);
3856 	if (skip)
3857 		return 1;
3858 
3859 	/*
3860 	 * Do not allow non-relocation blocks in the dedicated relocation block
3861 	 * group, and vice versa.
3862 	 */
3863 	spin_lock(&fs_info->relocation_bg_lock);
3864 	data_reloc_bytenr = fs_info->data_reloc_bg;
3865 	if (data_reloc_bytenr &&
3866 	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3867 	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3868 		skip = true;
3869 	spin_unlock(&fs_info->relocation_bg_lock);
3870 	if (skip)
3871 		return 1;
3872 
3873 	/* Check RO and no space case before trying to activate it */
3874 	spin_lock(&block_group->lock);
3875 	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3876 		ret = 1;
3877 		/*
3878 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3879 		 * Return the error after taking the locks.
3880 		 */
3881 	}
3882 	spin_unlock(&block_group->lock);
3883 
3884 	/* Metadata block group is activated at write time. */
3885 	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3886 	    !btrfs_zone_activate(block_group)) {
3887 		ret = 1;
3888 		/*
3889 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3890 		 * Return the error after taking the locks.
3891 		 */
3892 	}
3893 
3894 	spin_lock(&space_info->lock);
3895 	spin_lock(&block_group->lock);
3896 	spin_lock(&fs_info->treelog_bg_lock);
3897 	spin_lock(&fs_info->relocation_bg_lock);
3898 
3899 	if (ret)
3900 		goto out;
3901 
3902 	ASSERT(!ffe_ctl->for_treelog ||
3903 	       block_group->start == fs_info->treelog_bg ||
3904 	       fs_info->treelog_bg == 0);
3905 	ASSERT(!ffe_ctl->for_data_reloc ||
3906 	       block_group->start == fs_info->data_reloc_bg ||
3907 	       fs_info->data_reloc_bg == 0);
3908 
3909 	if (block_group->ro ||
3910 	    (!ffe_ctl->for_data_reloc &&
3911 	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3912 		ret = 1;
3913 		goto out;
3914 	}
3915 
3916 	/*
3917 	 * Do not allow currently using block group to be tree-log dedicated
3918 	 * block group.
3919 	 */
3920 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3921 	    (block_group->used || block_group->reserved)) {
3922 		ret = 1;
3923 		goto out;
3924 	}
3925 
3926 	/*
3927 	 * Do not allow currently used block group to be the data relocation
3928 	 * dedicated block group.
3929 	 */
3930 	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3931 	    (block_group->used || block_group->reserved)) {
3932 		ret = 1;
3933 		goto out;
3934 	}
3935 
3936 	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3937 	avail = block_group->zone_capacity - block_group->alloc_offset;
3938 	if (avail < num_bytes) {
3939 		if (ffe_ctl->max_extent_size < avail) {
3940 			/*
3941 			 * With sequential allocator, free space is always
3942 			 * contiguous
3943 			 */
3944 			ffe_ctl->max_extent_size = avail;
3945 			ffe_ctl->total_free_space = avail;
3946 		}
3947 		ret = 1;
3948 		goto out;
3949 	}
3950 
3951 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3952 		fs_info->treelog_bg = block_group->start;
3953 
3954 	if (ffe_ctl->for_data_reloc) {
3955 		if (!fs_info->data_reloc_bg)
3956 			fs_info->data_reloc_bg = block_group->start;
3957 		/*
3958 		 * Do not allow allocations from this block group, unless it is
3959 		 * for data relocation. Compared to increasing the ->ro, setting
3960 		 * the ->zoned_data_reloc_ongoing flag still allows nocow
3961 		 * writers to come in. See btrfs_inc_nocow_writers().
3962 		 *
3963 		 * We need to disable an allocation to avoid an allocation of
3964 		 * regular (non-relocation data) extent. With mix of relocation
3965 		 * extents and regular extents, we can dispatch WRITE commands
3966 		 * (for relocation extents) and ZONE APPEND commands (for
3967 		 * regular extents) at the same time to the same zone, which
3968 		 * easily break the write pointer.
3969 		 *
3970 		 * Also, this flag avoids this block group to be zone finished.
3971 		 */
3972 		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3973 	}
3974 
3975 	ffe_ctl->found_offset = start + block_group->alloc_offset;
3976 	block_group->alloc_offset += num_bytes;
3977 	spin_lock(&ctl->tree_lock);
3978 	ctl->free_space -= num_bytes;
3979 	spin_unlock(&ctl->tree_lock);
3980 
3981 	/*
3982 	 * We do not check if found_offset is aligned to stripesize. The
3983 	 * address is anyway rewritten when using zone append writing.
3984 	 */
3985 
3986 	ffe_ctl->search_start = ffe_ctl->found_offset;
3987 
3988 out:
3989 	if (ret && ffe_ctl->for_treelog)
3990 		fs_info->treelog_bg = 0;
3991 	if (ret && ffe_ctl->for_data_reloc)
3992 		fs_info->data_reloc_bg = 0;
3993 	spin_unlock(&fs_info->relocation_bg_lock);
3994 	spin_unlock(&fs_info->treelog_bg_lock);
3995 	spin_unlock(&block_group->lock);
3996 	spin_unlock(&space_info->lock);
3997 	return ret;
3998 }
3999 
4000 static int do_allocation(struct btrfs_block_group *block_group,
4001 			 struct find_free_extent_ctl *ffe_ctl,
4002 			 struct btrfs_block_group **bg_ret)
4003 {
4004 	switch (ffe_ctl->policy) {
4005 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4006 		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
4007 	case BTRFS_EXTENT_ALLOC_ZONED:
4008 		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
4009 	default:
4010 		BUG();
4011 	}
4012 }
4013 
4014 static void release_block_group(struct btrfs_block_group *block_group,
4015 				struct find_free_extent_ctl *ffe_ctl,
4016 				int delalloc)
4017 {
4018 	switch (ffe_ctl->policy) {
4019 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4020 		ffe_ctl->retry_uncached = false;
4021 		break;
4022 	case BTRFS_EXTENT_ALLOC_ZONED:
4023 		/* Nothing to do */
4024 		break;
4025 	default:
4026 		BUG();
4027 	}
4028 
4029 	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4030 	       ffe_ctl->index);
4031 	btrfs_release_block_group(block_group, delalloc);
4032 }
4033 
4034 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
4035 				   struct btrfs_key *ins)
4036 {
4037 	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4038 
4039 	if (!ffe_ctl->use_cluster && last_ptr) {
4040 		spin_lock(&last_ptr->lock);
4041 		last_ptr->window_start = ins->objectid;
4042 		spin_unlock(&last_ptr->lock);
4043 	}
4044 }
4045 
4046 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
4047 			 struct btrfs_key *ins)
4048 {
4049 	switch (ffe_ctl->policy) {
4050 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4051 		found_extent_clustered(ffe_ctl, ins);
4052 		break;
4053 	case BTRFS_EXTENT_ALLOC_ZONED:
4054 		/* Nothing to do */
4055 		break;
4056 	default:
4057 		BUG();
4058 	}
4059 }
4060 
4061 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
4062 				    struct find_free_extent_ctl *ffe_ctl)
4063 {
4064 	/* Block group's activeness is not a requirement for METADATA block groups. */
4065 	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
4066 		return 0;
4067 
4068 	/* If we can activate new zone, just allocate a chunk and use it */
4069 	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
4070 		return 0;
4071 
4072 	/*
4073 	 * We already reached the max active zones. Try to finish one block
4074 	 * group to make a room for a new block group. This is only possible
4075 	 * for a data block group because btrfs_zone_finish() may need to wait
4076 	 * for a running transaction which can cause a deadlock for metadata
4077 	 * allocation.
4078 	 */
4079 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4080 		int ret = btrfs_zone_finish_one_bg(fs_info);
4081 
4082 		if (ret == 1)
4083 			return 0;
4084 		else if (ret < 0)
4085 			return ret;
4086 	}
4087 
4088 	/*
4089 	 * If we have enough free space left in an already active block group
4090 	 * and we can't activate any other zone now, do not allow allocating a
4091 	 * new chunk and let find_free_extent() retry with a smaller size.
4092 	 */
4093 	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
4094 		return -ENOSPC;
4095 
4096 	/*
4097 	 * Even min_alloc_size is not left in any block groups. Since we cannot
4098 	 * activate a new block group, allocating it may not help. Let's tell a
4099 	 * caller to try again and hope it progress something by writing some
4100 	 * parts of the region. That is only possible for data block groups,
4101 	 * where a part of the region can be written.
4102 	 */
4103 	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
4104 		return -EAGAIN;
4105 
4106 	/*
4107 	 * We cannot activate a new block group and no enough space left in any
4108 	 * block groups. So, allocating a new block group may not help. But,
4109 	 * there is nothing to do anyway, so let's go with it.
4110 	 */
4111 	return 0;
4112 }
4113 
4114 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
4115 			      struct find_free_extent_ctl *ffe_ctl)
4116 {
4117 	switch (ffe_ctl->policy) {
4118 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4119 		return 0;
4120 	case BTRFS_EXTENT_ALLOC_ZONED:
4121 		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
4122 	default:
4123 		BUG();
4124 	}
4125 }
4126 
4127 /*
4128  * Return >0 means caller needs to re-search for free extent
4129  * Return 0 means we have the needed free extent.
4130  * Return <0 means we failed to locate any free extent.
4131  */
4132 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4133 					struct btrfs_key *ins,
4134 					struct find_free_extent_ctl *ffe_ctl,
4135 					bool full_search)
4136 {
4137 	struct btrfs_root *root = fs_info->chunk_root;
4138 	int ret;
4139 
4140 	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4141 	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4142 		ffe_ctl->orig_have_caching_bg = true;
4143 
4144 	if (ins->objectid) {
4145 		found_extent(ffe_ctl, ins);
4146 		return 0;
4147 	}
4148 
4149 	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4150 		return 1;
4151 
4152 	ffe_ctl->index++;
4153 	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4154 		return 1;
4155 
4156 	/* See the comments for btrfs_loop_type for an explanation of the phases. */
4157 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4158 		ffe_ctl->index = 0;
4159 		/*
4160 		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4161 		 * any uncached bgs and we've already done a full search
4162 		 * through.
4163 		 */
4164 		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4165 		    (!ffe_ctl->orig_have_caching_bg && full_search))
4166 			ffe_ctl->loop++;
4167 		ffe_ctl->loop++;
4168 
4169 		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4170 			struct btrfs_trans_handle *trans;
4171 			int exist = 0;
4172 
4173 			/* Check if allocation policy allows to create a new chunk */
4174 			ret = can_allocate_chunk(fs_info, ffe_ctl);
4175 			if (ret)
4176 				return ret;
4177 
4178 			trans = current->journal_info;
4179 			if (trans)
4180 				exist = 1;
4181 			else
4182 				trans = btrfs_join_transaction(root);
4183 
4184 			if (IS_ERR(trans)) {
4185 				ret = PTR_ERR(trans);
4186 				return ret;
4187 			}
4188 
4189 			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4190 						CHUNK_ALLOC_FORCE_FOR_EXTENT);
4191 
4192 			/* Do not bail out on ENOSPC since we can do more. */
4193 			if (ret == -ENOSPC) {
4194 				ret = 0;
4195 				ffe_ctl->loop++;
4196 			}
4197 			else if (ret < 0)
4198 				btrfs_abort_transaction(trans, ret);
4199 			else
4200 				ret = 0;
4201 			if (!exist)
4202 				btrfs_end_transaction(trans);
4203 			if (ret)
4204 				return ret;
4205 		}
4206 
4207 		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4208 			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4209 				return -ENOSPC;
4210 
4211 			/*
4212 			 * Don't loop again if we already have no empty_size and
4213 			 * no empty_cluster.
4214 			 */
4215 			if (ffe_ctl->empty_size == 0 &&
4216 			    ffe_ctl->empty_cluster == 0)
4217 				return -ENOSPC;
4218 			ffe_ctl->empty_size = 0;
4219 			ffe_ctl->empty_cluster = 0;
4220 		}
4221 		return 1;
4222 	}
4223 	return -ENOSPC;
4224 }
4225 
4226 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4227 					      struct btrfs_block_group *bg)
4228 {
4229 	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4230 		return true;
4231 	if (!btrfs_block_group_should_use_size_class(bg))
4232 		return true;
4233 	if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4234 		return true;
4235 	if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4236 	    bg->size_class == BTRFS_BG_SZ_NONE)
4237 		return true;
4238 	return ffe_ctl->size_class == bg->size_class;
4239 }
4240 
4241 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4242 					struct find_free_extent_ctl *ffe_ctl,
4243 					struct btrfs_space_info *space_info,
4244 					struct btrfs_key *ins)
4245 {
4246 	/*
4247 	 * If our free space is heavily fragmented we may not be able to make
4248 	 * big contiguous allocations, so instead of doing the expensive search
4249 	 * for free space, simply return ENOSPC with our max_extent_size so we
4250 	 * can go ahead and search for a more manageable chunk.
4251 	 *
4252 	 * If our max_extent_size is large enough for our allocation simply
4253 	 * disable clustering since we will likely not be able to find enough
4254 	 * space to create a cluster and induce latency trying.
4255 	 */
4256 	if (space_info->max_extent_size) {
4257 		spin_lock(&space_info->lock);
4258 		if (space_info->max_extent_size &&
4259 		    ffe_ctl->num_bytes > space_info->max_extent_size) {
4260 			ins->offset = space_info->max_extent_size;
4261 			spin_unlock(&space_info->lock);
4262 			return -ENOSPC;
4263 		} else if (space_info->max_extent_size) {
4264 			ffe_ctl->use_cluster = false;
4265 		}
4266 		spin_unlock(&space_info->lock);
4267 	}
4268 
4269 	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4270 					       &ffe_ctl->empty_cluster);
4271 	if (ffe_ctl->last_ptr) {
4272 		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4273 
4274 		spin_lock(&last_ptr->lock);
4275 		if (last_ptr->block_group)
4276 			ffe_ctl->hint_byte = last_ptr->window_start;
4277 		if (last_ptr->fragmented) {
4278 			/*
4279 			 * We still set window_start so we can keep track of the
4280 			 * last place we found an allocation to try and save
4281 			 * some time.
4282 			 */
4283 			ffe_ctl->hint_byte = last_ptr->window_start;
4284 			ffe_ctl->use_cluster = false;
4285 		}
4286 		spin_unlock(&last_ptr->lock);
4287 	}
4288 
4289 	return 0;
4290 }
4291 
4292 static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4293 				    struct find_free_extent_ctl *ffe_ctl)
4294 {
4295 	if (ffe_ctl->for_treelog) {
4296 		spin_lock(&fs_info->treelog_bg_lock);
4297 		if (fs_info->treelog_bg)
4298 			ffe_ctl->hint_byte = fs_info->treelog_bg;
4299 		spin_unlock(&fs_info->treelog_bg_lock);
4300 	} else if (ffe_ctl->for_data_reloc) {
4301 		spin_lock(&fs_info->relocation_bg_lock);
4302 		if (fs_info->data_reloc_bg)
4303 			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4304 		spin_unlock(&fs_info->relocation_bg_lock);
4305 	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4306 		struct btrfs_block_group *block_group;
4307 
4308 		spin_lock(&fs_info->zone_active_bgs_lock);
4309 		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4310 			/*
4311 			 * No lock is OK here because avail is monotinically
4312 			 * decreasing, and this is just a hint.
4313 			 */
4314 			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4315 
4316 			if (block_group_bits(block_group, ffe_ctl->flags) &&
4317 			    avail >= ffe_ctl->num_bytes) {
4318 				ffe_ctl->hint_byte = block_group->start;
4319 				break;
4320 			}
4321 		}
4322 		spin_unlock(&fs_info->zone_active_bgs_lock);
4323 	}
4324 
4325 	return 0;
4326 }
4327 
4328 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4329 			      struct find_free_extent_ctl *ffe_ctl,
4330 			      struct btrfs_space_info *space_info,
4331 			      struct btrfs_key *ins)
4332 {
4333 	switch (ffe_ctl->policy) {
4334 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4335 		return prepare_allocation_clustered(fs_info, ffe_ctl,
4336 						    space_info, ins);
4337 	case BTRFS_EXTENT_ALLOC_ZONED:
4338 		return prepare_allocation_zoned(fs_info, ffe_ctl);
4339 	default:
4340 		BUG();
4341 	}
4342 }
4343 
4344 /*
4345  * walks the btree of allocated extents and find a hole of a given size.
4346  * The key ins is changed to record the hole:
4347  * ins->objectid == start position
4348  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4349  * ins->offset == the size of the hole.
4350  * Any available blocks before search_start are skipped.
4351  *
4352  * If there is no suitable free space, we will record the max size of
4353  * the free space extent currently.
4354  *
4355  * The overall logic and call chain:
4356  *
4357  * find_free_extent()
4358  * |- Iterate through all block groups
4359  * |  |- Get a valid block group
4360  * |  |- Try to do clustered allocation in that block group
4361  * |  |- Try to do unclustered allocation in that block group
4362  * |  |- Check if the result is valid
4363  * |  |  |- If valid, then exit
4364  * |  |- Jump to next block group
4365  * |
4366  * |- Push harder to find free extents
4367  *    |- If not found, re-iterate all block groups
4368  */
4369 static noinline int find_free_extent(struct btrfs_root *root,
4370 				     struct btrfs_key *ins,
4371 				     struct find_free_extent_ctl *ffe_ctl)
4372 {
4373 	struct btrfs_fs_info *fs_info = root->fs_info;
4374 	int ret = 0;
4375 	int cache_block_group_error = 0;
4376 	struct btrfs_block_group *block_group = NULL;
4377 	struct btrfs_space_info *space_info;
4378 	bool full_search = false;
4379 
4380 	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4381 
4382 	ffe_ctl->search_start = 0;
4383 	/* For clustered allocation */
4384 	ffe_ctl->empty_cluster = 0;
4385 	ffe_ctl->last_ptr = NULL;
4386 	ffe_ctl->use_cluster = true;
4387 	ffe_ctl->have_caching_bg = false;
4388 	ffe_ctl->orig_have_caching_bg = false;
4389 	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4390 	ffe_ctl->loop = 0;
4391 	ffe_ctl->retry_uncached = false;
4392 	ffe_ctl->cached = 0;
4393 	ffe_ctl->max_extent_size = 0;
4394 	ffe_ctl->total_free_space = 0;
4395 	ffe_ctl->found_offset = 0;
4396 	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4397 	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4398 
4399 	if (btrfs_is_zoned(fs_info))
4400 		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4401 
4402 	ins->type = BTRFS_EXTENT_ITEM_KEY;
4403 	ins->objectid = 0;
4404 	ins->offset = 0;
4405 
4406 	trace_find_free_extent(root, ffe_ctl);
4407 
4408 	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4409 	if (!space_info) {
4410 		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4411 		return -ENOSPC;
4412 	}
4413 
4414 	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4415 	if (ret < 0)
4416 		return ret;
4417 
4418 	ffe_ctl->search_start = max(ffe_ctl->search_start,
4419 				    first_logical_byte(fs_info));
4420 	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4421 	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4422 		block_group = btrfs_lookup_block_group(fs_info,
4423 						       ffe_ctl->search_start);
4424 		/*
4425 		 * we don't want to use the block group if it doesn't match our
4426 		 * allocation bits, or if its not cached.
4427 		 *
4428 		 * However if we are re-searching with an ideal block group
4429 		 * picked out then we don't care that the block group is cached.
4430 		 */
4431 		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4432 		    block_group->cached != BTRFS_CACHE_NO) {
4433 			down_read(&space_info->groups_sem);
4434 			if (list_empty(&block_group->list) ||
4435 			    block_group->ro) {
4436 				/*
4437 				 * someone is removing this block group,
4438 				 * we can't jump into the have_block_group
4439 				 * target because our list pointers are not
4440 				 * valid
4441 				 */
4442 				btrfs_put_block_group(block_group);
4443 				up_read(&space_info->groups_sem);
4444 			} else {
4445 				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4446 							block_group->flags);
4447 				btrfs_lock_block_group(block_group,
4448 						       ffe_ctl->delalloc);
4449 				ffe_ctl->hinted = true;
4450 				goto have_block_group;
4451 			}
4452 		} else if (block_group) {
4453 			btrfs_put_block_group(block_group);
4454 		}
4455 	}
4456 search:
4457 	trace_find_free_extent_search_loop(root, ffe_ctl);
4458 	ffe_ctl->have_caching_bg = false;
4459 	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4460 	    ffe_ctl->index == 0)
4461 		full_search = true;
4462 	down_read(&space_info->groups_sem);
4463 	list_for_each_entry(block_group,
4464 			    &space_info->block_groups[ffe_ctl->index], list) {
4465 		struct btrfs_block_group *bg_ret;
4466 
4467 		ffe_ctl->hinted = false;
4468 		/* If the block group is read-only, we can skip it entirely. */
4469 		if (unlikely(block_group->ro)) {
4470 			if (ffe_ctl->for_treelog)
4471 				btrfs_clear_treelog_bg(block_group);
4472 			if (ffe_ctl->for_data_reloc)
4473 				btrfs_clear_data_reloc_bg(block_group);
4474 			continue;
4475 		}
4476 
4477 		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4478 		ffe_ctl->search_start = block_group->start;
4479 
4480 		/*
4481 		 * this can happen if we end up cycling through all the
4482 		 * raid types, but we want to make sure we only allocate
4483 		 * for the proper type.
4484 		 */
4485 		if (!block_group_bits(block_group, ffe_ctl->flags)) {
4486 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4487 				BTRFS_BLOCK_GROUP_RAID1_MASK |
4488 				BTRFS_BLOCK_GROUP_RAID56_MASK |
4489 				BTRFS_BLOCK_GROUP_RAID10;
4490 
4491 			/*
4492 			 * if they asked for extra copies and this block group
4493 			 * doesn't provide them, bail.  This does allow us to
4494 			 * fill raid0 from raid1.
4495 			 */
4496 			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4497 				goto loop;
4498 
4499 			/*
4500 			 * This block group has different flags than we want.
4501 			 * It's possible that we have MIXED_GROUP flag but no
4502 			 * block group is mixed.  Just skip such block group.
4503 			 */
4504 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4505 			continue;
4506 		}
4507 
4508 have_block_group:
4509 		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4510 		ffe_ctl->cached = btrfs_block_group_done(block_group);
4511 		if (unlikely(!ffe_ctl->cached)) {
4512 			ffe_ctl->have_caching_bg = true;
4513 			ret = btrfs_cache_block_group(block_group, false);
4514 
4515 			/*
4516 			 * If we get ENOMEM here or something else we want to
4517 			 * try other block groups, because it may not be fatal.
4518 			 * However if we can't find anything else we need to
4519 			 * save our return here so that we return the actual
4520 			 * error that caused problems, not ENOSPC.
4521 			 */
4522 			if (ret < 0) {
4523 				if (!cache_block_group_error)
4524 					cache_block_group_error = ret;
4525 				ret = 0;
4526 				goto loop;
4527 			}
4528 			ret = 0;
4529 		}
4530 
4531 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4532 			if (!cache_block_group_error)
4533 				cache_block_group_error = -EIO;
4534 			goto loop;
4535 		}
4536 
4537 		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4538 			goto loop;
4539 
4540 		bg_ret = NULL;
4541 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4542 		if (ret > 0)
4543 			goto loop;
4544 
4545 		if (bg_ret && bg_ret != block_group) {
4546 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4547 			block_group = bg_ret;
4548 		}
4549 
4550 		/* Checks */
4551 		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4552 						 fs_info->stripesize);
4553 
4554 		/* move on to the next group */
4555 		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4556 		    block_group->start + block_group->length) {
4557 			btrfs_add_free_space_unused(block_group,
4558 					    ffe_ctl->found_offset,
4559 					    ffe_ctl->num_bytes);
4560 			goto loop;
4561 		}
4562 
4563 		if (ffe_ctl->found_offset < ffe_ctl->search_start)
4564 			btrfs_add_free_space_unused(block_group,
4565 					ffe_ctl->found_offset,
4566 					ffe_ctl->search_start - ffe_ctl->found_offset);
4567 
4568 		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4569 					       ffe_ctl->num_bytes,
4570 					       ffe_ctl->delalloc,
4571 					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4572 		if (ret == -EAGAIN) {
4573 			btrfs_add_free_space_unused(block_group,
4574 					ffe_ctl->found_offset,
4575 					ffe_ctl->num_bytes);
4576 			goto loop;
4577 		}
4578 		btrfs_inc_block_group_reservations(block_group);
4579 
4580 		/* we are all good, lets return */
4581 		ins->objectid = ffe_ctl->search_start;
4582 		ins->offset = ffe_ctl->num_bytes;
4583 
4584 		trace_btrfs_reserve_extent(block_group, ffe_ctl);
4585 		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4586 		break;
4587 loop:
4588 		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4589 		    !ffe_ctl->retry_uncached) {
4590 			ffe_ctl->retry_uncached = true;
4591 			btrfs_wait_block_group_cache_progress(block_group,
4592 						ffe_ctl->num_bytes +
4593 						ffe_ctl->empty_cluster +
4594 						ffe_ctl->empty_size);
4595 			goto have_block_group;
4596 		}
4597 		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4598 		cond_resched();
4599 	}
4600 	up_read(&space_info->groups_sem);
4601 
4602 	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4603 	if (ret > 0)
4604 		goto search;
4605 
4606 	if (ret == -ENOSPC && !cache_block_group_error) {
4607 		/*
4608 		 * Use ffe_ctl->total_free_space as fallback if we can't find
4609 		 * any contiguous hole.
4610 		 */
4611 		if (!ffe_ctl->max_extent_size)
4612 			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4613 		spin_lock(&space_info->lock);
4614 		space_info->max_extent_size = ffe_ctl->max_extent_size;
4615 		spin_unlock(&space_info->lock);
4616 		ins->offset = ffe_ctl->max_extent_size;
4617 	} else if (ret == -ENOSPC) {
4618 		ret = cache_block_group_error;
4619 	}
4620 	return ret;
4621 }
4622 
4623 /*
4624  * Entry point to the extent allocator. Tries to find a hole that is at least
4625  * as big as @num_bytes.
4626  *
4627  * @root           -	The root that will contain this extent
4628  *
4629  * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
4630  *			is used for accounting purposes. This value differs
4631  *			from @num_bytes only in the case of compressed extents.
4632  *
4633  * @num_bytes      -	Number of bytes to allocate on-disk.
4634  *
4635  * @min_alloc_size -	Indicates the minimum amount of space that the
4636  *			allocator should try to satisfy. In some cases
4637  *			@num_bytes may be larger than what is required and if
4638  *			the filesystem is fragmented then allocation fails.
4639  *			However, the presence of @min_alloc_size gives a
4640  *			chance to try and satisfy the smaller allocation.
4641  *
4642  * @empty_size     -	A hint that you plan on doing more COW. This is the
4643  *			size in bytes the allocator should try to find free
4644  *			next to the block it returns.  This is just a hint and
4645  *			may be ignored by the allocator.
4646  *
4647  * @hint_byte      -	Hint to the allocator to start searching above the byte
4648  *			address passed. It might be ignored.
4649  *
4650  * @ins            -	This key is modified to record the found hole. It will
4651  *			have the following values:
4652  *			ins->objectid == start position
4653  *			ins->flags = BTRFS_EXTENT_ITEM_KEY
4654  *			ins->offset == the size of the hole.
4655  *
4656  * @is_data        -	Boolean flag indicating whether an extent is
4657  *			allocated for data (true) or metadata (false)
4658  *
4659  * @delalloc       -	Boolean flag indicating whether this allocation is for
4660  *			delalloc or not. If 'true' data_rwsem of block groups
4661  *			is going to be acquired.
4662  *
4663  *
4664  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4665  * case -ENOSPC is returned then @ins->offset will contain the size of the
4666  * largest available hole the allocator managed to find.
4667  */
4668 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4669 			 u64 num_bytes, u64 min_alloc_size,
4670 			 u64 empty_size, u64 hint_byte,
4671 			 struct btrfs_key *ins, int is_data, int delalloc)
4672 {
4673 	struct btrfs_fs_info *fs_info = root->fs_info;
4674 	struct find_free_extent_ctl ffe_ctl = {};
4675 	bool final_tried = num_bytes == min_alloc_size;
4676 	u64 flags;
4677 	int ret;
4678 	bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
4679 	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4680 
4681 	flags = get_alloc_profile_by_root(root, is_data);
4682 again:
4683 	WARN_ON(num_bytes < fs_info->sectorsize);
4684 
4685 	ffe_ctl.ram_bytes = ram_bytes;
4686 	ffe_ctl.num_bytes = num_bytes;
4687 	ffe_ctl.min_alloc_size = min_alloc_size;
4688 	ffe_ctl.empty_size = empty_size;
4689 	ffe_ctl.flags = flags;
4690 	ffe_ctl.delalloc = delalloc;
4691 	ffe_ctl.hint_byte = hint_byte;
4692 	ffe_ctl.for_treelog = for_treelog;
4693 	ffe_ctl.for_data_reloc = for_data_reloc;
4694 
4695 	ret = find_free_extent(root, ins, &ffe_ctl);
4696 	if (!ret && !is_data) {
4697 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4698 	} else if (ret == -ENOSPC) {
4699 		if (!final_tried && ins->offset) {
4700 			num_bytes = min(num_bytes >> 1, ins->offset);
4701 			num_bytes = round_down(num_bytes,
4702 					       fs_info->sectorsize);
4703 			num_bytes = max(num_bytes, min_alloc_size);
4704 			ram_bytes = num_bytes;
4705 			if (num_bytes == min_alloc_size)
4706 				final_tried = true;
4707 			goto again;
4708 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4709 			struct btrfs_space_info *sinfo;
4710 
4711 			sinfo = btrfs_find_space_info(fs_info, flags);
4712 			btrfs_err(fs_info,
4713 	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4714 				  flags, num_bytes, for_treelog, for_data_reloc);
4715 			if (sinfo)
4716 				btrfs_dump_space_info(fs_info, sinfo,
4717 						      num_bytes, 1);
4718 		}
4719 	}
4720 
4721 	return ret;
4722 }
4723 
4724 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4725 			       u64 start, u64 len, int delalloc)
4726 {
4727 	struct btrfs_block_group *cache;
4728 
4729 	cache = btrfs_lookup_block_group(fs_info, start);
4730 	if (!cache) {
4731 		btrfs_err(fs_info, "Unable to find block group for %llu",
4732 			  start);
4733 		return -ENOSPC;
4734 	}
4735 
4736 	btrfs_add_free_space(cache, start, len);
4737 	btrfs_free_reserved_bytes(cache, len, delalloc);
4738 	trace_btrfs_reserved_extent_free(fs_info, start, len);
4739 
4740 	btrfs_put_block_group(cache);
4741 	return 0;
4742 }
4743 
4744 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4745 			      const struct extent_buffer *eb)
4746 {
4747 	struct btrfs_block_group *cache;
4748 	int ret = 0;
4749 
4750 	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4751 	if (!cache) {
4752 		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4753 			  eb->start);
4754 		return -ENOSPC;
4755 	}
4756 
4757 	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4758 	btrfs_put_block_group(cache);
4759 	return ret;
4760 }
4761 
4762 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4763 				 u64 num_bytes)
4764 {
4765 	struct btrfs_fs_info *fs_info = trans->fs_info;
4766 	int ret;
4767 
4768 	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4769 	if (ret)
4770 		return ret;
4771 
4772 	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4773 	if (ret) {
4774 		ASSERT(!ret);
4775 		btrfs_err(fs_info, "update block group failed for %llu %llu",
4776 			  bytenr, num_bytes);
4777 		return ret;
4778 	}
4779 
4780 	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4781 	return 0;
4782 }
4783 
4784 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4785 				      u64 parent, u64 root_objectid,
4786 				      u64 flags, u64 owner, u64 offset,
4787 				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
4788 {
4789 	struct btrfs_fs_info *fs_info = trans->fs_info;
4790 	struct btrfs_root *extent_root;
4791 	int ret;
4792 	struct btrfs_extent_item *extent_item;
4793 	struct btrfs_extent_owner_ref *oref;
4794 	struct btrfs_extent_inline_ref *iref;
4795 	struct btrfs_path *path;
4796 	struct extent_buffer *leaf;
4797 	int type;
4798 	u32 size;
4799 	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
4800 
4801 	if (parent > 0)
4802 		type = BTRFS_SHARED_DATA_REF_KEY;
4803 	else
4804 		type = BTRFS_EXTENT_DATA_REF_KEY;
4805 
4806 	size = sizeof(*extent_item);
4807 	if (simple_quota)
4808 		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
4809 	size += btrfs_extent_inline_ref_size(type);
4810 
4811 	path = btrfs_alloc_path();
4812 	if (!path)
4813 		return -ENOMEM;
4814 
4815 	extent_root = btrfs_extent_root(fs_info, ins->objectid);
4816 	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4817 	if (ret) {
4818 		btrfs_free_path(path);
4819 		return ret;
4820 	}
4821 
4822 	leaf = path->nodes[0];
4823 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4824 				     struct btrfs_extent_item);
4825 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4826 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4827 	btrfs_set_extent_flags(leaf, extent_item,
4828 			       flags | BTRFS_EXTENT_FLAG_DATA);
4829 
4830 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4831 	if (simple_quota) {
4832 		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
4833 		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
4834 		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
4835 		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
4836 	}
4837 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4838 
4839 	if (parent > 0) {
4840 		struct btrfs_shared_data_ref *ref;
4841 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4842 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4843 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4844 	} else {
4845 		struct btrfs_extent_data_ref *ref;
4846 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4847 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4848 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4849 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4850 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4851 	}
4852 
4853 	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4854 	btrfs_free_path(path);
4855 
4856 	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4857 }
4858 
4859 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4860 				     struct btrfs_delayed_ref_node *node,
4861 				     struct btrfs_delayed_extent_op *extent_op)
4862 {
4863 	struct btrfs_fs_info *fs_info = trans->fs_info;
4864 	struct btrfs_root *extent_root;
4865 	int ret;
4866 	struct btrfs_extent_item *extent_item;
4867 	struct btrfs_key extent_key;
4868 	struct btrfs_tree_block_info *block_info;
4869 	struct btrfs_extent_inline_ref *iref;
4870 	struct btrfs_path *path;
4871 	struct extent_buffer *leaf;
4872 	u32 size = sizeof(*extent_item) + sizeof(*iref);
4873 	const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
4874 	/* The owner of a tree block is the level. */
4875 	int level = btrfs_delayed_ref_owner(node);
4876 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4877 
4878 	extent_key.objectid = node->bytenr;
4879 	if (skinny_metadata) {
4880 		/* The owner of a tree block is the level. */
4881 		extent_key.offset = level;
4882 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4883 	} else {
4884 		extent_key.offset = node->num_bytes;
4885 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4886 		size += sizeof(*block_info);
4887 	}
4888 
4889 	path = btrfs_alloc_path();
4890 	if (!path)
4891 		return -ENOMEM;
4892 
4893 	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4894 	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4895 				      size);
4896 	if (ret) {
4897 		btrfs_free_path(path);
4898 		return ret;
4899 	}
4900 
4901 	leaf = path->nodes[0];
4902 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4903 				     struct btrfs_extent_item);
4904 	btrfs_set_extent_refs(leaf, extent_item, 1);
4905 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4906 	btrfs_set_extent_flags(leaf, extent_item,
4907 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4908 
4909 	if (skinny_metadata) {
4910 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4911 	} else {
4912 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4913 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4914 		btrfs_set_tree_block_level(leaf, block_info, level);
4915 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4916 	}
4917 
4918 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4919 		btrfs_set_extent_inline_ref_type(leaf, iref,
4920 						 BTRFS_SHARED_BLOCK_REF_KEY);
4921 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
4922 	} else {
4923 		btrfs_set_extent_inline_ref_type(leaf, iref,
4924 						 BTRFS_TREE_BLOCK_REF_KEY);
4925 		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
4926 	}
4927 
4928 	btrfs_mark_buffer_dirty(trans, leaf);
4929 	btrfs_free_path(path);
4930 
4931 	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4932 }
4933 
4934 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4935 				     struct btrfs_root *root, u64 owner,
4936 				     u64 offset, u64 ram_bytes,
4937 				     struct btrfs_key *ins)
4938 {
4939 	struct btrfs_ref generic_ref = {
4940 		.action = BTRFS_ADD_DELAYED_EXTENT,
4941 		.bytenr = ins->objectid,
4942 		.num_bytes = ins->offset,
4943 		.owning_root = btrfs_root_id(root),
4944 		.ref_root = btrfs_root_id(root),
4945 	};
4946 
4947 	ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
4948 
4949 	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
4950 		generic_ref.owning_root = root->relocation_src_root;
4951 
4952 	btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
4953 	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4954 
4955 	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4956 }
4957 
4958 /*
4959  * this is used by the tree logging recovery code.  It records that
4960  * an extent has been allocated and makes sure to clear the free
4961  * space cache bits as well
4962  */
4963 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4964 				   u64 root_objectid, u64 owner, u64 offset,
4965 				   struct btrfs_key *ins)
4966 {
4967 	struct btrfs_fs_info *fs_info = trans->fs_info;
4968 	int ret;
4969 	struct btrfs_block_group *block_group;
4970 	struct btrfs_space_info *space_info;
4971 	struct btrfs_squota_delta delta = {
4972 		.root = root_objectid,
4973 		.num_bytes = ins->offset,
4974 		.generation = trans->transid,
4975 		.is_data = true,
4976 		.is_inc = true,
4977 	};
4978 
4979 	/*
4980 	 * Mixed block groups will exclude before processing the log so we only
4981 	 * need to do the exclude dance if this fs isn't mixed.
4982 	 */
4983 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4984 		ret = __exclude_logged_extent(fs_info, ins->objectid,
4985 					      ins->offset);
4986 		if (ret)
4987 			return ret;
4988 	}
4989 
4990 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4991 	if (!block_group)
4992 		return -EINVAL;
4993 
4994 	space_info = block_group->space_info;
4995 	spin_lock(&space_info->lock);
4996 	spin_lock(&block_group->lock);
4997 	space_info->bytes_reserved += ins->offset;
4998 	block_group->reserved += ins->offset;
4999 	spin_unlock(&block_group->lock);
5000 	spin_unlock(&space_info->lock);
5001 
5002 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
5003 					 offset, ins, 1, root_objectid);
5004 	if (ret)
5005 		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
5006 	ret = btrfs_record_squota_delta(fs_info, &delta);
5007 	btrfs_put_block_group(block_group);
5008 	return ret;
5009 }
5010 
5011 #ifdef CONFIG_BTRFS_DEBUG
5012 /*
5013  * Extra safety check in case the extent tree is corrupted and extent allocator
5014  * chooses to use a tree block which is already used and locked.
5015  */
5016 static bool check_eb_lock_owner(const struct extent_buffer *eb)
5017 {
5018 	if (eb->lock_owner == current->pid) {
5019 		btrfs_err_rl(eb->fs_info,
5020 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
5021 			     eb->start, btrfs_header_owner(eb), current->pid);
5022 		return true;
5023 	}
5024 	return false;
5025 }
5026 #else
5027 static bool check_eb_lock_owner(struct extent_buffer *eb)
5028 {
5029 	return false;
5030 }
5031 #endif
5032 
5033 static struct extent_buffer *
5034 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5035 		      u64 bytenr, int level, u64 owner,
5036 		      enum btrfs_lock_nesting nest)
5037 {
5038 	struct btrfs_fs_info *fs_info = root->fs_info;
5039 	struct extent_buffer *buf;
5040 	u64 lockdep_owner = owner;
5041 
5042 	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
5043 	if (IS_ERR(buf))
5044 		return buf;
5045 
5046 	if (check_eb_lock_owner(buf)) {
5047 		free_extent_buffer(buf);
5048 		return ERR_PTR(-EUCLEAN);
5049 	}
5050 
5051 	/*
5052 	 * The reloc trees are just snapshots, so we need them to appear to be
5053 	 * just like any other fs tree WRT lockdep.
5054 	 *
5055 	 * The exception however is in replace_path() in relocation, where we
5056 	 * hold the lock on the original fs root and then search for the reloc
5057 	 * root.  At that point we need to make sure any reloc root buffers are
5058 	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
5059 	 * lockdep happy.
5060 	 */
5061 	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
5062 	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
5063 		lockdep_owner = BTRFS_FS_TREE_OBJECTID;
5064 
5065 	/* btrfs_clear_buffer_dirty() accesses generation field. */
5066 	btrfs_set_header_generation(buf, trans->transid);
5067 
5068 	/*
5069 	 * This needs to stay, because we could allocate a freed block from an
5070 	 * old tree into a new tree, so we need to make sure this new block is
5071 	 * set to the appropriate level and owner.
5072 	 */
5073 	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
5074 
5075 	btrfs_tree_lock_nested(buf, nest);
5076 	btrfs_clear_buffer_dirty(trans, buf);
5077 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
5078 	clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
5079 
5080 	set_extent_buffer_uptodate(buf);
5081 
5082 	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
5083 	btrfs_set_header_level(buf, level);
5084 	btrfs_set_header_bytenr(buf, buf->start);
5085 	btrfs_set_header_generation(buf, trans->transid);
5086 	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
5087 	btrfs_set_header_owner(buf, owner);
5088 	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
5089 	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
5090 	if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
5091 		buf->log_index = root->log_transid % 2;
5092 		/*
5093 		 * we allow two log transactions at a time, use different
5094 		 * EXTENT bit to differentiate dirty pages.
5095 		 */
5096 		if (buf->log_index == 0)
5097 			set_extent_bit(&root->dirty_log_pages, buf->start,
5098 				       buf->start + buf->len - 1,
5099 				       EXTENT_DIRTY, NULL);
5100 		else
5101 			set_extent_bit(&root->dirty_log_pages, buf->start,
5102 				       buf->start + buf->len - 1,
5103 				       EXTENT_NEW, NULL);
5104 	} else {
5105 		buf->log_index = -1;
5106 		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
5107 			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
5108 	}
5109 	/* this returns a buffer locked for blocking */
5110 	return buf;
5111 }
5112 
5113 /*
5114  * finds a free extent and does all the dirty work required for allocation
5115  * returns the tree buffer or an ERR_PTR on error.
5116  */
5117 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
5118 					     struct btrfs_root *root,
5119 					     u64 parent, u64 root_objectid,
5120 					     const struct btrfs_disk_key *key,
5121 					     int level, u64 hint,
5122 					     u64 empty_size,
5123 					     u64 reloc_src_root,
5124 					     enum btrfs_lock_nesting nest)
5125 {
5126 	struct btrfs_fs_info *fs_info = root->fs_info;
5127 	struct btrfs_key ins;
5128 	struct btrfs_block_rsv *block_rsv;
5129 	struct extent_buffer *buf;
5130 	u64 flags = 0;
5131 	int ret;
5132 	u32 blocksize = fs_info->nodesize;
5133 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
5134 	u64 owning_root;
5135 
5136 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5137 	if (btrfs_is_testing(fs_info)) {
5138 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
5139 					    level, root_objectid, nest);
5140 		if (!IS_ERR(buf))
5141 			root->alloc_bytenr += blocksize;
5142 		return buf;
5143 	}
5144 #endif
5145 
5146 	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
5147 	if (IS_ERR(block_rsv))
5148 		return ERR_CAST(block_rsv);
5149 
5150 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
5151 				   empty_size, hint, &ins, 0, 0);
5152 	if (ret)
5153 		goto out_unuse;
5154 
5155 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
5156 				    root_objectid, nest);
5157 	if (IS_ERR(buf)) {
5158 		ret = PTR_ERR(buf);
5159 		goto out_free_reserved;
5160 	}
5161 	owning_root = btrfs_header_owner(buf);
5162 
5163 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5164 		if (parent == 0)
5165 			parent = ins.objectid;
5166 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5167 		owning_root = reloc_src_root;
5168 	} else
5169 		BUG_ON(parent > 0);
5170 
5171 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5172 		struct btrfs_delayed_extent_op *extent_op;
5173 		struct btrfs_ref generic_ref = {
5174 			.action = BTRFS_ADD_DELAYED_EXTENT,
5175 			.bytenr = ins.objectid,
5176 			.num_bytes = ins.offset,
5177 			.parent = parent,
5178 			.owning_root = owning_root,
5179 			.ref_root = root_objectid,
5180 		};
5181 
5182 		if (!skinny_metadata || flags != 0) {
5183 			extent_op = btrfs_alloc_delayed_extent_op();
5184 			if (!extent_op) {
5185 				ret = -ENOMEM;
5186 				goto out_free_buf;
5187 			}
5188 			if (key)
5189 				memcpy(&extent_op->key, key, sizeof(extent_op->key));
5190 			else
5191 				memset(&extent_op->key, 0, sizeof(extent_op->key));
5192 			extent_op->flags_to_set = flags;
5193 			extent_op->update_key = (skinny_metadata ? false : true);
5194 			extent_op->update_flags = (flags != 0);
5195 		} else {
5196 			extent_op = NULL;
5197 		}
5198 
5199 		btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
5200 		btrfs_ref_tree_mod(fs_info, &generic_ref);
5201 		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5202 		if (ret) {
5203 			btrfs_free_delayed_extent_op(extent_op);
5204 			goto out_free_buf;
5205 		}
5206 	}
5207 	return buf;
5208 
5209 out_free_buf:
5210 	btrfs_tree_unlock(buf);
5211 	free_extent_buffer(buf);
5212 out_free_reserved:
5213 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5214 out_unuse:
5215 	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5216 	return ERR_PTR(ret);
5217 }
5218 
5219 struct walk_control {
5220 	u64 refs[BTRFS_MAX_LEVEL];
5221 	u64 flags[BTRFS_MAX_LEVEL];
5222 	struct btrfs_key update_progress;
5223 	struct btrfs_key drop_progress;
5224 	int drop_level;
5225 	int stage;
5226 	int level;
5227 	int shared_level;
5228 	int update_ref;
5229 	int keep_locks;
5230 	int reada_slot;
5231 	int reada_count;
5232 	int restarted;
5233 	/* Indicate that extent info needs to be looked up when walking the tree. */
5234 	int lookup_info;
5235 };
5236 
5237 /*
5238  * This is our normal stage.  We are traversing blocks the current snapshot owns
5239  * and we are dropping any of our references to any children we are able to, and
5240  * then freeing the block once we've processed all of the children.
5241  */
5242 #define DROP_REFERENCE	1
5243 
5244 /*
5245  * We enter this stage when we have to walk into a child block (meaning we can't
5246  * simply drop our reference to it from our current parent node) and there are
5247  * more than one reference on it.  If we are the owner of any of the children
5248  * blocks from the current parent node then we have to do the FULL_BACKREF dance
5249  * on them in order to drop our normal ref and add the shared ref.
5250  */
5251 #define UPDATE_BACKREF	2
5252 
5253 /*
5254  * Decide if we need to walk down into this node to adjust the references.
5255  *
5256  * @root:	the root we are currently deleting
5257  * @wc:		the walk control for this deletion
5258  * @eb:		the parent eb that we're currently visiting
5259  * @refs:	the number of refs for wc->level - 1
5260  * @flags:	the flags for wc->level - 1
5261  * @slot:	the slot in the eb that we're currently checking
5262  *
5263  * This is meant to be called when we're evaluating if a node we point to at
5264  * wc->level should be read and walked into, or if we can simply delete our
5265  * reference to it.  We return true if we should walk into the node, false if we
5266  * can skip it.
5267  *
5268  * We have assertions in here to make sure this is called correctly.  We assume
5269  * that sanity checking on the blocks read to this point has been done, so any
5270  * corrupted file systems must have been caught before calling this function.
5271  */
5272 static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
5273 				  struct extent_buffer *eb, u64 refs, u64 flags, int slot)
5274 {
5275 	struct btrfs_key key;
5276 	u64 generation;
5277 	int level = wc->level;
5278 
5279 	ASSERT(level > 0);
5280 	ASSERT(wc->refs[level - 1] > 0);
5281 
5282 	/*
5283 	 * The update backref stage we only want to skip if we already have
5284 	 * FULL_BACKREF set, otherwise we need to read.
5285 	 */
5286 	if (wc->stage == UPDATE_BACKREF) {
5287 		if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5288 			return false;
5289 		return true;
5290 	}
5291 
5292 	/*
5293 	 * We're the last ref on this block, we must walk into it and process
5294 	 * any refs it's pointing at.
5295 	 */
5296 	if (wc->refs[level - 1] == 1)
5297 		return true;
5298 
5299 	/*
5300 	 * If we're already FULL_BACKREF then we know we can just drop our
5301 	 * current reference.
5302 	 */
5303 	if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5304 		return false;
5305 
5306 	/*
5307 	 * This block is older than our creation generation, we can drop our
5308 	 * reference to it.
5309 	 */
5310 	generation = btrfs_node_ptr_generation(eb, slot);
5311 	if (!wc->update_ref || generation <= root->root_key.offset)
5312 		return false;
5313 
5314 	/*
5315 	 * This block was processed from a previous snapshot deletion run, we
5316 	 * can skip it.
5317 	 */
5318 	btrfs_node_key_to_cpu(eb, &key, slot);
5319 	if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
5320 		return false;
5321 
5322 	/* All other cases we need to wander into the node. */
5323 	return true;
5324 }
5325 
5326 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5327 				     struct btrfs_root *root,
5328 				     struct walk_control *wc,
5329 				     struct btrfs_path *path)
5330 {
5331 	struct btrfs_fs_info *fs_info = root->fs_info;
5332 	u64 bytenr;
5333 	u64 generation;
5334 	u64 refs;
5335 	u64 flags;
5336 	u32 nritems;
5337 	struct extent_buffer *eb;
5338 	int ret;
5339 	int slot;
5340 	int nread = 0;
5341 
5342 	if (path->slots[wc->level] < wc->reada_slot) {
5343 		wc->reada_count = wc->reada_count * 2 / 3;
5344 		wc->reada_count = max(wc->reada_count, 2);
5345 	} else {
5346 		wc->reada_count = wc->reada_count * 3 / 2;
5347 		wc->reada_count = min_t(int, wc->reada_count,
5348 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5349 	}
5350 
5351 	eb = path->nodes[wc->level];
5352 	nritems = btrfs_header_nritems(eb);
5353 
5354 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5355 		if (nread >= wc->reada_count)
5356 			break;
5357 
5358 		cond_resched();
5359 		bytenr = btrfs_node_blockptr(eb, slot);
5360 		generation = btrfs_node_ptr_generation(eb, slot);
5361 
5362 		if (slot == path->slots[wc->level])
5363 			goto reada;
5364 
5365 		if (wc->stage == UPDATE_BACKREF &&
5366 		    generation <= root->root_key.offset)
5367 			continue;
5368 
5369 		/* We don't lock the tree block, it's OK to be racy here */
5370 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5371 					       wc->level - 1, 1, &refs,
5372 					       &flags, NULL);
5373 		/* We don't care about errors in readahead. */
5374 		if (ret < 0)
5375 			continue;
5376 
5377 		/*
5378 		 * This could be racey, it's conceivable that we raced and end
5379 		 * up with a bogus refs count, if that's the case just skip, if
5380 		 * we are actually corrupt we will notice when we look up
5381 		 * everything again with our locks.
5382 		 */
5383 		if (refs == 0)
5384 			continue;
5385 
5386 		/* If we don't need to visit this node don't reada. */
5387 		if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
5388 			continue;
5389 reada:
5390 		btrfs_readahead_node_child(eb, slot);
5391 		nread++;
5392 	}
5393 	wc->reada_slot = slot;
5394 }
5395 
5396 /*
5397  * helper to process tree block while walking down the tree.
5398  *
5399  * when wc->stage == UPDATE_BACKREF, this function updates
5400  * back refs for pointers in the block.
5401  *
5402  * NOTE: return value 1 means we should stop walking down.
5403  */
5404 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5405 				   struct btrfs_root *root,
5406 				   struct btrfs_path *path,
5407 				   struct walk_control *wc)
5408 {
5409 	struct btrfs_fs_info *fs_info = root->fs_info;
5410 	int level = wc->level;
5411 	struct extent_buffer *eb = path->nodes[level];
5412 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5413 	int ret;
5414 
5415 	if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
5416 		return 1;
5417 
5418 	/*
5419 	 * when reference count of tree block is 1, it won't increase
5420 	 * again. once full backref flag is set, we never clear it.
5421 	 */
5422 	if (wc->lookup_info &&
5423 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5424 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5425 		ASSERT(path->locks[level]);
5426 		ret = btrfs_lookup_extent_info(trans, fs_info,
5427 					       eb->start, level, 1,
5428 					       &wc->refs[level],
5429 					       &wc->flags[level],
5430 					       NULL);
5431 		if (ret)
5432 			return ret;
5433 		if (unlikely(wc->refs[level] == 0)) {
5434 			btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5435 				  eb->start);
5436 			return -EUCLEAN;
5437 		}
5438 	}
5439 
5440 	if (wc->stage == DROP_REFERENCE) {
5441 		if (wc->refs[level] > 1)
5442 			return 1;
5443 
5444 		if (path->locks[level] && !wc->keep_locks) {
5445 			btrfs_tree_unlock_rw(eb, path->locks[level]);
5446 			path->locks[level] = 0;
5447 		}
5448 		return 0;
5449 	}
5450 
5451 	/* wc->stage == UPDATE_BACKREF */
5452 	if (!(wc->flags[level] & flag)) {
5453 		ASSERT(path->locks[level]);
5454 		ret = btrfs_inc_ref(trans, root, eb, 1);
5455 		if (ret) {
5456 			btrfs_abort_transaction(trans, ret);
5457 			return ret;
5458 		}
5459 		ret = btrfs_dec_ref(trans, root, eb, 0);
5460 		if (ret) {
5461 			btrfs_abort_transaction(trans, ret);
5462 			return ret;
5463 		}
5464 		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5465 		if (ret) {
5466 			btrfs_abort_transaction(trans, ret);
5467 			return ret;
5468 		}
5469 		wc->flags[level] |= flag;
5470 	}
5471 
5472 	/*
5473 	 * the block is shared by multiple trees, so it's not good to
5474 	 * keep the tree lock
5475 	 */
5476 	if (path->locks[level] && level > 0) {
5477 		btrfs_tree_unlock_rw(eb, path->locks[level]);
5478 		path->locks[level] = 0;
5479 	}
5480 	return 0;
5481 }
5482 
5483 /*
5484  * This is used to verify a ref exists for this root to deal with a bug where we
5485  * would have a drop_progress key that hadn't been updated properly.
5486  */
5487 static int check_ref_exists(struct btrfs_trans_handle *trans,
5488 			    struct btrfs_root *root, u64 bytenr, u64 parent,
5489 			    int level)
5490 {
5491 	struct btrfs_delayed_ref_root *delayed_refs;
5492 	struct btrfs_delayed_ref_head *head;
5493 	struct btrfs_path *path;
5494 	struct btrfs_extent_inline_ref *iref;
5495 	int ret;
5496 	bool exists = false;
5497 
5498 	path = btrfs_alloc_path();
5499 	if (!path)
5500 		return -ENOMEM;
5501 again:
5502 	ret = lookup_extent_backref(trans, path, &iref, bytenr,
5503 				    root->fs_info->nodesize, parent,
5504 				    btrfs_root_id(root), level, 0);
5505 	if (ret != -ENOENT) {
5506 		/*
5507 		 * If we get 0 then we found our reference, return 1, else
5508 		 * return the error if it's not -ENOENT;
5509 		 */
5510 		btrfs_free_path(path);
5511 		return (ret < 0 ) ? ret : 1;
5512 	}
5513 
5514 	/*
5515 	 * We could have a delayed ref with this reference, so look it up while
5516 	 * we're holding the path open to make sure we don't race with the
5517 	 * delayed ref running.
5518 	 */
5519 	delayed_refs = &trans->transaction->delayed_refs;
5520 	spin_lock(&delayed_refs->lock);
5521 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
5522 	if (!head)
5523 		goto out;
5524 	if (!mutex_trylock(&head->mutex)) {
5525 		/*
5526 		 * We're contended, means that the delayed ref is running, get a
5527 		 * reference and wait for the ref head to be complete and then
5528 		 * try again.
5529 		 */
5530 		refcount_inc(&head->refs);
5531 		spin_unlock(&delayed_refs->lock);
5532 
5533 		btrfs_release_path(path);
5534 
5535 		mutex_lock(&head->mutex);
5536 		mutex_unlock(&head->mutex);
5537 		btrfs_put_delayed_ref_head(head);
5538 		goto again;
5539 	}
5540 
5541 	exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
5542 	mutex_unlock(&head->mutex);
5543 out:
5544 	spin_unlock(&delayed_refs->lock);
5545 	btrfs_free_path(path);
5546 	return exists ? 1 : 0;
5547 }
5548 
5549 /*
5550  * We may not have an uptodate block, so if we are going to walk down into this
5551  * block we need to drop the lock, read it off of the disk, re-lock it and
5552  * return to continue dropping the snapshot.
5553  */
5554 static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
5555 				     struct btrfs_root *root,
5556 				     struct btrfs_path *path,
5557 				     struct walk_control *wc,
5558 				     struct extent_buffer *next)
5559 {
5560 	struct btrfs_tree_parent_check check = { 0 };
5561 	u64 generation;
5562 	int level = wc->level;
5563 	int ret;
5564 
5565 	btrfs_assert_tree_write_locked(next);
5566 
5567 	generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
5568 
5569 	if (btrfs_buffer_uptodate(next, generation, 0))
5570 		return 0;
5571 
5572 	check.level = level - 1;
5573 	check.transid = generation;
5574 	check.owner_root = btrfs_root_id(root);
5575 	check.has_first_key = true;
5576 	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
5577 
5578 	btrfs_tree_unlock(next);
5579 	if (level == 1)
5580 		reada_walk_down(trans, root, wc, path);
5581 	ret = btrfs_read_extent_buffer(next, &check);
5582 	if (ret) {
5583 		free_extent_buffer(next);
5584 		return ret;
5585 	}
5586 	btrfs_tree_lock(next);
5587 	wc->lookup_info = 1;
5588 	return 0;
5589 }
5590 
5591 /*
5592  * If we determine that we don't have to visit wc->level - 1 then we need to
5593  * determine if we can drop our reference.
5594  *
5595  * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
5596  *
5597  * If we are DROP_REFERENCE this will figure out if we need to drop our current
5598  * reference, skipping it if we dropped it from a previous incompleted drop, or
5599  * dropping it if we still have a reference to it.
5600  */
5601 static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5602 				struct btrfs_path *path, struct walk_control *wc,
5603 				struct extent_buffer *next, u64 owner_root)
5604 {
5605 	struct btrfs_ref ref = {
5606 		.action = BTRFS_DROP_DELAYED_REF,
5607 		.bytenr = next->start,
5608 		.num_bytes = root->fs_info->nodesize,
5609 		.owning_root = owner_root,
5610 		.ref_root = btrfs_root_id(root),
5611 	};
5612 	int level = wc->level;
5613 	int ret;
5614 
5615 	/* We are UPDATE_BACKREF, we're not dropping anything. */
5616 	if (wc->stage == UPDATE_BACKREF)
5617 		return 0;
5618 
5619 	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5620 		ref.parent = path->nodes[level]->start;
5621 	} else {
5622 		ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
5623 		if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
5624 			btrfs_err(root->fs_info, "mismatched block owner");
5625 			return -EIO;
5626 		}
5627 	}
5628 
5629 	/*
5630 	 * If we had a drop_progress we need to verify the refs are set as
5631 	 * expected.  If we find our ref then we know that from here on out
5632 	 * everything should be correct, and we can clear the
5633 	 * ->restarted flag.
5634 	 */
5635 	if (wc->restarted) {
5636 		ret = check_ref_exists(trans, root, next->start, ref.parent,
5637 				       level - 1);
5638 		if (ret <= 0)
5639 			return ret;
5640 		ret = 0;
5641 		wc->restarted = 0;
5642 	}
5643 
5644 	/*
5645 	 * Reloc tree doesn't contribute to qgroup numbers, and we have already
5646 	 * accounted them at merge time (replace_path), thus we could skip
5647 	 * expensive subtree trace here.
5648 	 */
5649 	if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
5650 	    wc->refs[level - 1] > 1) {
5651 		u64 generation = btrfs_node_ptr_generation(path->nodes[level],
5652 							   path->slots[level]);
5653 
5654 		ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
5655 		if (ret) {
5656 			btrfs_err_rl(root->fs_info,
5657 "error %d accounting shared subtree, quota is out of sync, rescan required",
5658 				     ret);
5659 		}
5660 	}
5661 
5662 	/*
5663 	 * We need to update the next key in our walk control so we can update
5664 	 * the drop_progress key accordingly.  We don't care if find_next_key
5665 	 * doesn't find a key because that means we're at the end and are going
5666 	 * to clean up now.
5667 	 */
5668 	wc->drop_level = level;
5669 	find_next_key(path, level, &wc->drop_progress);
5670 
5671 	btrfs_init_tree_ref(&ref, level - 1, 0, false);
5672 	return btrfs_free_extent(trans, &ref);
5673 }
5674 
5675 /*
5676  * helper to process tree block pointer.
5677  *
5678  * when wc->stage == DROP_REFERENCE, this function checks
5679  * reference count of the block pointed to. if the block
5680  * is shared and we need update back refs for the subtree
5681  * rooted at the block, this function changes wc->stage to
5682  * UPDATE_BACKREF. if the block is shared and there is no
5683  * need to update back, this function drops the reference
5684  * to the block.
5685  *
5686  * NOTE: return value 1 means we should stop walking down.
5687  */
5688 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5689 				 struct btrfs_root *root,
5690 				 struct btrfs_path *path,
5691 				 struct walk_control *wc)
5692 {
5693 	struct btrfs_fs_info *fs_info = root->fs_info;
5694 	u64 bytenr;
5695 	u64 generation;
5696 	u64 owner_root = 0;
5697 	struct extent_buffer *next;
5698 	int level = wc->level;
5699 	int ret = 0;
5700 
5701 	generation = btrfs_node_ptr_generation(path->nodes[level],
5702 					       path->slots[level]);
5703 	/*
5704 	 * if the lower level block was created before the snapshot
5705 	 * was created, we know there is no need to update back refs
5706 	 * for the subtree
5707 	 */
5708 	if (wc->stage == UPDATE_BACKREF &&
5709 	    generation <= root->root_key.offset) {
5710 		wc->lookup_info = 1;
5711 		return 1;
5712 	}
5713 
5714 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5715 
5716 	next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
5717 					    level - 1);
5718 	if (IS_ERR(next))
5719 		return PTR_ERR(next);
5720 
5721 	btrfs_tree_lock(next);
5722 
5723 	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5724 				       &wc->refs[level - 1],
5725 				       &wc->flags[level - 1],
5726 				       &owner_root);
5727 	if (ret < 0)
5728 		goto out_unlock;
5729 
5730 	if (unlikely(wc->refs[level - 1] == 0)) {
5731 		btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5732 			  bytenr);
5733 		ret = -EUCLEAN;
5734 		goto out_unlock;
5735 	}
5736 	wc->lookup_info = 0;
5737 
5738 	/* If we don't have to walk into this node skip it. */
5739 	if (!visit_node_for_delete(root, wc, path->nodes[level],
5740 				   wc->refs[level - 1], wc->flags[level - 1],
5741 				   path->slots[level]))
5742 		goto skip;
5743 
5744 	/*
5745 	 * We have to walk down into this node, and if we're currently at the
5746 	 * DROP_REFERNCE stage and this block is shared then we need to switch
5747 	 * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
5748 	 */
5749 	if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
5750 		wc->stage = UPDATE_BACKREF;
5751 		wc->shared_level = level - 1;
5752 	}
5753 
5754 	ret = check_next_block_uptodate(trans, root, path, wc, next);
5755 	if (ret)
5756 		return ret;
5757 
5758 	level--;
5759 	ASSERT(level == btrfs_header_level(next));
5760 	if (level != btrfs_header_level(next)) {
5761 		btrfs_err(root->fs_info, "mismatched level");
5762 		ret = -EIO;
5763 		goto out_unlock;
5764 	}
5765 	path->nodes[level] = next;
5766 	path->slots[level] = 0;
5767 	path->locks[level] = BTRFS_WRITE_LOCK;
5768 	wc->level = level;
5769 	if (wc->level == 1)
5770 		wc->reada_slot = 0;
5771 	return 0;
5772 skip:
5773 	ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
5774 	if (ret)
5775 		goto out_unlock;
5776 	wc->refs[level - 1] = 0;
5777 	wc->flags[level - 1] = 0;
5778 	wc->lookup_info = 1;
5779 	ret = 1;
5780 
5781 out_unlock:
5782 	btrfs_tree_unlock(next);
5783 	free_extent_buffer(next);
5784 
5785 	return ret;
5786 }
5787 
5788 /*
5789  * helper to process tree block while walking up the tree.
5790  *
5791  * when wc->stage == DROP_REFERENCE, this function drops
5792  * reference count on the block.
5793  *
5794  * when wc->stage == UPDATE_BACKREF, this function changes
5795  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5796  * to UPDATE_BACKREF previously while processing the block.
5797  *
5798  * NOTE: return value 1 means we should stop walking up.
5799  */
5800 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5801 				 struct btrfs_root *root,
5802 				 struct btrfs_path *path,
5803 				 struct walk_control *wc)
5804 {
5805 	struct btrfs_fs_info *fs_info = root->fs_info;
5806 	int ret = 0;
5807 	int level = wc->level;
5808 	struct extent_buffer *eb = path->nodes[level];
5809 	u64 parent = 0;
5810 
5811 	if (wc->stage == UPDATE_BACKREF) {
5812 		ASSERT(wc->shared_level >= level);
5813 		if (level < wc->shared_level)
5814 			goto out;
5815 
5816 		ret = find_next_key(path, level + 1, &wc->update_progress);
5817 		if (ret > 0)
5818 			wc->update_ref = 0;
5819 
5820 		wc->stage = DROP_REFERENCE;
5821 		wc->shared_level = -1;
5822 		path->slots[level] = 0;
5823 
5824 		/*
5825 		 * check reference count again if the block isn't locked.
5826 		 * we should start walking down the tree again if reference
5827 		 * count is one.
5828 		 */
5829 		if (!path->locks[level]) {
5830 			ASSERT(level > 0);
5831 			btrfs_tree_lock(eb);
5832 			path->locks[level] = BTRFS_WRITE_LOCK;
5833 
5834 			ret = btrfs_lookup_extent_info(trans, fs_info,
5835 						       eb->start, level, 1,
5836 						       &wc->refs[level],
5837 						       &wc->flags[level],
5838 						       NULL);
5839 			if (ret < 0) {
5840 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5841 				path->locks[level] = 0;
5842 				return ret;
5843 			}
5844 			if (unlikely(wc->refs[level] == 0)) {
5845 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5846 				btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5847 					  eb->start);
5848 				return -EUCLEAN;
5849 			}
5850 			if (wc->refs[level] == 1) {
5851 				btrfs_tree_unlock_rw(eb, path->locks[level]);
5852 				path->locks[level] = 0;
5853 				return 1;
5854 			}
5855 		}
5856 	}
5857 
5858 	/* wc->stage == DROP_REFERENCE */
5859 	ASSERT(path->locks[level] || wc->refs[level] == 1);
5860 
5861 	if (wc->refs[level] == 1) {
5862 		if (level == 0) {
5863 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5864 				ret = btrfs_dec_ref(trans, root, eb, 1);
5865 			else
5866 				ret = btrfs_dec_ref(trans, root, eb, 0);
5867 			if (ret) {
5868 				btrfs_abort_transaction(trans, ret);
5869 				return ret;
5870 			}
5871 			if (is_fstree(btrfs_root_id(root))) {
5872 				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5873 				if (ret) {
5874 					btrfs_err_rl(fs_info,
5875 	"error %d accounting leaf items, quota is out of sync, rescan required",
5876 					     ret);
5877 				}
5878 			}
5879 		}
5880 		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5881 		if (!path->locks[level]) {
5882 			btrfs_tree_lock(eb);
5883 			path->locks[level] = BTRFS_WRITE_LOCK;
5884 		}
5885 		btrfs_clear_buffer_dirty(trans, eb);
5886 	}
5887 
5888 	if (eb == root->node) {
5889 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5890 			parent = eb->start;
5891 		else if (btrfs_root_id(root) != btrfs_header_owner(eb))
5892 			goto owner_mismatch;
5893 	} else {
5894 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5895 			parent = path->nodes[level + 1]->start;
5896 		else if (btrfs_root_id(root) !=
5897 			 btrfs_header_owner(path->nodes[level + 1]))
5898 			goto owner_mismatch;
5899 	}
5900 
5901 	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5902 				    wc->refs[level] == 1);
5903 	if (ret < 0)
5904 		btrfs_abort_transaction(trans, ret);
5905 out:
5906 	wc->refs[level] = 0;
5907 	wc->flags[level] = 0;
5908 	return ret;
5909 
5910 owner_mismatch:
5911 	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5912 		     btrfs_header_owner(eb), btrfs_root_id(root));
5913 	return -EUCLEAN;
5914 }
5915 
5916 /*
5917  * walk_down_tree consists of two steps.
5918  *
5919  * walk_down_proc().  Look up the reference count and reference of our current
5920  * wc->level.  At this point path->nodes[wc->level] should be populated and
5921  * uptodate, and in most cases should already be locked.  If we are in
5922  * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
5923  * we can walk back up the tree.  If we are UPDATE_BACKREF we have to set
5924  * FULL_BACKREF on this node if it's not already set, and then do the
5925  * FULL_BACKREF conversion dance, which is to drop the root reference and add
5926  * the shared reference to all of this nodes children.
5927  *
5928  * do_walk_down().  This is where we actually start iterating on the children of
5929  * our current path->nodes[wc->level].  For DROP_REFERENCE that means dropping
5930  * our reference to the children that return false from visit_node_for_delete(),
5931  * which has various conditions where we know we can just drop our reference
5932  * without visiting the node.  For UPDATE_BACKREF we will skip any children that
5933  * visit_node_for_delete() returns false for, only walking down when necessary.
5934  * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
5935  * snapshot deletion.
5936  */
5937 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5938 				   struct btrfs_root *root,
5939 				   struct btrfs_path *path,
5940 				   struct walk_control *wc)
5941 {
5942 	int level = wc->level;
5943 	int ret = 0;
5944 
5945 	wc->lookup_info = 1;
5946 	while (level >= 0) {
5947 		ret = walk_down_proc(trans, root, path, wc);
5948 		if (ret)
5949 			break;
5950 
5951 		if (level == 0)
5952 			break;
5953 
5954 		if (path->slots[level] >=
5955 		    btrfs_header_nritems(path->nodes[level]))
5956 			break;
5957 
5958 		ret = do_walk_down(trans, root, path, wc);
5959 		if (ret > 0) {
5960 			path->slots[level]++;
5961 			continue;
5962 		} else if (ret < 0)
5963 			break;
5964 		level = wc->level;
5965 	}
5966 	return (ret == 1) ? 0 : ret;
5967 }
5968 
5969 /*
5970  * walk_up_tree() is responsible for making sure we visit every slot on our
5971  * current node, and if we're at the end of that node then we call
5972  * walk_up_proc() on our current node which will do one of a few things based on
5973  * our stage.
5974  *
5975  * UPDATE_BACKREF.  If we wc->level is currently less than our wc->shared_level
5976  * then we need to walk back up the tree, and then going back down into the
5977  * other slots via walk_down_tree to update any other children from our original
5978  * wc->shared_level.  Once we're at or above our wc->shared_level we can switch
5979  * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
5980  *
5981  * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
5982  * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
5983  * in our current leaf.  After that we call btrfs_free_tree_block() on the
5984  * current node and walk up to the next node to walk down the next slot.
5985  */
5986 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5987 				 struct btrfs_root *root,
5988 				 struct btrfs_path *path,
5989 				 struct walk_control *wc, int max_level)
5990 {
5991 	int level = wc->level;
5992 	int ret;
5993 
5994 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5995 	while (level < max_level && path->nodes[level]) {
5996 		wc->level = level;
5997 		if (path->slots[level] + 1 <
5998 		    btrfs_header_nritems(path->nodes[level])) {
5999 			path->slots[level]++;
6000 			return 0;
6001 		} else {
6002 			ret = walk_up_proc(trans, root, path, wc);
6003 			if (ret > 0)
6004 				return 0;
6005 			if (ret < 0)
6006 				return ret;
6007 
6008 			if (path->locks[level]) {
6009 				btrfs_tree_unlock_rw(path->nodes[level],
6010 						     path->locks[level]);
6011 				path->locks[level] = 0;
6012 			}
6013 			free_extent_buffer(path->nodes[level]);
6014 			path->nodes[level] = NULL;
6015 			level++;
6016 		}
6017 	}
6018 	return 1;
6019 }
6020 
6021 /*
6022  * drop a subvolume tree.
6023  *
6024  * this function traverses the tree freeing any blocks that only
6025  * referenced by the tree.
6026  *
6027  * when a shared tree block is found. this function decreases its
6028  * reference count by one. if update_ref is true, this function
6029  * also make sure backrefs for the shared block and all lower level
6030  * blocks are properly updated.
6031  *
6032  * If called with for_reloc == 0, may exit early with -EAGAIN
6033  */
6034 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
6035 {
6036 	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
6037 	struct btrfs_fs_info *fs_info = root->fs_info;
6038 	struct btrfs_path *path;
6039 	struct btrfs_trans_handle *trans;
6040 	struct btrfs_root *tree_root = fs_info->tree_root;
6041 	struct btrfs_root_item *root_item = &root->root_item;
6042 	struct walk_control *wc;
6043 	struct btrfs_key key;
6044 	const u64 rootid = btrfs_root_id(root);
6045 	int ret = 0;
6046 	int level;
6047 	bool root_dropped = false;
6048 	bool unfinished_drop = false;
6049 
6050 	btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));
6051 
6052 	path = btrfs_alloc_path();
6053 	if (!path) {
6054 		ret = -ENOMEM;
6055 		goto out;
6056 	}
6057 
6058 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6059 	if (!wc) {
6060 		btrfs_free_path(path);
6061 		ret = -ENOMEM;
6062 		goto out;
6063 	}
6064 
6065 	/*
6066 	 * Use join to avoid potential EINTR from transaction start. See
6067 	 * wait_reserve_ticket and the whole reservation callchain.
6068 	 */
6069 	if (for_reloc)
6070 		trans = btrfs_join_transaction(tree_root);
6071 	else
6072 		trans = btrfs_start_transaction(tree_root, 0);
6073 	if (IS_ERR(trans)) {
6074 		ret = PTR_ERR(trans);
6075 		goto out_free;
6076 	}
6077 
6078 	ret = btrfs_run_delayed_items(trans);
6079 	if (ret)
6080 		goto out_end_trans;
6081 
6082 	/*
6083 	 * This will help us catch people modifying the fs tree while we're
6084 	 * dropping it.  It is unsafe to mess with the fs tree while it's being
6085 	 * dropped as we unlock the root node and parent nodes as we walk down
6086 	 * the tree, assuming nothing will change.  If something does change
6087 	 * then we'll have stale information and drop references to blocks we've
6088 	 * already dropped.
6089 	 */
6090 	set_bit(BTRFS_ROOT_DELETING, &root->state);
6091 	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
6092 
6093 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6094 		level = btrfs_header_level(root->node);
6095 		path->nodes[level] = btrfs_lock_root_node(root);
6096 		path->slots[level] = 0;
6097 		path->locks[level] = BTRFS_WRITE_LOCK;
6098 		memset(&wc->update_progress, 0,
6099 		       sizeof(wc->update_progress));
6100 	} else {
6101 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6102 		memcpy(&wc->update_progress, &key,
6103 		       sizeof(wc->update_progress));
6104 
6105 		level = btrfs_root_drop_level(root_item);
6106 		BUG_ON(level == 0);
6107 		path->lowest_level = level;
6108 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6109 		path->lowest_level = 0;
6110 		if (ret < 0)
6111 			goto out_end_trans;
6112 
6113 		WARN_ON(ret > 0);
6114 		ret = 0;
6115 
6116 		/*
6117 		 * unlock our path, this is safe because only this
6118 		 * function is allowed to delete this snapshot
6119 		 */
6120 		btrfs_unlock_up_safe(path, 0);
6121 
6122 		level = btrfs_header_level(root->node);
6123 		while (1) {
6124 			btrfs_tree_lock(path->nodes[level]);
6125 			path->locks[level] = BTRFS_WRITE_LOCK;
6126 
6127 			/*
6128 			 * btrfs_lookup_extent_info() returns 0 for success,
6129 			 * or < 0 for error.
6130 			 */
6131 			ret = btrfs_lookup_extent_info(trans, fs_info,
6132 						path->nodes[level]->start,
6133 						level, 1, &wc->refs[level],
6134 						&wc->flags[level], NULL);
6135 			if (ret < 0)
6136 				goto out_end_trans;
6137 
6138 			BUG_ON(wc->refs[level] == 0);
6139 
6140 			if (level == btrfs_root_drop_level(root_item))
6141 				break;
6142 
6143 			btrfs_tree_unlock(path->nodes[level]);
6144 			path->locks[level] = 0;
6145 			WARN_ON(wc->refs[level] != 1);
6146 			level--;
6147 		}
6148 	}
6149 
6150 	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
6151 	wc->level = level;
6152 	wc->shared_level = -1;
6153 	wc->stage = DROP_REFERENCE;
6154 	wc->update_ref = update_ref;
6155 	wc->keep_locks = 0;
6156 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6157 
6158 	while (1) {
6159 
6160 		ret = walk_down_tree(trans, root, path, wc);
6161 		if (ret < 0) {
6162 			btrfs_abort_transaction(trans, ret);
6163 			break;
6164 		}
6165 
6166 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6167 		if (ret < 0) {
6168 			btrfs_abort_transaction(trans, ret);
6169 			break;
6170 		}
6171 
6172 		if (ret > 0) {
6173 			BUG_ON(wc->stage != DROP_REFERENCE);
6174 			ret = 0;
6175 			break;
6176 		}
6177 
6178 		if (wc->stage == DROP_REFERENCE) {
6179 			wc->drop_level = wc->level;
6180 			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
6181 					      &wc->drop_progress,
6182 					      path->slots[wc->drop_level]);
6183 		}
6184 		btrfs_cpu_key_to_disk(&root_item->drop_progress,
6185 				      &wc->drop_progress);
6186 		btrfs_set_root_drop_level(root_item, wc->drop_level);
6187 
6188 		BUG_ON(wc->level == 0);
6189 		if (btrfs_should_end_transaction(trans) ||
6190 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
6191 			ret = btrfs_update_root(trans, tree_root,
6192 						&root->root_key,
6193 						root_item);
6194 			if (ret) {
6195 				btrfs_abort_transaction(trans, ret);
6196 				goto out_end_trans;
6197 			}
6198 
6199 			if (!is_reloc_root)
6200 				btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6201 
6202 			btrfs_end_transaction_throttle(trans);
6203 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
6204 				btrfs_debug(fs_info,
6205 					    "drop snapshot early exit");
6206 				ret = -EAGAIN;
6207 				goto out_free;
6208 			}
6209 
6210 		       /*
6211 			* Use join to avoid potential EINTR from transaction
6212 			* start. See wait_reserve_ticket and the whole
6213 			* reservation callchain.
6214 			*/
6215 			if (for_reloc)
6216 				trans = btrfs_join_transaction(tree_root);
6217 			else
6218 				trans = btrfs_start_transaction(tree_root, 0);
6219 			if (IS_ERR(trans)) {
6220 				ret = PTR_ERR(trans);
6221 				goto out_free;
6222 			}
6223 		}
6224 	}
6225 	btrfs_release_path(path);
6226 	if (ret)
6227 		goto out_end_trans;
6228 
6229 	ret = btrfs_del_root(trans, &root->root_key);
6230 	if (ret) {
6231 		btrfs_abort_transaction(trans, ret);
6232 		goto out_end_trans;
6233 	}
6234 
6235 	if (!is_reloc_root) {
6236 		ret = btrfs_find_root(tree_root, &root->root_key, path,
6237 				      NULL, NULL);
6238 		if (ret < 0) {
6239 			btrfs_abort_transaction(trans, ret);
6240 			goto out_end_trans;
6241 		} else if (ret > 0) {
6242 			ret = 0;
6243 			/*
6244 			 * If we fail to delete the orphan item this time
6245 			 * around, it'll get picked up the next time.
6246 			 *
6247 			 * The most common failure here is just -ENOENT.
6248 			 */
6249 			btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
6250 		}
6251 	}
6252 
6253 	/*
6254 	 * This subvolume is going to be completely dropped, and won't be
6255 	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
6256 	 * commit transaction time.  So free it here manually.
6257 	 */
6258 	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
6259 	btrfs_qgroup_free_meta_all_pertrans(root);
6260 
6261 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
6262 		btrfs_add_dropped_root(trans, root);
6263 	else
6264 		btrfs_put_root(root);
6265 	root_dropped = true;
6266 out_end_trans:
6267 	if (!is_reloc_root)
6268 		btrfs_set_last_root_drop_gen(fs_info, trans->transid);
6269 
6270 	btrfs_end_transaction_throttle(trans);
6271 out_free:
6272 	kfree(wc);
6273 	btrfs_free_path(path);
6274 out:
6275 	if (!ret && root_dropped) {
6276 		ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
6277 		if (ret < 0)
6278 			btrfs_warn_rl(fs_info,
6279 				      "failed to cleanup qgroup 0/%llu: %d",
6280 				      rootid, ret);
6281 		ret = 0;
6282 	}
6283 	/*
6284 	 * We were an unfinished drop root, check to see if there are any
6285 	 * pending, and if not clear and wake up any waiters.
6286 	 */
6287 	if (!ret && unfinished_drop)
6288 		btrfs_maybe_wake_unfinished_drop(fs_info);
6289 
6290 	/*
6291 	 * So if we need to stop dropping the snapshot for whatever reason we
6292 	 * need to make sure to add it back to the dead root list so that we
6293 	 * keep trying to do the work later.  This also cleans up roots if we
6294 	 * don't have it in the radix (like when we recover after a power fail
6295 	 * or unmount) so we don't leak memory.
6296 	 */
6297 	if (!for_reloc && !root_dropped)
6298 		btrfs_add_dead_root(root);
6299 	return ret;
6300 }
6301 
6302 /*
6303  * drop subtree rooted at tree block 'node'.
6304  *
6305  * NOTE: this function will unlock and release tree block 'node'
6306  * only used by relocation code
6307  */
6308 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6309 			struct btrfs_root *root,
6310 			struct extent_buffer *node,
6311 			struct extent_buffer *parent)
6312 {
6313 	struct btrfs_fs_info *fs_info = root->fs_info;
6314 	struct btrfs_path *path;
6315 	struct walk_control *wc;
6316 	int level;
6317 	int parent_level;
6318 	int ret = 0;
6319 
6320 	BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
6321 
6322 	path = btrfs_alloc_path();
6323 	if (!path)
6324 		return -ENOMEM;
6325 
6326 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
6327 	if (!wc) {
6328 		btrfs_free_path(path);
6329 		return -ENOMEM;
6330 	}
6331 
6332 	btrfs_assert_tree_write_locked(parent);
6333 	parent_level = btrfs_header_level(parent);
6334 	atomic_inc(&parent->refs);
6335 	path->nodes[parent_level] = parent;
6336 	path->slots[parent_level] = btrfs_header_nritems(parent);
6337 
6338 	btrfs_assert_tree_write_locked(node);
6339 	level = btrfs_header_level(node);
6340 	path->nodes[level] = node;
6341 	path->slots[level] = 0;
6342 	path->locks[level] = BTRFS_WRITE_LOCK;
6343 
6344 	wc->refs[parent_level] = 1;
6345 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6346 	wc->level = level;
6347 	wc->shared_level = -1;
6348 	wc->stage = DROP_REFERENCE;
6349 	wc->update_ref = 0;
6350 	wc->keep_locks = 1;
6351 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
6352 
6353 	while (1) {
6354 		ret = walk_down_tree(trans, root, path, wc);
6355 		if (ret < 0)
6356 			break;
6357 
6358 		ret = walk_up_tree(trans, root, path, wc, parent_level);
6359 		if (ret) {
6360 			if (ret > 0)
6361 				ret = 0;
6362 			break;
6363 		}
6364 	}
6365 
6366 	kfree(wc);
6367 	btrfs_free_path(path);
6368 	return ret;
6369 }
6370 
6371 /*
6372  * Unpin the extent range in an error context and don't add the space back.
6373  * Errors are not propagated further.
6374  */
6375 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
6376 {
6377 	unpin_extent_range(fs_info, start, end, false);
6378 }
6379 
6380 /*
6381  * It used to be that old block groups would be left around forever.
6382  * Iterating over them would be enough to trim unused space.  Since we
6383  * now automatically remove them, we also need to iterate over unallocated
6384  * space.
6385  *
6386  * We don't want a transaction for this since the discard may take a
6387  * substantial amount of time.  We don't require that a transaction be
6388  * running, but we do need to take a running transaction into account
6389  * to ensure that we're not discarding chunks that were released or
6390  * allocated in the current transaction.
6391  *
6392  * Holding the chunks lock will prevent other threads from allocating
6393  * or releasing chunks, but it won't prevent a running transaction
6394  * from committing and releasing the memory that the pending chunks
6395  * list head uses.  For that, we need to take a reference to the
6396  * transaction and hold the commit root sem.  We only need to hold
6397  * it while performing the free space search since we have already
6398  * held back allocations.
6399  */
6400 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6401 {
6402 	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6403 	int ret;
6404 
6405 	*trimmed = 0;
6406 
6407 	/* Discard not supported = nothing to do. */
6408 	if (!bdev_max_discard_sectors(device->bdev))
6409 		return 0;
6410 
6411 	/* Not writable = nothing to do. */
6412 	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6413 		return 0;
6414 
6415 	/* No free space = nothing to do. */
6416 	if (device->total_bytes <= device->bytes_used)
6417 		return 0;
6418 
6419 	ret = 0;
6420 
6421 	while (1) {
6422 		struct btrfs_fs_info *fs_info = device->fs_info;
6423 		u64 bytes;
6424 
6425 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6426 		if (ret)
6427 			break;
6428 
6429 		find_first_clear_extent_bit(&device->alloc_state, start,
6430 					    &start, &end,
6431 					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
6432 
6433 		/* Check if there are any CHUNK_* bits left */
6434 		if (start > device->total_bytes) {
6435 			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6436 			btrfs_warn_in_rcu(fs_info,
6437 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6438 					  start, end - start + 1,
6439 					  btrfs_dev_name(device),
6440 					  device->total_bytes);
6441 			mutex_unlock(&fs_info->chunk_mutex);
6442 			ret = 0;
6443 			break;
6444 		}
6445 
6446 		/* Ensure we skip the reserved space on each device. */
6447 		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6448 
6449 		/*
6450 		 * If find_first_clear_extent_bit find a range that spans the
6451 		 * end of the device it will set end to -1, in this case it's up
6452 		 * to the caller to trim the value to the size of the device.
6453 		 */
6454 		end = min(end, device->total_bytes - 1);
6455 
6456 		len = end - start + 1;
6457 
6458 		/* We didn't find any extents */
6459 		if (!len) {
6460 			mutex_unlock(&fs_info->chunk_mutex);
6461 			ret = 0;
6462 			break;
6463 		}
6464 
6465 		ret = btrfs_issue_discard(device->bdev, start, len,
6466 					  &bytes);
6467 		if (!ret)
6468 			set_extent_bit(&device->alloc_state, start,
6469 				       start + bytes - 1, CHUNK_TRIMMED, NULL);
6470 		mutex_unlock(&fs_info->chunk_mutex);
6471 
6472 		if (ret)
6473 			break;
6474 
6475 		start += len;
6476 		*trimmed += bytes;
6477 
6478 		if (btrfs_trim_interrupted()) {
6479 			ret = -ERESTARTSYS;
6480 			break;
6481 		}
6482 
6483 		cond_resched();
6484 	}
6485 
6486 	return ret;
6487 }
6488 
6489 /*
6490  * Trim the whole filesystem by:
6491  * 1) trimming the free space in each block group
6492  * 2) trimming the unallocated space on each device
6493  *
6494  * This will also continue trimming even if a block group or device encounters
6495  * an error.  The return value will be the last error, or 0 if nothing bad
6496  * happens.
6497  */
6498 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6499 {
6500 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6501 	struct btrfs_block_group *cache = NULL;
6502 	struct btrfs_device *device;
6503 	u64 group_trimmed;
6504 	u64 range_end = U64_MAX;
6505 	u64 start;
6506 	u64 end;
6507 	u64 trimmed = 0;
6508 	u64 bg_failed = 0;
6509 	u64 dev_failed = 0;
6510 	int bg_ret = 0;
6511 	int dev_ret = 0;
6512 	int ret = 0;
6513 
6514 	if (range->start == U64_MAX)
6515 		return -EINVAL;
6516 
6517 	/*
6518 	 * Check range overflow if range->len is set.
6519 	 * The default range->len is U64_MAX.
6520 	 */
6521 	if (range->len != U64_MAX &&
6522 	    check_add_overflow(range->start, range->len, &range_end))
6523 		return -EINVAL;
6524 
6525 	cache = btrfs_lookup_first_block_group(fs_info, range->start);
6526 	for (; cache; cache = btrfs_next_block_group(cache)) {
6527 		if (cache->start >= range_end) {
6528 			btrfs_put_block_group(cache);
6529 			break;
6530 		}
6531 
6532 		start = max(range->start, cache->start);
6533 		end = min(range_end, cache->start + cache->length);
6534 
6535 		if (end - start >= range->minlen) {
6536 			if (!btrfs_block_group_done(cache)) {
6537 				ret = btrfs_cache_block_group(cache, true);
6538 				if (ret) {
6539 					bg_failed++;
6540 					bg_ret = ret;
6541 					continue;
6542 				}
6543 			}
6544 			ret = btrfs_trim_block_group(cache,
6545 						     &group_trimmed,
6546 						     start,
6547 						     end,
6548 						     range->minlen);
6549 
6550 			trimmed += group_trimmed;
6551 			if (ret) {
6552 				bg_failed++;
6553 				bg_ret = ret;
6554 				continue;
6555 			}
6556 		}
6557 	}
6558 
6559 	if (bg_failed)
6560 		btrfs_warn(fs_info,
6561 			"failed to trim %llu block group(s), last error %d",
6562 			bg_failed, bg_ret);
6563 
6564 	mutex_lock(&fs_devices->device_list_mutex);
6565 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6566 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6567 			continue;
6568 
6569 		ret = btrfs_trim_free_extents(device, &group_trimmed);
6570 
6571 		trimmed += group_trimmed;
6572 		if (ret) {
6573 			dev_failed++;
6574 			dev_ret = ret;
6575 			break;
6576 		}
6577 	}
6578 	mutex_unlock(&fs_devices->device_list_mutex);
6579 
6580 	if (dev_failed)
6581 		btrfs_warn(fs_info,
6582 			"failed to trim %llu device(s), last error %d",
6583 			dev_failed, dev_ret);
6584 	range->len = trimmed;
6585 	if (bg_ret)
6586 		return bg_ret;
6587 	return dev_ret;
6588 }
6589