xref: /linux/fs/btrfs/relocation.c (revision ae2eb64bfd9762536f60b690840adcdf622cdcce)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include <linux/error-injection.h>
13 #include "ctree.h"
14 #include "disk-io.h"
15 #include "transaction.h"
16 #include "volumes.h"
17 #include "locking.h"
18 #include "btrfs_inode.h"
19 #include "async-thread.h"
20 #include "free-space-cache.h"
21 #include "qgroup.h"
22 #include "print-tree.h"
23 #include "delalloc-space.h"
24 #include "block-group.h"
25 #include "backref.h"
26 #include "misc.h"
27 #include "subpage.h"
28 #include "zoned.h"
29 #include "inode-item.h"
30 #include "space-info.h"
31 #include "fs.h"
32 #include "accessors.h"
33 #include "extent-tree.h"
34 #include "root-tree.h"
35 #include "file-item.h"
36 #include "relocation.h"
37 #include "super.h"
38 #include "tree-checker.h"
39 #include "raid-stripe-tree.h"
40 #include "free-space-tree.h"
41 
42 /*
43  * Relocation overview
44  *
45  * [What does relocation do]
46  *
47  * The objective of relocation is to relocate all extents of the target block
48  * group to other block groups.
49  * This is utilized by resize (shrink only), profile converting, compacting
50  * space, or balance routine to spread chunks over devices.
51  *
52  * 		Before		|		After
53  * ------------------------------------------------------------------
54  *  BG A: 10 data extents	| BG A: deleted
55  *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
56  *  BG C:  1 extents		| BG C:  3 data extents (1 old + 2 relocated)
57  *
58  * [How does relocation work]
59  *
60  * 1.   Mark the target block group read-only
61  *      New extents won't be allocated from the target block group.
62  *
63  * 2.1  Record each extent in the target block group
64  *      To build a proper map of extents to be relocated.
65  *
66  * 2.2  Build data reloc tree and reloc trees
67  *      Data reloc tree will contain an inode, recording all newly relocated
68  *      data extents.
69  *      There will be only one data reloc tree for one data block group.
70  *
71  *      Reloc tree will be a special snapshot of its source tree, containing
72  *      relocated tree blocks.
73  *      Each tree referring to a tree block in target block group will get its
74  *      reloc tree built.
75  *
76  * 2.3  Swap source tree with its corresponding reloc tree
77  *      Each involved tree only refers to new extents after swap.
78  *
79  * 3.   Cleanup reloc trees and data reloc tree.
80  *      As old extents in the target block group are still referenced by reloc
81  *      trees, we need to clean them up before really freeing the target block
82  *      group.
83  *
84  * The main complexity is in steps 2.2 and 2.3.
85  *
86  * The entry point of relocation is relocate_block_group() function.
87  */
88 
89 #define RELOCATION_RESERVED_NODES	256
90 /*
91  * map address of tree root to tree
92  */
93 struct mapping_node {
94 	union {
95 		/* Use rb_simple_node for search/insert */
96 		struct {
97 			struct rb_node rb_node;
98 			u64 bytenr;
99 		};
100 
101 		struct rb_simple_node simple_node;
102 	};
103 	void *data;
104 };
105 
106 struct mapping_tree {
107 	struct rb_root rb_root;
108 	spinlock_t lock;
109 };
110 
111 /*
112  * present a tree block to process
113  */
114 struct tree_block {
115 	union {
116 		/* Use rb_simple_node for search/insert */
117 		struct {
118 			struct rb_node rb_node;
119 			u64 bytenr;
120 		};
121 
122 		struct rb_simple_node simple_node;
123 	};
124 	u64 owner;
125 	struct btrfs_key key;
126 	u8 level;
127 	bool key_ready;
128 };
129 
130 #define MAX_EXTENTS 128
131 
132 struct file_extent_cluster {
133 	u64 start;
134 	u64 end;
135 	u64 boundary[MAX_EXTENTS];
136 	unsigned int nr;
137 	u64 owning_root;
138 };
139 
140 /* Stages of data relocation. */
141 enum reloc_stage {
142 	MOVE_DATA_EXTENTS,
143 	UPDATE_DATA_PTRS
144 };
145 
146 struct reloc_control {
147 	/* block group to relocate */
148 	struct btrfs_block_group *block_group;
149 	/* extent tree */
150 	struct btrfs_root *extent_root;
151 	/* inode for moving data */
152 	struct inode *data_inode;
153 
154 	struct btrfs_block_rsv *block_rsv;
155 
156 	struct btrfs_backref_cache backref_cache;
157 
158 	struct file_extent_cluster cluster;
159 	/* tree blocks have been processed */
160 	struct extent_io_tree processed_blocks;
161 	/* map start of tree root to corresponding reloc tree */
162 	struct mapping_tree reloc_root_tree;
163 	/* list of reloc trees */
164 	struct list_head reloc_roots;
165 	/* list of subvolume trees that get relocated */
166 	struct list_head dirty_subvol_roots;
167 	/* size of metadata reservation for merging reloc trees */
168 	u64 merging_rsv_size;
169 	/* size of relocated tree nodes */
170 	u64 nodes_relocated;
171 	/* reserved size for block group relocation*/
172 	u64 reserved_bytes;
173 
174 	u64 search_start;
175 	u64 extents_found;
176 
177 	enum reloc_stage stage;
178 	bool create_reloc_tree;
179 	bool merge_reloc_tree;
180 	bool found_file_extent;
181 
182 	refcount_t refs;
183 };
184 
185 static struct reloc_control *get_reloc_control(struct btrfs_fs_info *fs_info)
186 {
187 	struct reloc_control *rc;
188 
189 	/* Quick path, avoid lock contention on fs_info->reloc_ctl_lock. */
190 	if (!data_race(fs_info->reloc_ctl))
191 		return NULL;
192 
193 	spin_lock(&fs_info->reloc_ctl_lock);
194 	rc = fs_info->reloc_ctl;
195 	if (rc)
196 		refcount_inc(&rc->refs);
197 	spin_unlock(&fs_info->reloc_ctl_lock);
198 
199 	return rc;
200 }
201 
202 static void __del_reloc_root(struct btrfs_root *root);
203 
204 static noinline_for_stack void free_reloc_roots(struct list_head *list)
205 {
206 	struct btrfs_root *reloc_root, *tmp;
207 
208 	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
209 		__del_reloc_root(reloc_root);
210 }
211 
212 static void put_reloc_control(struct reloc_control *rc)
213 {
214 	if (refcount_dec_and_test(&rc->refs)) {
215 		struct mapping_node *node, *tmp;
216 
217 		if (rc->extent_root)
218 			ASSERT(rc->extent_root->fs_info->reloc_ctl != rc);
219 
220 		free_reloc_roots(&rc->reloc_roots);
221 		rbtree_postorder_for_each_entry_safe(node, tmp,
222 						     &rc->reloc_root_tree.rb_root,
223 						     rb_node)
224 			kfree(node);
225 
226 		if (rc->block_group)
227 			btrfs_put_block_group(rc->block_group);
228 
229 		kfree(rc);
230 	}
231 }
232 
233 /* Helper to delete the 'address of tree root -> reloc tree' mapping. */
234 static void __del_reloc_root(struct btrfs_root *root)
235 {
236 	struct btrfs_fs_info *fs_info = root->fs_info;
237 	struct rb_node *rb_node;
238 	struct mapping_node AUTO_KFREE(node);
239 	struct reloc_control *rc;
240 	bool put_ref = false;
241 
242 	rc = get_reloc_control(fs_info);
243 	if (rc && root->node) {
244 		spin_lock(&rc->reloc_root_tree.lock);
245 		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
246 					   root->commit_root->start);
247 		if (rb_node) {
248 			node = rb_entry(rb_node, struct mapping_node, rb_node);
249 			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
250 			RB_CLEAR_NODE(&node->rb_node);
251 		}
252 		spin_unlock(&rc->reloc_root_tree.lock);
253 		ASSERT(!node || (struct btrfs_root *)node->data == root);
254 	}
255 
256 	/*
257 	 * We only put the reloc root here if it's on the list.  There's a lot
258 	 * of places where the pattern is to splice the rc->reloc_roots, process
259 	 * the reloc roots, and then add the reloc root back onto
260 	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
261 	 * list we don't want the reference being dropped, because the guy
262 	 * messing with the list is in charge of the reference.
263 	 */
264 	spin_lock(&fs_info->trans_lock);
265 	if (!list_empty(&root->root_list)) {
266 		put_ref = true;
267 		list_del_init(&root->root_list);
268 	}
269 	spin_unlock(&fs_info->trans_lock);
270 	if (put_ref)
271 		btrfs_put_root(root);
272 	if (rc)
273 		put_reloc_control(rc);
274 }
275 
276 static void mark_block_processed(struct reloc_control *rc,
277 				 struct btrfs_backref_node *node)
278 {
279 	u32 blocksize;
280 
281 	if (node->level == 0 ||
282 	    in_range(node->bytenr, rc->block_group->start,
283 		     rc->block_group->length)) {
284 		blocksize = rc->extent_root->fs_info->nodesize;
285 		btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
286 				     node->bytenr + blocksize - 1, EXTENT_DIRTY,
287 				     NULL);
288 	}
289 	node->processed = 1;
290 }
291 
292 /*
293  * walk up backref nodes until reach node presents tree root
294  */
295 static struct btrfs_backref_node *walk_up_backref(
296 		struct btrfs_backref_node *node,
297 		struct btrfs_backref_edge *edges[], int *index)
298 {
299 	struct btrfs_backref_edge *edge;
300 	int idx = *index;
301 
302 	while (!list_empty(&node->upper)) {
303 		edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
304 					list[LOWER]);
305 		edges[idx++] = edge;
306 		node = edge->node[UPPER];
307 	}
308 	BUG_ON(node->detached);
309 	*index = idx;
310 	return node;
311 }
312 
313 /*
314  * walk down backref nodes to find start of next reference path
315  */
316 static struct btrfs_backref_node *walk_down_backref(
317 		struct btrfs_backref_edge *edges[], int *index)
318 {
319 	struct btrfs_backref_edge *edge;
320 	struct btrfs_backref_node *lower;
321 	int idx = *index;
322 
323 	while (idx > 0) {
324 		edge = edges[idx - 1];
325 		lower = edge->node[LOWER];
326 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
327 			idx--;
328 			continue;
329 		}
330 		edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
331 					list[LOWER]);
332 		edges[idx - 1] = edge;
333 		*index = idx;
334 		return edge->node[UPPER];
335 	}
336 	*index = 0;
337 	return NULL;
338 }
339 
340 static bool reloc_root_is_dead(const struct btrfs_root *root)
341 {
342 	/*
343 	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
344 	 * btrfs_update_reloc_root. We need to see the updated bit before
345 	 * trying to access reloc_root
346 	 */
347 	smp_rmb();
348 	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
349 		return true;
350 	return false;
351 }
352 
353 /*
354  * Check if this subvolume tree has valid reloc tree.
355  *
356  * Reloc tree after swap is considered dead, thus not considered as valid.
357  * This is enough for most callers, as they don't distinguish dead reloc root
358  * from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
359  * special case.
360  */
361 static bool have_reloc_root(const struct btrfs_root *root)
362 {
363 	if (reloc_root_is_dead(root))
364 		return false;
365 	if (!root->reloc_root)
366 		return false;
367 	return true;
368 }
369 
370 bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
371 {
372 	struct btrfs_root *reloc_root;
373 
374 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
375 		return false;
376 
377 	/* This root has been merged with its reloc tree, we can ignore it */
378 	if (reloc_root_is_dead(root))
379 		return true;
380 
381 	reloc_root = root->reloc_root;
382 	if (!reloc_root)
383 		return false;
384 
385 	if (btrfs_header_generation(reloc_root->commit_root) ==
386 	    root->fs_info->running_transaction->transid)
387 		return false;
388 	/*
389 	 * If there is reloc tree and it was created in previous transaction
390 	 * backref lookup can find the reloc tree, so backref node for the fs
391 	 * tree root is useless for relocation.
392 	 */
393 	return true;
394 }
395 
396 /*
397  * find reloc tree by address of tree root
398  */
399 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
400 {
401 	struct reloc_control *rc = fs_info->reloc_ctl;
402 	struct rb_node *rb_node;
403 	struct mapping_node *node;
404 	struct btrfs_root *root = NULL;
405 
406 	ASSERT(rc);
407 	spin_lock(&rc->reloc_root_tree.lock);
408 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
409 	if (rb_node) {
410 		node = rb_entry(rb_node, struct mapping_node, rb_node);
411 		root = node->data;
412 	}
413 	spin_unlock(&rc->reloc_root_tree.lock);
414 	return btrfs_grab_root(root);
415 }
416 
417 /*
418  * For useless nodes, do two major clean ups:
419  *
420  * - Cleanup the children edges and nodes
421  *   If child node is also orphan (no parent) during cleanup, then the child
422  *   node will also be cleaned up.
423  *
424  * - Freeing up leaves (level 0), keeps nodes detached
425  *   For nodes, the node is still cached as "detached"
426  *
427  * Return false if @node is not in the @useless_nodes list.
428  * Return true if @node is in the @useless_nodes list.
429  */
430 static bool handle_useless_nodes(struct reloc_control *rc,
431 				 struct btrfs_backref_node *node)
432 {
433 	struct btrfs_backref_cache *cache = &rc->backref_cache;
434 	struct list_head *useless_node = &cache->useless_node;
435 	bool ret = false;
436 
437 	while (!list_empty(useless_node)) {
438 		struct btrfs_backref_node *cur;
439 
440 		cur = list_first_entry(useless_node, struct btrfs_backref_node,
441 				 list);
442 		list_del_init(&cur->list);
443 
444 		/* Only tree root nodes can be added to @useless_nodes */
445 		ASSERT(list_empty(&cur->upper));
446 
447 		if (cur == node)
448 			ret = true;
449 
450 		/* Cleanup the lower edges */
451 		while (!list_empty(&cur->lower)) {
452 			struct btrfs_backref_edge *edge;
453 			struct btrfs_backref_node *lower;
454 
455 			edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
456 						list[UPPER]);
457 			list_del(&edge->list[UPPER]);
458 			list_del(&edge->list[LOWER]);
459 			lower = edge->node[LOWER];
460 			btrfs_backref_free_edge(cache, edge);
461 
462 			/* Child node is also orphan, queue for cleanup */
463 			if (list_empty(&lower->upper))
464 				list_add(&lower->list, useless_node);
465 		}
466 		/* Mark this block processed for relocation */
467 		mark_block_processed(rc, cur);
468 
469 		/*
470 		 * Backref nodes for tree leaves are deleted from the cache.
471 		 * Backref nodes for upper level tree blocks are left in the
472 		 * cache to avoid unnecessary backref lookup.
473 		 */
474 		if (cur->level > 0) {
475 			cur->detached = 1;
476 		} else {
477 			rb_erase(&cur->rb_node, &cache->rb_root);
478 			btrfs_backref_free_node(cache, cur);
479 		}
480 	}
481 	return ret;
482 }
483 
484 /*
485  * Build backref tree for a given tree block. Root of the backref tree
486  * corresponds the tree block, leaves of the backref tree correspond roots of
487  * b-trees that reference the tree block.
488  *
489  * The basic idea of this function is check backrefs of a given block to find
490  * upper level blocks that reference the block, and then check backrefs of
491  * these upper level blocks recursively. The recursion stops when tree root is
492  * reached or backrefs for the block is cached.
493  *
494  * NOTE: if we find that backrefs for a block are cached, we know backrefs for
495  * all upper level blocks that directly/indirectly reference the block are also
496  * cached.
497  */
498 static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
499 			struct btrfs_trans_handle *trans,
500 			struct reloc_control *rc, struct btrfs_key *node_key,
501 			int level, u64 bytenr)
502 {
503 	struct btrfs_backref_iter iter;
504 	struct btrfs_backref_cache *cache = &rc->backref_cache;
505 	/* For searching parent of TREE_BLOCK_REF */
506 	struct btrfs_path *path;
507 	struct btrfs_backref_node *cur;
508 	struct btrfs_backref_node *node = NULL;
509 	struct btrfs_backref_edge *edge;
510 	int ret;
511 
512 	ret = btrfs_backref_iter_init(&iter);
513 	if (ret < 0)
514 		return ERR_PTR(ret);
515 	path = btrfs_alloc_path();
516 	if (!path) {
517 		ret = -ENOMEM;
518 		goto out;
519 	}
520 
521 	node = btrfs_backref_alloc_node(cache, bytenr, level);
522 	if (!node) {
523 		ret = -ENOMEM;
524 		goto out;
525 	}
526 
527 	cur = node;
528 
529 	/* Breadth-first search to build backref cache */
530 	do {
531 		ret = btrfs_backref_add_tree_node(trans, cache, path, &iter,
532 						  node_key, cur);
533 		if (ret < 0)
534 			goto out;
535 
536 		edge = list_first_entry_or_null(&cache->pending_edge,
537 				struct btrfs_backref_edge, list[UPPER]);
538 		/*
539 		 * The pending list isn't empty, take the first block to
540 		 * process
541 		 */
542 		if (edge) {
543 			list_del_init(&edge->list[UPPER]);
544 			cur = edge->node[UPPER];
545 		}
546 	} while (edge);
547 
548 	/* Finish the upper linkage of newly added edges/nodes */
549 	ret = btrfs_backref_finish_upper_links(cache, node);
550 	if (ret < 0)
551 		goto out;
552 
553 	if (handle_useless_nodes(rc, node))
554 		node = NULL;
555 out:
556 	btrfs_free_path(iter.path);
557 	btrfs_free_path(path);
558 	if (ret) {
559 		btrfs_backref_error_cleanup(cache, node);
560 		return ERR_PTR(ret);
561 	}
562 	ASSERT(!node || !node->detached);
563 	ASSERT(list_empty(&cache->useless_node) &&
564 	       list_empty(&cache->pending_edge));
565 	return node;
566 }
567 
568 /*
569  * helper to add 'address of tree root -> reloc tree' mapping
570  */
571 static int __add_reloc_root(struct btrfs_root *root, struct reloc_control *rc)
572 {
573 	struct btrfs_fs_info *fs_info = root->fs_info;
574 	struct rb_node *rb_node;
575 	struct mapping_node *node;
576 
577 	node = kmalloc_obj(*node, GFP_NOFS);
578 	if (!node)
579 		return -ENOMEM;
580 
581 	node->bytenr = root->commit_root->start;
582 	node->data = root;
583 
584 	spin_lock(&rc->reloc_root_tree.lock);
585 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
586 	spin_unlock(&rc->reloc_root_tree.lock);
587 	if (rb_node) {
588 		btrfs_err(fs_info,
589 			    "Duplicate root found for start=%llu while inserting into relocation tree",
590 			    node->bytenr);
591 		return -EEXIST;
592 	}
593 
594 	list_add_tail(&root->root_list, &rc->reloc_roots);
595 	return 0;
596 }
597 
598 /*
599  * helper to update the 'address of tree root -> reloc tree'
600  * mapping
601  */
602 static int __update_reloc_root(struct btrfs_root *root)
603 {
604 	struct btrfs_fs_info *fs_info = root->fs_info;
605 	struct rb_node *rb_node;
606 	struct mapping_node *node = NULL;
607 	struct reloc_control *rc = fs_info->reloc_ctl;
608 
609 	spin_lock(&rc->reloc_root_tree.lock);
610 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
611 				   root->commit_root->start);
612 	if (rb_node) {
613 		node = rb_entry(rb_node, struct mapping_node, rb_node);
614 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
615 	}
616 	spin_unlock(&rc->reloc_root_tree.lock);
617 
618 	if (!node)
619 		return 0;
620 	BUG_ON((struct btrfs_root *)node->data != root);
621 
622 	spin_lock(&rc->reloc_root_tree.lock);
623 	node->bytenr = root->node->start;
624 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
625 	spin_unlock(&rc->reloc_root_tree.lock);
626 	if (rb_node)
627 		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
628 	return 0;
629 }
630 
631 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
632 					struct btrfs_root *root, u64 objectid)
633 {
634 	struct btrfs_fs_info *fs_info = root->fs_info;
635 	struct btrfs_root *reloc_root;
636 	struct extent_buffer *eb;
637 	struct btrfs_root_item AUTO_KFREE(root_item);
638 	struct btrfs_key root_key;
639 	int ret = 0;
640 
641 	root_item = kmalloc_obj(*root_item, GFP_NOFS);
642 	if (!root_item)
643 		return ERR_PTR(-ENOMEM);
644 
645 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
646 	root_key.type = BTRFS_ROOT_ITEM_KEY;
647 	root_key.offset = objectid;
648 
649 	if (btrfs_root_id(root) == objectid) {
650 		u64 commit_root_gen;
651 
652 		/*
653 		 * Relocation will wait for cleaner thread, and any half-dropped
654 		 * subvolume will be fully cleaned up at mount time.
655 		 * So here we shouldn't hit a subvolume with non-zero drop_progress.
656 		 *
657 		 * If this isn't the case, error out since it can make us attempt to
658 		 * drop references for extents that were already dropped before.
659 		 */
660 		if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
661 			struct btrfs_key cpu_key;
662 
663 			btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
664 			btrfs_err(fs_info,
665 	"cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT,
666 				  objectid, BTRFS_KEY_FMT_VALUE(&cpu_key));
667 			return ERR_PTR(-EUCLEAN);
668 		}
669 
670 		/* called by btrfs_init_reloc_root */
671 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
672 				      BTRFS_TREE_RELOC_OBJECTID);
673 		if (ret)
674 			return ERR_PTR(ret);
675 
676 		/*
677 		 * Set the last_snapshot field to the generation of the commit
678 		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
679 		 * correctly (returns true) when the relocation root is created
680 		 * either inside the critical section of a transaction commit
681 		 * (through transaction.c:qgroup_account_snapshot()) and when
682 		 * it's created before the transaction commit is started.
683 		 */
684 		commit_root_gen = btrfs_header_generation(root->commit_root);
685 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
686 	} else {
687 		/*
688 		 * called by btrfs_reloc_post_snapshot_hook.
689 		 * the source tree is a reloc tree, all tree blocks
690 		 * modified after it was created have RELOC flag
691 		 * set in their headers. so it's OK to not update
692 		 * the 'last_snapshot'.
693 		 */
694 		ret = btrfs_copy_root(trans, root, root->node, &eb,
695 				      BTRFS_TREE_RELOC_OBJECTID);
696 		if (ret)
697 			return ERR_PTR(ret);
698 	}
699 
700 	/*
701 	 * We have changed references at this point, we must abort the
702 	 * transaction if anything fails (i.e. 'goto abort').
703 	 */
704 
705 	memcpy(root_item, &root->root_item, sizeof(*root_item));
706 	btrfs_set_root_bytenr(root_item, eb->start);
707 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
708 	btrfs_set_root_generation(root_item, trans->transid);
709 
710 	if (btrfs_root_id(root) == objectid) {
711 		btrfs_set_root_refs(root_item, 0);
712 		memset(&root_item->drop_progress, 0,
713 		       sizeof(struct btrfs_disk_key));
714 		btrfs_set_root_drop_level(root_item, 0);
715 	}
716 
717 	btrfs_tree_unlock(eb);
718 	free_extent_buffer(eb);
719 
720 	ret = btrfs_insert_root(trans, fs_info->tree_root,
721 				&root_key, root_item);
722 	if (ret)
723 		goto abort;
724 
725 	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
726 	if (IS_ERR(reloc_root)) {
727 		ret = PTR_ERR(reloc_root);
728 		goto abort;
729 	}
730 	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
731 	btrfs_set_root_last_trans(reloc_root, trans->transid);
732 	return reloc_root;
733 
734 abort:
735 	btrfs_abort_transaction(trans, ret);
736 	return ERR_PTR(ret);
737 }
738 
739 /*
740  * create reloc tree for a given fs tree. reloc tree is just a
741  * snapshot of the fs tree with special root objectid.
742  *
743  * The reloc_root comes out of here with two references, one for
744  * root->reloc_root, and another for being on the rc->reloc_roots list.
745  */
746 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
747 			  struct btrfs_root *root)
748 {
749 	struct btrfs_fs_info *fs_info = root->fs_info;
750 	struct btrfs_root *reloc_root;
751 	struct reloc_control *rc;
752 	struct btrfs_block_rsv *rsv;
753 	bool clear_rsv = false;
754 	int ret = 0;
755 
756 	rc = get_reloc_control(fs_info);
757 	if (!rc)
758 		return 0;
759 
760 	/*
761 	 * The subvolume has reloc tree but the swap is finished, no need to
762 	 * create/update the dead reloc tree
763 	 */
764 	if (reloc_root_is_dead(root))
765 		goto out;
766 
767 	/*
768 	 * This is subtle but important.  We do not do
769 	 * record_root_in_transaction for reloc roots, instead we record their
770 	 * corresponding fs root, and then here we update the last trans for the
771 	 * reloc root.  This means that we have to do this for the entire life
772 	 * of the reloc root, regardless of which stage of the relocation we are
773 	 * in.
774 	 */
775 	if (root->reloc_root) {
776 		btrfs_set_root_last_trans(root->reloc_root, trans->transid);
777 		goto out;
778 	}
779 
780 	/*
781 	 * We are merging reloc roots, we do not need new reloc trees.  Also
782 	 * reloc trees never need their own reloc tree.
783 	 */
784 	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
785 		goto out;
786 
787 	if (!trans->reloc_reserved) {
788 		rsv = trans->block_rsv;
789 		trans->block_rsv = rc->block_rsv;
790 		clear_rsv = true;
791 	}
792 	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
793 	if (clear_rsv)
794 		trans->block_rsv = rsv;
795 	if (IS_ERR(reloc_root)) {
796 		ret = PTR_ERR(reloc_root);
797 		goto out;
798 	}
799 
800 	ret = __add_reloc_root(reloc_root, rc);
801 	ASSERT(ret != -EEXIST);
802 	if (ret) {
803 		/* Pairs with create_reloc_root */
804 		btrfs_put_root(reloc_root);
805 		goto out;
806 	}
807 	root->reloc_root = btrfs_grab_root(reloc_root);
808 out:
809 	put_reloc_control(rc);
810 
811 	return ret;
812 }
813 
814 /*
815  * update root item of reloc tree
816  */
817 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
818 			    struct btrfs_root *root)
819 {
820 	struct btrfs_fs_info *fs_info = root->fs_info;
821 	struct btrfs_root *reloc_root;
822 	struct btrfs_root_item *root_item;
823 	struct reloc_control *rc;
824 	int ret;
825 
826 	if (!have_reloc_root(root))
827 		return 0;
828 
829 	reloc_root = root->reloc_root;
830 	root_item = &reloc_root->root_item;
831 
832 	/*
833 	 * We are probably ok here, but __del_reloc_root() will drop its ref of
834 	 * the root.  We have the ref for root->reloc_root, but just in case
835 	 * hold it while we update the reloc root.
836 	 */
837 	btrfs_grab_root(reloc_root);
838 
839 	rc = get_reloc_control(fs_info);
840 	/* root->reloc_root will stay until current relocation finished */
841 	if (rc && rc->merge_reloc_tree && btrfs_root_refs(root_item) == 0) {
842 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
843 		/*
844 		 * Mark the tree as dead before we change reloc_root so
845 		 * have_reloc_root will not touch it from now on.
846 		 */
847 		smp_wmb();
848 		__del_reloc_root(reloc_root);
849 	}
850 
851 	if (reloc_root->commit_root != reloc_root->node) {
852 		__update_reloc_root(reloc_root);
853 		btrfs_set_root_node(root_item, reloc_root->node);
854 		free_extent_buffer(reloc_root->commit_root);
855 		reloc_root->commit_root = btrfs_root_node(reloc_root);
856 	}
857 
858 	ret = btrfs_update_root(trans, fs_info->tree_root,
859 				&reloc_root->root_key, root_item);
860 	btrfs_put_root(reloc_root);
861 	if (rc)
862 		put_reloc_control(rc);
863 
864 	return ret;
865 }
866 
867 /*
868  * get new location of data
869  */
870 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
871 			    u64 bytenr, u64 num_bytes)
872 {
873 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
874 	struct btrfs_fs_info *fs_info = root->fs_info;
875 	BTRFS_PATH_AUTO_FREE(path);
876 	struct btrfs_file_extent_item *fi;
877 	struct extent_buffer *leaf;
878 	int ret;
879 
880 	path = btrfs_alloc_path();
881 	if (!path)
882 		return -ENOMEM;
883 
884 	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
885 	ret = btrfs_lookup_file_extent(NULL, root, path,
886 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
887 	if (ret < 0)
888 		return ret;
889 	if (ret > 0)
890 		return -ENOENT;
891 
892 	leaf = path->nodes[0];
893 	fi = btrfs_item_ptr(leaf, path->slots[0],
894 			    struct btrfs_file_extent_item);
895 
896 	/*
897 	 * The cluster-boundary key searched above is always written by
898 	 * relocation with offset 0: either by insert_prealloc_file_extent()
899 	 * (memsets the stack item to 0) or by the front portion of a partial
900 	 * writeback (offset=0 by construction). A non-zero value here means
901 	 * the on-disk leaf does not match what relocation wrote, i.e.
902 	 * corruption. The other encoding fields are caught earlier by
903 	 * tree-checker's check_extent_data_item().
904 	 */
905 	if (unlikely(btrfs_file_extent_offset(leaf, fi))) {
906 		btrfs_print_leaf(leaf);
907 		btrfs_err(fs_info,
908 "unexpected non-zero offset in file extent item for data reloc inode %llu key offset %llu offset %llu",
909 			  btrfs_ino(BTRFS_I(reloc_inode)), bytenr,
910 			  btrfs_file_extent_offset(leaf, fi));
911 		return -EUCLEAN;
912 	}
913 
914 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
915 		return -EINVAL;
916 
917 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
918 	return 0;
919 }
920 
921 /*
922  * update file extent items in the tree leaf to point to
923  * the new locations.
924  */
925 static noinline_for_stack
926 int replace_file_extents(struct btrfs_trans_handle *trans,
927 			 struct reloc_control *rc,
928 			 struct btrfs_root *root,
929 			 struct extent_buffer *leaf)
930 {
931 	struct btrfs_fs_info *fs_info = root->fs_info;
932 	struct btrfs_key key;
933 	struct btrfs_file_extent_item *fi;
934 	struct btrfs_inode *inode = NULL;
935 	u64 parent;
936 	u64 bytenr;
937 	u64 new_bytenr = 0;
938 	u64 num_bytes;
939 	u64 end;
940 	u32 nritems;
941 	u32 i;
942 	int ret = 0;
943 	bool first = true;
944 
945 	if (rc->stage != UPDATE_DATA_PTRS)
946 		return 0;
947 
948 	/* reloc trees always use full backref */
949 	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
950 		parent = leaf->start;
951 	else
952 		parent = 0;
953 
954 	nritems = btrfs_header_nritems(leaf);
955 	for (i = 0; i < nritems; i++) {
956 		struct btrfs_ref ref = { 0 };
957 
958 		cond_resched();
959 		btrfs_item_key_to_cpu(leaf, &key, i);
960 		if (key.type != BTRFS_EXTENT_DATA_KEY)
961 			continue;
962 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
963 		if (btrfs_file_extent_type(leaf, fi) ==
964 		    BTRFS_FILE_EXTENT_INLINE)
965 			continue;
966 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
967 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
968 		if (bytenr == 0)
969 			continue;
970 		if (!in_range(bytenr, rc->block_group->start,
971 			      rc->block_group->length))
972 			continue;
973 
974 		/*
975 		 * if we are modifying block in fs tree, wait for read_folio
976 		 * to complete and drop the extent cache
977 		 */
978 		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
979 			if (first) {
980 				inode = btrfs_find_first_inode(root, key.objectid);
981 				first = false;
982 			} else if (inode && btrfs_ino(inode) < key.objectid) {
983 				btrfs_add_delayed_iput(inode);
984 				inode = btrfs_find_first_inode(root, key.objectid);
985 			}
986 			if (inode && btrfs_ino(inode) == key.objectid) {
987 				struct extent_state *cached_state = NULL;
988 
989 				end = key.offset +
990 				      btrfs_file_extent_num_bytes(leaf, fi);
991 				WARN_ON(!IS_ALIGNED(key.offset,
992 						    fs_info->sectorsize));
993 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
994 				end--;
995 				/* Take mmap lock to serialize with reflinks. */
996 				if (!down_read_trylock(&inode->i_mmap_lock))
997 					continue;
998 				ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
999 							    end, &cached_state);
1000 				if (!ret) {
1001 					up_read(&inode->i_mmap_lock);
1002 					continue;
1003 				}
1004 
1005 				btrfs_drop_extent_map_range(inode, key.offset, end, true);
1006 				btrfs_unlock_extent(&inode->io_tree, key.offset, end,
1007 						    &cached_state);
1008 				up_read(&inode->i_mmap_lock);
1009 			}
1010 		}
1011 
1012 		ret = get_new_location(rc->data_inode, &new_bytenr,
1013 				       bytenr, num_bytes);
1014 		if (ret) {
1015 			/*
1016 			 * Don't have to abort since we've not changed anything
1017 			 * in the file extent yet.
1018 			 */
1019 			break;
1020 		}
1021 
1022 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1023 
1024 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1025 		ref.action = BTRFS_ADD_DELAYED_REF;
1026 		ref.bytenr = new_bytenr;
1027 		ref.num_bytes = num_bytes;
1028 		ref.parent = parent;
1029 		ref.owning_root = btrfs_root_id(root);
1030 		ref.ref_root = btrfs_header_owner(leaf);
1031 		btrfs_init_data_ref(&ref, key.objectid, key.offset,
1032 				    btrfs_root_id(root), false);
1033 		ret = btrfs_inc_extent_ref(trans, &ref);
1034 		if (unlikely(ret)) {
1035 			btrfs_abort_transaction(trans, ret);
1036 			break;
1037 		}
1038 
1039 		ref.action = BTRFS_DROP_DELAYED_REF;
1040 		ref.bytenr = bytenr;
1041 		ref.num_bytes = num_bytes;
1042 		ref.parent = parent;
1043 		ref.owning_root = btrfs_root_id(root);
1044 		ref.ref_root = btrfs_header_owner(leaf);
1045 		btrfs_init_data_ref(&ref, key.objectid, key.offset,
1046 				    btrfs_root_id(root), false);
1047 		ret = btrfs_free_extent(trans, &ref);
1048 		if (unlikely(ret)) {
1049 			btrfs_abort_transaction(trans, ret);
1050 			break;
1051 		}
1052 	}
1053 	if (inode)
1054 		btrfs_add_delayed_iput(inode);
1055 	return ret;
1056 }
1057 
1058 static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
1059 					       int slot, const struct btrfs_path *path,
1060 					       int level)
1061 {
1062 	struct btrfs_disk_key key1;
1063 	struct btrfs_disk_key key2;
1064 	btrfs_node_key(eb, &key1, slot);
1065 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1066 	return memcmp(&key1, &key2, sizeof(key1));
1067 }
1068 
1069 /*
1070  * try to replace tree blocks in fs tree with the new blocks
1071  * in reloc tree. tree blocks haven't been modified since the
1072  * reloc tree was create can be replaced.
1073  *
1074  * if a block was replaced, level of the block + 1 is returned.
1075  * if no block got replaced, 0 is returned. if there are other
1076  * errors, a negative error number is returned.
1077  */
1078 static noinline_for_stack
1079 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1080 		 struct btrfs_root *dest, struct btrfs_root *src,
1081 		 struct btrfs_path *path, struct btrfs_key *next_key,
1082 		 int lowest_level, int max_level)
1083 {
1084 	struct btrfs_fs_info *fs_info = dest->fs_info;
1085 	struct extent_buffer *eb;
1086 	struct extent_buffer *parent;
1087 	struct btrfs_ref ref = { 0 };
1088 	struct btrfs_key key;
1089 	u64 old_bytenr;
1090 	u64 new_bytenr;
1091 	u64 old_ptr_gen;
1092 	u64 new_ptr_gen;
1093 	u64 last_snapshot;
1094 	u32 blocksize;
1095 	bool cow = false;
1096 	int level;
1097 	int ret;
1098 	int slot;
1099 
1100 	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
1101 	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);
1102 
1103 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1104 again:
1105 	slot = path->slots[lowest_level];
1106 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1107 
1108 	eb = btrfs_lock_root_node(dest);
1109 	level = btrfs_header_level(eb);
1110 
1111 	if (level < lowest_level) {
1112 		btrfs_tree_unlock(eb);
1113 		free_extent_buffer(eb);
1114 		return 0;
1115 	}
1116 
1117 	if (cow) {
1118 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
1119 				      BTRFS_NESTING_COW);
1120 		if (ret) {
1121 			btrfs_tree_unlock(eb);
1122 			free_extent_buffer(eb);
1123 			return ret;
1124 		}
1125 	}
1126 
1127 	if (next_key) {
1128 		next_key->objectid = (u64)-1;
1129 		next_key->type = (u8)-1;
1130 		next_key->offset = (u64)-1;
1131 	}
1132 
1133 	parent = eb;
1134 	while (1) {
1135 		level = btrfs_header_level(parent);
1136 		ASSERT(level >= lowest_level);
1137 
1138 		ret = btrfs_bin_search(parent, 0, &key, &slot);
1139 		if (ret < 0)
1140 			break;
1141 		if (ret && slot > 0)
1142 			slot--;
1143 
1144 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1145 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1146 
1147 		old_bytenr = btrfs_node_blockptr(parent, slot);
1148 		blocksize = fs_info->nodesize;
1149 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1150 
1151 		if (level <= max_level) {
1152 			eb = path->nodes[level];
1153 			new_bytenr = btrfs_node_blockptr(eb,
1154 							path->slots[level]);
1155 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1156 							path->slots[level]);
1157 		} else {
1158 			new_bytenr = 0;
1159 			new_ptr_gen = 0;
1160 		}
1161 
1162 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1163 			ret = level;
1164 			break;
1165 		}
1166 
1167 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1168 		    memcmp_node_keys(parent, slot, path, level)) {
1169 			if (level <= lowest_level) {
1170 				ret = 0;
1171 				break;
1172 			}
1173 
1174 			eb = btrfs_read_node_slot(parent, slot);
1175 			if (IS_ERR(eb)) {
1176 				ret = PTR_ERR(eb);
1177 				break;
1178 			}
1179 			btrfs_tree_lock(eb);
1180 			if (cow) {
1181 				ret = btrfs_cow_block(trans, dest, eb, parent,
1182 						      slot, &eb,
1183 						      BTRFS_NESTING_COW);
1184 				if (ret) {
1185 					btrfs_tree_unlock(eb);
1186 					free_extent_buffer(eb);
1187 					break;
1188 				}
1189 			}
1190 
1191 			btrfs_tree_unlock(parent);
1192 			free_extent_buffer(parent);
1193 
1194 			parent = eb;
1195 			continue;
1196 		}
1197 
1198 		if (!cow) {
1199 			btrfs_tree_unlock(parent);
1200 			free_extent_buffer(parent);
1201 			cow = true;
1202 			goto again;
1203 		}
1204 
1205 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1206 				      path->slots[level]);
1207 		btrfs_release_path(path);
1208 
1209 		path->lowest_level = level;
1210 		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1211 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1212 		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1213 		path->lowest_level = 0;
1214 		if (ret) {
1215 			if (ret > 0)
1216 				ret = -ENOENT;
1217 			break;
1218 		}
1219 
1220 		/*
1221 		 * Info qgroup to trace both subtrees.
1222 		 *
1223 		 * We must trace both trees.
1224 		 * 1) Tree reloc subtree
1225 		 *    If not traced, we will leak data numbers
1226 		 * 2) Fs subtree
1227 		 *    If not traced, we will double count old data
1228 		 *
1229 		 * We don't scan the subtree right now, but only record
1230 		 * the swapped tree blocks.
1231 		 * The real subtree rescan is delayed until we have new
1232 		 * CoW on the subtree root node before transaction commit.
1233 		 */
1234 		ret = btrfs_qgroup_add_swapped_blocks(dest,
1235 				rc->block_group, parent, slot,
1236 				path->nodes[level], path->slots[level],
1237 				last_snapshot);
1238 		if (ret < 0)
1239 			break;
1240 		/*
1241 		 * swap blocks in fs tree and reloc tree.
1242 		 */
1243 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1244 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1245 
1246 		btrfs_set_node_blockptr(path->nodes[level],
1247 					path->slots[level], old_bytenr);
1248 		btrfs_set_node_ptr_generation(path->nodes[level],
1249 					      path->slots[level], old_ptr_gen);
1250 
1251 		ref.action = BTRFS_ADD_DELAYED_REF;
1252 		ref.bytenr = old_bytenr;
1253 		ref.num_bytes = blocksize;
1254 		ref.parent = path->nodes[level]->start;
1255 		ref.owning_root = btrfs_root_id(src);
1256 		ref.ref_root = btrfs_root_id(src);
1257 		btrfs_init_tree_ref(&ref, level - 1, 0, true);
1258 		ret = btrfs_inc_extent_ref(trans, &ref);
1259 		if (unlikely(ret)) {
1260 			btrfs_abort_transaction(trans, ret);
1261 			break;
1262 		}
1263 
1264 		ref.action = BTRFS_ADD_DELAYED_REF;
1265 		ref.bytenr = new_bytenr;
1266 		ref.num_bytes = blocksize;
1267 		ref.parent = 0;
1268 		ref.owning_root = btrfs_root_id(dest);
1269 		ref.ref_root = btrfs_root_id(dest);
1270 		btrfs_init_tree_ref(&ref, level - 1, 0, true);
1271 		ret = btrfs_inc_extent_ref(trans, &ref);
1272 		if (unlikely(ret)) {
1273 			btrfs_abort_transaction(trans, ret);
1274 			break;
1275 		}
1276 
1277 		/* We don't know the real owning_root, use 0. */
1278 		ref.action = BTRFS_DROP_DELAYED_REF;
1279 		ref.bytenr = new_bytenr;
1280 		ref.num_bytes = blocksize;
1281 		ref.parent = path->nodes[level]->start;
1282 		ref.owning_root = 0;
1283 		ref.ref_root = btrfs_root_id(src);
1284 		btrfs_init_tree_ref(&ref, level - 1, 0, true);
1285 		ret = btrfs_free_extent(trans, &ref);
1286 		if (unlikely(ret)) {
1287 			btrfs_abort_transaction(trans, ret);
1288 			break;
1289 		}
1290 
1291 		/* We don't know the real owning_root, use 0. */
1292 		ref.action = BTRFS_DROP_DELAYED_REF;
1293 		ref.bytenr = old_bytenr;
1294 		ref.num_bytes = blocksize;
1295 		ref.parent = 0;
1296 		ref.owning_root = 0;
1297 		ref.ref_root = btrfs_root_id(dest);
1298 		btrfs_init_tree_ref(&ref, level - 1, 0, true);
1299 		ret = btrfs_free_extent(trans, &ref);
1300 		if (unlikely(ret)) {
1301 			btrfs_abort_transaction(trans, ret);
1302 			break;
1303 		}
1304 
1305 		btrfs_unlock_up_safe(path, 0);
1306 
1307 		ret = level;
1308 		break;
1309 	}
1310 	btrfs_tree_unlock(parent);
1311 	free_extent_buffer(parent);
1312 	return ret;
1313 }
1314 
1315 /*
1316  * helper to find next relocated block in reloc tree
1317  */
1318 static noinline_for_stack
1319 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1320 		       int *level)
1321 {
1322 	struct extent_buffer *eb;
1323 	int i;
1324 	u64 last_snapshot;
1325 	u32 nritems;
1326 
1327 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1328 
1329 	for (i = 0; i < *level; i++) {
1330 		free_extent_buffer(path->nodes[i]);
1331 		path->nodes[i] = NULL;
1332 	}
1333 
1334 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1335 		eb = path->nodes[i];
1336 		nritems = btrfs_header_nritems(eb);
1337 		while (path->slots[i] + 1 < nritems) {
1338 			path->slots[i]++;
1339 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1340 			    last_snapshot)
1341 				continue;
1342 
1343 			*level = i;
1344 			return 0;
1345 		}
1346 		free_extent_buffer(path->nodes[i]);
1347 		path->nodes[i] = NULL;
1348 	}
1349 	return 1;
1350 }
1351 
1352 /*
1353  * walk down reloc tree to find relocated block of lowest level
1354  */
1355 static noinline_for_stack
1356 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1357 			 int *level)
1358 {
1359 	struct extent_buffer *eb = NULL;
1360 	int i;
1361 	u64 ptr_gen = 0;
1362 	u64 last_snapshot;
1363 	u32 nritems;
1364 
1365 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1366 
1367 	for (i = *level; i > 0; i--) {
1368 		eb = path->nodes[i];
1369 		nritems = btrfs_header_nritems(eb);
1370 		while (path->slots[i] < nritems) {
1371 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1372 			if (ptr_gen > last_snapshot)
1373 				break;
1374 			path->slots[i]++;
1375 		}
1376 		if (path->slots[i] >= nritems) {
1377 			if (i == *level)
1378 				break;
1379 			*level = i + 1;
1380 			return 0;
1381 		}
1382 		if (i == 1) {
1383 			*level = i;
1384 			return 0;
1385 		}
1386 
1387 		eb = btrfs_read_node_slot(eb, path->slots[i]);
1388 		if (IS_ERR(eb))
1389 			return PTR_ERR(eb);
1390 		BUG_ON(btrfs_header_level(eb) != i - 1);
1391 		path->nodes[i - 1] = eb;
1392 		path->slots[i - 1] = 0;
1393 	}
1394 	return 1;
1395 }
1396 
1397 /*
1398  * invalidate extent cache for file extents whose key in range of
1399  * [min_key, max_key)
1400  */
1401 static int invalidate_extent_cache(struct btrfs_root *root,
1402 				   const struct btrfs_key *min_key,
1403 				   const struct btrfs_key *max_key)
1404 {
1405 	struct btrfs_fs_info *fs_info = root->fs_info;
1406 	struct btrfs_inode *inode = NULL;
1407 	u64 objectid;
1408 	u64 start, end;
1409 	u64 ino;
1410 
1411 	objectid = min_key->objectid;
1412 	while (1) {
1413 		struct extent_state *cached_state = NULL;
1414 
1415 		cond_resched();
1416 		if (inode)
1417 			iput(&inode->vfs_inode);
1418 
1419 		if (objectid > max_key->objectid)
1420 			break;
1421 
1422 		inode = btrfs_find_first_inode(root, objectid);
1423 		if (!inode)
1424 			break;
1425 		ino = btrfs_ino(inode);
1426 
1427 		if (ino > max_key->objectid) {
1428 			iput(&inode->vfs_inode);
1429 			break;
1430 		}
1431 
1432 		objectid = ino + 1;
1433 		if (!S_ISREG(inode->vfs_inode.i_mode))
1434 			continue;
1435 
1436 		if (unlikely(min_key->objectid == ino)) {
1437 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1438 				continue;
1439 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1440 				start = 0;
1441 			else {
1442 				start = min_key->offset;
1443 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
1444 			}
1445 		} else {
1446 			start = 0;
1447 		}
1448 
1449 		if (unlikely(max_key->objectid == ino)) {
1450 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1451 				continue;
1452 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1453 				end = (u64)-1;
1454 			} else {
1455 				if (max_key->offset == 0)
1456 					continue;
1457 				end = max_key->offset;
1458 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1459 				end--;
1460 			}
1461 		} else {
1462 			end = (u64)-1;
1463 		}
1464 
1465 		/* the lock_extent waits for read_folio to complete */
1466 		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
1467 		btrfs_drop_extent_map_range(inode, start, end, true);
1468 		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
1469 	}
1470 	return 0;
1471 }
1472 
1473 static int find_next_key(struct btrfs_path *path, int level,
1474 			 struct btrfs_key *key)
1475 
1476 {
1477 	while (level < BTRFS_MAX_LEVEL) {
1478 		if (!path->nodes[level])
1479 			break;
1480 		if (path->slots[level] + 1 <
1481 		    btrfs_header_nritems(path->nodes[level])) {
1482 			btrfs_node_key_to_cpu(path->nodes[level], key,
1483 					      path->slots[level] + 1);
1484 			return 0;
1485 		}
1486 		level++;
1487 	}
1488 	return 1;
1489 }
1490 
1491 /*
1492  * Insert current subvolume into reloc_control::dirty_subvol_roots
1493  */
1494 static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
1495 			       struct reloc_control *rc,
1496 			       struct btrfs_root *root)
1497 {
1498 	struct btrfs_root *reloc_root = root->reloc_root;
1499 	struct btrfs_root_item *reloc_root_item;
1500 	int ret;
1501 
1502 	/* @root must be a subvolume tree root with a valid reloc tree */
1503 	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
1504 	ASSERT(reloc_root);
1505 
1506 	reloc_root_item = &reloc_root->root_item;
1507 	memset(&reloc_root_item->drop_progress, 0,
1508 		sizeof(reloc_root_item->drop_progress));
1509 	btrfs_set_root_drop_level(reloc_root_item, 0);
1510 	btrfs_set_root_refs(reloc_root_item, 0);
1511 	ret = btrfs_update_reloc_root(trans, root);
1512 	if (ret)
1513 		return ret;
1514 
1515 	if (list_empty(&root->reloc_dirty_list)) {
1516 		btrfs_grab_root(root);
1517 		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
1518 	}
1519 
1520 	return 0;
1521 }
1522 
1523 static int clean_dirty_subvols(struct reloc_control *rc)
1524 {
1525 	struct btrfs_root *root;
1526 	struct btrfs_root *next;
1527 	int ret = 0;
1528 	int ret2;
1529 
1530 	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1531 				 reloc_dirty_list) {
1532 		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
1533 			/* Merged subvolume, cleanup its reloc root */
1534 			struct btrfs_root *reloc_root = root->reloc_root;
1535 
1536 			list_del_init(&root->reloc_dirty_list);
1537 			root->reloc_root = NULL;
1538 			/*
1539 			 * Need barrier to ensure clear_bit() only happens after
1540 			 * root->reloc_root = NULL. Pairs with have_reloc_root.
1541 			 */
1542 			smp_wmb();
1543 			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1544 			if (reloc_root) {
1545 				/*
1546 				 * btrfs_drop_snapshot drops our ref we hold for
1547 				 * ->reloc_root.  If it fails however we must
1548 				 * drop the ref ourselves.
1549 				 */
1550 				ret2 = btrfs_drop_snapshot(reloc_root, false, true);
1551 				if (ret2 < 0) {
1552 					btrfs_put_root(reloc_root);
1553 					if (!ret)
1554 						ret = ret2;
1555 				}
1556 			}
1557 			btrfs_put_root(root);
1558 		} else {
1559 			/* Orphan reloc tree, just clean it up */
1560 			ret2 = btrfs_drop_snapshot(root, false, true);
1561 			if (ret2 < 0) {
1562 				btrfs_put_root(root);
1563 				if (!ret)
1564 					ret = ret2;
1565 			}
1566 		}
1567 	}
1568 	return ret;
1569 }
1570 
1571 /*
1572  * merge the relocated tree blocks in reloc tree with corresponding
1573  * fs tree.
1574  */
1575 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1576 					       struct btrfs_root *root)
1577 {
1578 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1579 	struct btrfs_key key;
1580 	struct btrfs_key next_key;
1581 	struct btrfs_trans_handle *trans = NULL;
1582 	struct btrfs_root *reloc_root;
1583 	struct btrfs_root_item *root_item;
1584 	struct btrfs_path *path;
1585 	struct extent_buffer *leaf;
1586 	int reserve_level;
1587 	int level;
1588 	int max_level;
1589 	bool replaced = false;
1590 	int ret = 0;
1591 	u32 min_reserved;
1592 
1593 	path = btrfs_alloc_path();
1594 	if (!path)
1595 		return -ENOMEM;
1596 	path->reada = READA_FORWARD;
1597 
1598 	reloc_root = root->reloc_root;
1599 	root_item = &reloc_root->root_item;
1600 
1601 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1602 		level = btrfs_root_level(root_item);
1603 		refcount_inc(&reloc_root->node->refs);
1604 		path->nodes[level] = reloc_root->node;
1605 		path->slots[level] = 0;
1606 	} else {
1607 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1608 
1609 		level = btrfs_root_drop_level(root_item);
1610 		BUG_ON(level == 0);
1611 		path->lowest_level = level;
1612 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1613 		path->lowest_level = 0;
1614 		if (ret < 0) {
1615 			btrfs_free_path(path);
1616 			return ret;
1617 		}
1618 
1619 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1620 				      path->slots[level]);
1621 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1622 
1623 		btrfs_unlock_up_safe(path, 0);
1624 	}
1625 
1626 	/*
1627 	 * In merge_reloc_root(), we modify the upper level pointer to swap the
1628 	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
1629 	 * block COW, we COW at most from level 1 to root level for each tree.
1630 	 *
1631 	 * Thus the needed metadata size is at most root_level * nodesize,
1632 	 * and * 2 since we have two trees to COW.
1633 	 */
1634 	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1635 	min_reserved = (reserve_level << fs_info->nodesize_bits) * 2;
1636 	memset(&next_key, 0, sizeof(next_key));
1637 
1638 	while (1) {
1639 		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
1640 					     min_reserved,
1641 					     BTRFS_RESERVE_FLUSH_LIMIT);
1642 		if (ret)
1643 			goto out;
1644 		trans = btrfs_start_transaction(root, 0);
1645 		if (IS_ERR(trans)) {
1646 			ret = PTR_ERR(trans);
1647 			trans = NULL;
1648 			goto out;
1649 		}
1650 
1651 		/*
1652 		 * At this point we no longer have a reloc_control, so we can't
1653 		 * depend on btrfs_init_reloc_root to update our last_trans.
1654 		 *
1655 		 * But that's ok, we started the trans handle on our
1656 		 * corresponding fs_root, which means it's been added to the
1657 		 * dirty list.  At commit time we'll still call
1658 		 * btrfs_update_reloc_root() and update our root item
1659 		 * appropriately.
1660 		 */
1661 		btrfs_set_root_last_trans(reloc_root, trans->transid);
1662 		trans->block_rsv = rc->block_rsv;
1663 
1664 		replaced = false;
1665 		max_level = level;
1666 
1667 		ret = walk_down_reloc_tree(reloc_root, path, &level);
1668 		if (ret < 0)
1669 			goto out;
1670 		if (ret > 0)
1671 			break;
1672 
1673 		if (!find_next_key(path, level, &key) &&
1674 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1675 			ret = 0;
1676 		} else {
1677 			ret = replace_path(trans, rc, root, reloc_root, path,
1678 					   &next_key, level, max_level);
1679 		}
1680 		if (ret < 0)
1681 			goto out;
1682 		if (ret > 0) {
1683 			level = ret;
1684 			btrfs_node_key_to_cpu(path->nodes[level], &key,
1685 					      path->slots[level]);
1686 			replaced = true;
1687 		}
1688 
1689 		ret = walk_up_reloc_tree(reloc_root, path, &level);
1690 		if (ret > 0)
1691 			break;
1692 
1693 		BUG_ON(level == 0);
1694 		/*
1695 		 * save the merging progress in the drop_progress.
1696 		 * this is OK since root refs == 1 in this case.
1697 		 */
1698 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1699 			       path->slots[level]);
1700 		btrfs_set_root_drop_level(root_item, level);
1701 
1702 		btrfs_end_transaction_throttle(trans);
1703 		trans = NULL;
1704 
1705 		btrfs_btree_balance_dirty(fs_info);
1706 
1707 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1708 			invalidate_extent_cache(root, &key, &next_key);
1709 	}
1710 
1711 	/*
1712 	 * handle the case only one block in the fs tree need to be
1713 	 * relocated and the block is tree root.
1714 	 */
1715 	leaf = btrfs_lock_root_node(root);
1716 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1717 			      BTRFS_NESTING_COW);
1718 	btrfs_tree_unlock(leaf);
1719 	free_extent_buffer(leaf);
1720 out:
1721 	btrfs_free_path(path);
1722 
1723 	if (ret == 0) {
1724 		ret = insert_dirty_subvol(trans, rc, root);
1725 		if (ret)
1726 			btrfs_abort_transaction(trans, ret);
1727 	}
1728 
1729 	if (trans)
1730 		btrfs_end_transaction_throttle(trans);
1731 
1732 	btrfs_btree_balance_dirty(fs_info);
1733 
1734 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1735 		invalidate_extent_cache(root, &key, &next_key);
1736 
1737 	return ret;
1738 }
1739 
1740 static noinline_for_stack
1741 int prepare_to_merge(struct reloc_control *rc, int err)
1742 {
1743 	struct btrfs_root *root = rc->extent_root;
1744 	struct btrfs_fs_info *fs_info = root->fs_info;
1745 	struct btrfs_root *reloc_root;
1746 	struct btrfs_trans_handle *trans;
1747 	LIST_HEAD(reloc_roots);
1748 	u64 num_bytes = 0;
1749 	int ret;
1750 
1751 	mutex_lock(&fs_info->reloc_mutex);
1752 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1753 	rc->merging_rsv_size += rc->nodes_relocated * 2;
1754 	mutex_unlock(&fs_info->reloc_mutex);
1755 
1756 again:
1757 	if (!err) {
1758 		num_bytes = rc->merging_rsv_size;
1759 		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
1760 					  BTRFS_RESERVE_FLUSH_ALL);
1761 		if (ret)
1762 			err = ret;
1763 	}
1764 
1765 	trans = btrfs_join_transaction(rc->extent_root);
1766 	if (IS_ERR(trans)) {
1767 		if (!err)
1768 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1769 						num_bytes, NULL);
1770 		return PTR_ERR(trans);
1771 	}
1772 
1773 	if (!err) {
1774 		if (num_bytes != rc->merging_rsv_size) {
1775 			btrfs_end_transaction(trans);
1776 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1777 						num_bytes, NULL);
1778 			goto again;
1779 		}
1780 	}
1781 
1782 	rc->merge_reloc_tree = true;
1783 
1784 	while (!list_empty(&rc->reloc_roots)) {
1785 		reloc_root = list_first_entry(&rc->reloc_roots,
1786 					      struct btrfs_root, root_list);
1787 		list_del_init(&reloc_root->root_list);
1788 
1789 		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1790 				false);
1791 		if (IS_ERR(root)) {
1792 			/*
1793 			 * Even if we have an error we need this reloc root
1794 			 * back on our list so we can clean up properly.
1795 			 */
1796 			list_add(&reloc_root->root_list, &reloc_roots);
1797 			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
1798 			if (!err)
1799 				err = PTR_ERR(root);
1800 			break;
1801 		}
1802 
1803 		if (unlikely(root->reloc_root != reloc_root)) {
1804 			if (root->reloc_root) {
1805 				btrfs_err(fs_info,
1806 "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
1807 					  btrfs_root_id(root),
1808 					  btrfs_root_id(root->reloc_root),
1809 					  root->reloc_root->root_key.type,
1810 					  root->reloc_root->root_key.offset,
1811 					  btrfs_root_generation(
1812 						  &root->reloc_root->root_item),
1813 					  btrfs_root_id(reloc_root),
1814 					  reloc_root->root_key.type,
1815 					  reloc_root->root_key.offset,
1816 					  btrfs_root_generation(
1817 						  &reloc_root->root_item));
1818 			} else {
1819 				btrfs_err(fs_info,
1820 "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
1821 					  btrfs_root_id(root),
1822 					  btrfs_root_id(reloc_root),
1823 					  reloc_root->root_key.type,
1824 					  reloc_root->root_key.offset,
1825 					  btrfs_root_generation(
1826 						  &reloc_root->root_item));
1827 			}
1828 			list_add(&reloc_root->root_list, &reloc_roots);
1829 			btrfs_put_root(root);
1830 			btrfs_abort_transaction(trans, -EUCLEAN);
1831 			if (!err)
1832 				err = -EUCLEAN;
1833 			break;
1834 		}
1835 
1836 		/*
1837 		 * set reference count to 1, so btrfs_recover_relocation
1838 		 * knows it should resumes merging
1839 		 */
1840 		if (!err)
1841 			btrfs_set_root_refs(&reloc_root->root_item, 1);
1842 		ret = btrfs_update_reloc_root(trans, root);
1843 
1844 		/*
1845 		 * Even if we have an error we need this reloc root back on our
1846 		 * list so we can clean up properly.
1847 		 */
1848 		list_add(&reloc_root->root_list, &reloc_roots);
1849 		btrfs_put_root(root);
1850 
1851 		if (unlikely(ret)) {
1852 			btrfs_abort_transaction(trans, ret);
1853 			if (!err)
1854 				err = ret;
1855 			break;
1856 		}
1857 	}
1858 
1859 	list_splice(&reloc_roots, &rc->reloc_roots);
1860 
1861 	if (!err)
1862 		err = btrfs_commit_transaction(trans);
1863 	else
1864 		btrfs_end_transaction(trans);
1865 	return err;
1866 }
1867 
1868 static noinline_for_stack
1869 void merge_reloc_roots(struct reloc_control *rc)
1870 {
1871 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1872 	struct btrfs_root *root;
1873 	struct btrfs_root *reloc_root;
1874 	LIST_HEAD(reloc_roots);
1875 	bool found = false;
1876 	int ret = 0;
1877 again:
1878 	root = rc->extent_root;
1879 
1880 	/*
1881 	 * this serializes us with btrfs_record_root_in_transaction,
1882 	 * we have to make sure nobody is in the middle of
1883 	 * adding their roots to the list while we are
1884 	 * doing this splice
1885 	 */
1886 	mutex_lock(&fs_info->reloc_mutex);
1887 	list_splice_init(&rc->reloc_roots, &reloc_roots);
1888 	mutex_unlock(&fs_info->reloc_mutex);
1889 
1890 	while (!list_empty(&reloc_roots)) {
1891 		found = true;
1892 		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
1893 
1894 		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1895 					 false);
1896 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1897 			if (WARN_ON(IS_ERR(root))) {
1898 				/*
1899 				 * For recovery we read the fs roots on mount,
1900 				 * and if we didn't find the root then we marked
1901 				 * the reloc root as a garbage root.  For normal
1902 				 * relocation obviously the root should exist in
1903 				 * memory.  However there's no reason we can't
1904 				 * handle the error properly here just in case.
1905 				 */
1906 				ret = PTR_ERR(root);
1907 				goto out;
1908 			}
1909 			if (WARN_ON(root->reloc_root != reloc_root)) {
1910 				/*
1911 				 * This can happen if on-disk metadata has some
1912 				 * corruption, e.g. bad reloc tree key offset.
1913 				 */
1914 				ret = -EINVAL;
1915 				goto out;
1916 			}
1917 			ret = merge_reloc_root(rc, root);
1918 			btrfs_put_root(root);
1919 			if (ret) {
1920 				if (list_empty(&reloc_root->root_list))
1921 					list_add_tail(&reloc_root->root_list,
1922 						      &reloc_roots);
1923 				goto out;
1924 			}
1925 		} else {
1926 			if (!IS_ERR(root)) {
1927 				if (root->reloc_root == reloc_root) {
1928 					root->reloc_root = NULL;
1929 					btrfs_put_root(reloc_root);
1930 				}
1931 				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
1932 					  &root->state);
1933 				btrfs_put_root(root);
1934 			}
1935 
1936 			list_del_init(&reloc_root->root_list);
1937 			/* Don't forget to queue this reloc root for cleanup */
1938 			list_add_tail(&reloc_root->reloc_dirty_list,
1939 				      &rc->dirty_subvol_roots);
1940 		}
1941 	}
1942 
1943 	if (found) {
1944 		found = false;
1945 		goto again;
1946 	}
1947 out:
1948 	if (ret) {
1949 		btrfs_handle_fs_error(fs_info, ret, NULL);
1950 		free_reloc_roots(&reloc_roots);
1951 
1952 		/* new reloc root may be added */
1953 		mutex_lock(&fs_info->reloc_mutex);
1954 		list_splice_init(&rc->reloc_roots, &reloc_roots);
1955 		mutex_unlock(&fs_info->reloc_mutex);
1956 		free_reloc_roots(&reloc_roots);
1957 	}
1958 
1959 	/*
1960 	 * We used to have
1961 	 *
1962 	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1963 	 *
1964 	 * here, but it's wrong.  If we fail to start the transaction in
1965 	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
1966 	 * have actually been removed from the reloc_root_tree rb tree.  This is
1967 	 * fine because we're bailing here, and we hold a reference on the root
1968 	 * for the list that holds it, so these roots will be cleaned up when we
1969 	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
1970 	 * will be cleaned up on unmount.
1971 	 *
1972 	 * The remaining nodes will be cleaned up by put_reloc_control().
1973 	 */
1974 }
1975 
1976 static void free_block_list(struct rb_root *blocks)
1977 {
1978 	struct tree_block *block;
1979 	struct rb_node *rb_node;
1980 	while ((rb_node = rb_first(blocks))) {
1981 		block = rb_entry(rb_node, struct tree_block, rb_node);
1982 		rb_erase(rb_node, blocks);
1983 		kfree(block);
1984 	}
1985 }
1986 
1987 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1988 				      struct btrfs_root *reloc_root)
1989 {
1990 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
1991 	struct btrfs_root *root;
1992 	int ret;
1993 
1994 	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
1995 		return 0;
1996 
1997 	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
1998 
1999 	/*
2000 	 * This should succeed, since we can't have a reloc root without having
2001 	 * already looked up the actual root and created the reloc root for this
2002 	 * root.
2003 	 *
2004 	 * However if there's some sort of corruption where we have a ref to a
2005 	 * reloc root without a corresponding root this could return ENOENT.
2006 	 */
2007 	if (IS_ERR(root)) {
2008 		DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
2009 		return PTR_ERR(root);
2010 	}
2011 	if (unlikely(root->reloc_root != reloc_root)) {
2012 		DEBUG_WARN("unexpected reloc root found");
2013 		btrfs_err(fs_info,
2014 			  "root %llu has two reloc roots associated with it",
2015 			  reloc_root->root_key.offset);
2016 		btrfs_put_root(root);
2017 		return -EUCLEAN;
2018 	}
2019 	ret = btrfs_record_root_in_trans(trans, root);
2020 	btrfs_put_root(root);
2021 
2022 	return ret;
2023 }
2024 
2025 static noinline_for_stack
2026 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2027 				     struct reloc_control *rc,
2028 				     struct btrfs_backref_node *node,
2029 				     struct btrfs_backref_edge *edges[])
2030 {
2031 	struct btrfs_backref_node *next;
2032 	struct btrfs_root *root;
2033 	int index = 0;
2034 	int ret;
2035 
2036 	next = walk_up_backref(node, edges, &index);
2037 	root = next->root;
2038 
2039 	/*
2040 	 * If there is no root, then our references for this block are
2041 	 * incomplete, as we should be able to walk all the way up to a block
2042 	 * that is owned by a root.
2043 	 *
2044 	 * This path is only for SHAREABLE roots, so if we come upon a
2045 	 * non-SHAREABLE root then we have backrefs that resolve improperly.
2046 	 *
2047 	 * Both of these cases indicate file system corruption, or a bug in the
2048 	 * backref walking code.
2049 	 */
2050 	if (unlikely(!root)) {
2051 		btrfs_err(trans->fs_info,
2052 			  "bytenr %llu doesn't have a backref path ending in a root",
2053 			  node->bytenr);
2054 		return ERR_PTR(-EUCLEAN);
2055 	}
2056 	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
2057 		btrfs_err(trans->fs_info,
2058 			  "bytenr %llu has multiple refs with one ending in a non-shareable root",
2059 			  node->bytenr);
2060 		return ERR_PTR(-EUCLEAN);
2061 	}
2062 
2063 	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
2064 		ret = record_reloc_root_in_trans(trans, root);
2065 		if (ret)
2066 			return ERR_PTR(ret);
2067 		goto found;
2068 	}
2069 
2070 	ret = btrfs_record_root_in_trans(trans, root);
2071 	if (ret)
2072 		return ERR_PTR(ret);
2073 	root = root->reloc_root;
2074 
2075 	/*
2076 	 * We could have raced with another thread which failed, so
2077 	 * root->reloc_root may not be set, return ENOENT in this case.
2078 	 */
2079 	if (!root)
2080 		return ERR_PTR(-ENOENT);
2081 
2082 	if (unlikely(next->new_bytenr)) {
2083 		/*
2084 		 * We just created the reloc root, so we shouldn't have
2085 		 * ->new_bytenr set yet. If it is then we have multiple roots
2086 		 *  pointing at the same bytenr which indicates corruption, or
2087 		 *  we've made a mistake in the backref walking code.
2088 		 */
2089 		ASSERT(next->new_bytenr == 0);
2090 		btrfs_err(trans->fs_info,
2091 			  "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
2092 			  node->bytenr, next->bytenr);
2093 		return ERR_PTR(-EUCLEAN);
2094 	}
2095 
2096 	next->new_bytenr = root->node->start;
2097 	btrfs_put_root(next->root);
2098 	next->root = btrfs_grab_root(root);
2099 	ASSERT(next->root);
2100 	mark_block_processed(rc, next);
2101 found:
2102 	next = node;
2103 	/* setup backref node path for btrfs_reloc_cow_block */
2104 	while (1) {
2105 		rc->backref_cache.path[next->level] = next;
2106 		if (--index < 0)
2107 			break;
2108 		next = edges[index]->node[UPPER];
2109 	}
2110 	return root;
2111 }
2112 
2113 /*
2114  * Select a tree root for relocation.
2115  *
2116  * Return NULL if the block is not shareable. We should use do_relocation() in
2117  * this case.
2118  *
2119  * Return a tree root pointer if the block is shareable.
2120  * Return -ENOENT if the block is root of reloc tree.
2121  */
2122 static noinline_for_stack
2123 struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2124 {
2125 	struct btrfs_backref_node *next;
2126 	struct btrfs_root *root;
2127 	struct btrfs_root *fs_root = NULL;
2128 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2129 	int index = 0;
2130 
2131 	next = node;
2132 	while (1) {
2133 		cond_resched();
2134 		next = walk_up_backref(next, edges, &index);
2135 		root = next->root;
2136 
2137 		/*
2138 		 * This can occur if we have incomplete extent refs leading all
2139 		 * the way up a particular path, in this case return -EUCLEAN.
2140 		 */
2141 		if (unlikely(!root))
2142 			return ERR_PTR(-EUCLEAN);
2143 
2144 		/* No other choice for non-shareable tree */
2145 		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2146 			return root;
2147 
2148 		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
2149 			fs_root = root;
2150 
2151 		if (next != node)
2152 			return NULL;
2153 
2154 		next = walk_down_backref(edges, &index);
2155 		if (!next || next->level <= node->level)
2156 			break;
2157 	}
2158 
2159 	if (!fs_root)
2160 		return ERR_PTR(-ENOENT);
2161 	return fs_root;
2162 }
2163 
2164 static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
2165 						  struct btrfs_backref_node *node)
2166 {
2167 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2168 	struct btrfs_backref_node *next = node;
2169 	struct btrfs_backref_edge *edge;
2170 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2171 	u64 num_bytes = 0;
2172 	int index = 0;
2173 
2174 	BUG_ON(node->processed);
2175 
2176 	while (next) {
2177 		cond_resched();
2178 		while (1) {
2179 			if (next->processed)
2180 				break;
2181 
2182 			num_bytes += fs_info->nodesize;
2183 
2184 			if (list_empty(&next->upper))
2185 				break;
2186 
2187 			edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
2188 						list[LOWER]);
2189 			edges[index++] = edge;
2190 			next = edge->node[UPPER];
2191 		}
2192 		next = walk_down_backref(edges, &index);
2193 	}
2194 	return num_bytes;
2195 }
2196 
2197 static int refill_metadata_space(struct btrfs_trans_handle *trans,
2198 				 struct reloc_control *rc, u64 num_bytes)
2199 {
2200 	struct btrfs_fs_info *fs_info = trans->fs_info;
2201 	int ret;
2202 
2203 	trans->block_rsv = rc->block_rsv;
2204 	rc->reserved_bytes += num_bytes;
2205 
2206 	/*
2207 	 * We are under a transaction here so we can only do limited flushing.
2208 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2209 	 * transaction and try to refill when we can flush all the things.
2210 	 */
2211 	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
2212 				     BTRFS_RESERVE_FLUSH_LIMIT);
2213 	if (ret) {
2214 		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2215 
2216 		while (tmp <= rc->reserved_bytes)
2217 			tmp <<= 1;
2218 		/*
2219 		 * only one thread can access block_rsv at this point,
2220 		 * so we don't need hold lock to protect block_rsv.
2221 		 * we expand more reservation size here to allow enough
2222 		 * space for relocation and we will return earlier in
2223 		 * enospc case.
2224 		 */
2225 		rc->block_rsv->size = tmp + fs_info->nodesize *
2226 				      RELOCATION_RESERVED_NODES;
2227 		return -EAGAIN;
2228 	}
2229 
2230 	return 0;
2231 }
2232 
2233 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2234 				  struct reloc_control *rc,
2235 				  struct btrfs_backref_node *node)
2236 {
2237 	u64 num_bytes;
2238 
2239 	num_bytes = calcu_metadata_size(rc, node) * 2;
2240 	return refill_metadata_space(trans, rc, num_bytes);
2241 }
2242 
2243 /*
2244  * relocate a block tree, and then update pointers in upper level
2245  * blocks that reference the block to point to the new location.
2246  *
2247  * if called by link_to_upper, the block has already been relocated.
2248  * in that case this function just updates pointers.
2249  */
2250 static int do_relocation(struct btrfs_trans_handle *trans,
2251 			 struct reloc_control *rc,
2252 			 struct btrfs_backref_node *node,
2253 			 struct btrfs_key *key,
2254 			 struct btrfs_path *path, int lowest)
2255 {
2256 	struct btrfs_backref_node *upper;
2257 	struct btrfs_backref_edge *edge;
2258 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2259 	struct btrfs_root *root;
2260 	struct extent_buffer *eb;
2261 	u32 blocksize;
2262 	u64 bytenr;
2263 	int slot;
2264 	int ret = 0;
2265 
2266 	/*
2267 	 * If we are lowest then this is the first time we're processing this
2268 	 * block, and thus shouldn't have an eb associated with it yet.
2269 	 */
2270 	ASSERT(!lowest || !node->eb);
2271 
2272 	path->lowest_level = node->level + 1;
2273 	rc->backref_cache.path[node->level] = node;
2274 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2275 		cond_resched();
2276 
2277 		upper = edge->node[UPPER];
2278 		root = select_reloc_root(trans, rc, upper, edges);
2279 		if (IS_ERR(root)) {
2280 			ret = PTR_ERR(root);
2281 			goto next;
2282 		}
2283 
2284 		if (upper->eb && !upper->locked) {
2285 			if (!lowest) {
2286 				ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2287 				if (ret < 0)
2288 					goto next;
2289 				BUG_ON(ret);
2290 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2291 				if (node->eb->start == bytenr)
2292 					goto next;
2293 			}
2294 			btrfs_backref_drop_node_buffer(upper);
2295 		}
2296 
2297 		if (!upper->eb) {
2298 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2299 			if (ret) {
2300 				if (ret > 0)
2301 					ret = -ENOENT;
2302 
2303 				btrfs_release_path(path);
2304 				break;
2305 			}
2306 
2307 			if (!upper->eb) {
2308 				upper->eb = path->nodes[upper->level];
2309 				path->nodes[upper->level] = NULL;
2310 			} else {
2311 				BUG_ON(upper->eb != path->nodes[upper->level]);
2312 			}
2313 
2314 			upper->locked = 1;
2315 			path->locks[upper->level] = 0;
2316 
2317 			slot = path->slots[upper->level];
2318 			btrfs_release_path(path);
2319 		} else {
2320 			ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2321 			if (ret < 0)
2322 				goto next;
2323 			BUG_ON(ret);
2324 		}
2325 
2326 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2327 		if (lowest) {
2328 			if (unlikely(bytenr != node->bytenr)) {
2329 				btrfs_err(root->fs_info,
2330 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2331 					  bytenr, node->bytenr, slot,
2332 					  upper->eb->start);
2333 				ret = -EIO;
2334 				goto next;
2335 			}
2336 		} else {
2337 			if (node->eb->start == bytenr)
2338 				goto next;
2339 		}
2340 
2341 		blocksize = root->fs_info->nodesize;
2342 		eb = btrfs_read_node_slot(upper->eb, slot);
2343 		if (IS_ERR(eb)) {
2344 			ret = PTR_ERR(eb);
2345 			goto next;
2346 		}
2347 		btrfs_tree_lock(eb);
2348 
2349 		if (!node->eb) {
2350 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2351 					      slot, &eb, BTRFS_NESTING_COW);
2352 			btrfs_tree_unlock(eb);
2353 			free_extent_buffer(eb);
2354 			if (ret < 0)
2355 				goto next;
2356 			/*
2357 			 * We've just COWed this block, it should have updated
2358 			 * the correct backref node entry.
2359 			 */
2360 			ASSERT(node->eb == eb);
2361 		} else {
2362 			struct btrfs_ref ref = {
2363 				.action = BTRFS_ADD_DELAYED_REF,
2364 				.bytenr = node->eb->start,
2365 				.num_bytes = blocksize,
2366 				.parent = upper->eb->start,
2367 				.owning_root = btrfs_header_owner(upper->eb),
2368 				.ref_root = btrfs_header_owner(upper->eb),
2369 			};
2370 
2371 			btrfs_set_node_blockptr(upper->eb, slot,
2372 						node->eb->start);
2373 			btrfs_set_node_ptr_generation(upper->eb, slot,
2374 						      trans->transid);
2375 			btrfs_mark_buffer_dirty(trans, upper->eb);
2376 
2377 			btrfs_init_tree_ref(&ref, node->level,
2378 					    btrfs_root_id(root), false);
2379 			ret = btrfs_inc_extent_ref(trans, &ref);
2380 			if (!ret)
2381 				ret = btrfs_drop_subtree(trans, root, eb,
2382 							 upper->eb);
2383 			if (unlikely(ret))
2384 				btrfs_abort_transaction(trans, ret);
2385 		}
2386 next:
2387 		if (!upper->pending)
2388 			btrfs_backref_drop_node_buffer(upper);
2389 		else
2390 			btrfs_backref_unlock_node_buffer(upper);
2391 		if (ret)
2392 			break;
2393 	}
2394 
2395 	if (!ret && node->pending) {
2396 		btrfs_backref_drop_node_buffer(node);
2397 		list_del_init(&node->list);
2398 		node->pending = 0;
2399 	}
2400 
2401 	path->lowest_level = 0;
2402 
2403 	/*
2404 	 * We should have allocated all of our space in the block rsv and thus
2405 	 * shouldn't ENOSPC.
2406 	 */
2407 	ASSERT(ret != -ENOSPC);
2408 	return ret;
2409 }
2410 
2411 static int link_to_upper(struct btrfs_trans_handle *trans,
2412 			 struct reloc_control *rc,
2413 			 struct btrfs_backref_node *node,
2414 			 struct btrfs_path *path)
2415 {
2416 	struct btrfs_key key;
2417 
2418 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2419 	return do_relocation(trans, rc, node, &key, path, 0);
2420 }
2421 
2422 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2423 				struct reloc_control *rc,
2424 				struct btrfs_path *path, int err)
2425 {
2426 	LIST_HEAD(list);
2427 	struct btrfs_backref_cache *cache = &rc->backref_cache;
2428 	struct btrfs_backref_node *node;
2429 	int level;
2430 	int ret;
2431 
2432 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2433 		while (!list_empty(&cache->pending[level])) {
2434 			node = list_first_entry(&cache->pending[level],
2435 						struct btrfs_backref_node, list);
2436 			list_move_tail(&node->list, &list);
2437 			BUG_ON(!node->pending);
2438 
2439 			if (!err) {
2440 				ret = link_to_upper(trans, rc, node, path);
2441 				if (ret < 0)
2442 					err = ret;
2443 			}
2444 		}
2445 		list_splice_init(&list, &cache->pending[level]);
2446 	}
2447 	return err;
2448 }
2449 
2450 /*
2451  * mark a block and all blocks directly/indirectly reference the block
2452  * as processed.
2453  */
2454 static void update_processed_blocks(struct reloc_control *rc,
2455 				    struct btrfs_backref_node *node)
2456 {
2457 	struct btrfs_backref_node *next = node;
2458 	struct btrfs_backref_edge *edge;
2459 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2460 	int index = 0;
2461 
2462 	while (next) {
2463 		cond_resched();
2464 		while (1) {
2465 			if (next->processed)
2466 				break;
2467 
2468 			mark_block_processed(rc, next);
2469 
2470 			if (list_empty(&next->upper))
2471 				break;
2472 
2473 			edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
2474 						list[LOWER]);
2475 			edges[index++] = edge;
2476 			next = edge->node[UPPER];
2477 		}
2478 		next = walk_down_backref(edges, &index);
2479 	}
2480 }
2481 
2482 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2483 {
2484 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2485 
2486 	if (btrfs_test_range_bit(&rc->processed_blocks, bytenr,
2487 				 bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
2488 		return 1;
2489 	return 0;
2490 }
2491 
2492 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2493 			      struct tree_block *block)
2494 {
2495 	struct btrfs_tree_parent_check check = {
2496 		.level = block->level,
2497 		.owner_root = block->owner,
2498 		.transid = block->key.offset
2499 	};
2500 	struct extent_buffer *eb;
2501 
2502 	eb = read_tree_block(fs_info, block->bytenr, &check);
2503 	if (IS_ERR(eb))
2504 		return PTR_ERR(eb);
2505 
2506 	if (block->level == 0)
2507 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2508 	else
2509 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2510 	free_extent_buffer(eb);
2511 	block->key_ready = true;
2512 	return 0;
2513 }
2514 
2515 /*
2516  * helper function to relocate a tree block
2517  */
2518 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2519 				struct reloc_control *rc,
2520 				struct btrfs_backref_node *node,
2521 				struct btrfs_key *key,
2522 				struct btrfs_path *path)
2523 {
2524 	struct btrfs_root *root;
2525 	int ret = 0;
2526 
2527 	if (!node)
2528 		return 0;
2529 
2530 	/*
2531 	 * If we fail here we want to drop our backref_node because we are going
2532 	 * to start over and regenerate the tree for it.
2533 	 */
2534 	ret = reserve_metadata_space(trans, rc, node);
2535 	if (ret)
2536 		goto out;
2537 
2538 	BUG_ON(node->processed);
2539 	root = select_one_root(node);
2540 	if (IS_ERR(root)) {
2541 		ret = PTR_ERR(root);
2542 
2543 		/* See explanation in select_one_root for the -EUCLEAN case. */
2544 		ASSERT(ret == -ENOENT);
2545 		if (ret == -ENOENT) {
2546 			ret = 0;
2547 			update_processed_blocks(rc, node);
2548 		}
2549 		goto out;
2550 	}
2551 
2552 	if (root) {
2553 		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2554 			/*
2555 			 * This block was the root block of a root, and this is
2556 			 * the first time we're processing the block and thus it
2557 			 * should not have had the ->new_bytenr modified.
2558 			 *
2559 			 * However in the case of corruption we could have
2560 			 * multiple refs pointing to the same block improperly,
2561 			 * and thus we would trip over these checks.  ASSERT()
2562 			 * for the developer case, because it could indicate a
2563 			 * bug in the backref code, however error out for a
2564 			 * normal user in the case of corruption.
2565 			 */
2566 			ASSERT(node->new_bytenr == 0);
2567 			if (unlikely(node->new_bytenr)) {
2568 				btrfs_err(root->fs_info,
2569 				  "bytenr %llu has improper references to it",
2570 					  node->bytenr);
2571 				ret = -EUCLEAN;
2572 				goto out;
2573 			}
2574 			ret = btrfs_record_root_in_trans(trans, root);
2575 			if (ret)
2576 				goto out;
2577 			/*
2578 			 * Another thread could have failed, need to check if we
2579 			 * have reloc_root actually set.
2580 			 */
2581 			if (!root->reloc_root) {
2582 				ret = -ENOENT;
2583 				goto out;
2584 			}
2585 			root = root->reloc_root;
2586 			node->new_bytenr = root->node->start;
2587 			btrfs_put_root(node->root);
2588 			node->root = btrfs_grab_root(root);
2589 			ASSERT(node->root);
2590 		} else {
2591 			btrfs_err(root->fs_info,
2592 				  "bytenr %llu resolved to a non-shareable root",
2593 				  node->bytenr);
2594 			ret = -EUCLEAN;
2595 			goto out;
2596 		}
2597 		if (!ret)
2598 			update_processed_blocks(rc, node);
2599 	} else {
2600 		ret = do_relocation(trans, rc, node, key, path, 1);
2601 	}
2602 out:
2603 	if (ret || node->level == 0)
2604 		btrfs_backref_cleanup_node(&rc->backref_cache, node);
2605 	return ret;
2606 }
2607 
2608 static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
2609 				  struct reloc_control *rc, struct tree_block *block,
2610 				  struct btrfs_path *path)
2611 {
2612 	struct btrfs_fs_info *fs_info = trans->fs_info;
2613 	struct btrfs_root *root;
2614 	u64 num_bytes;
2615 	int nr_levels;
2616 	int ret;
2617 
2618 	root = btrfs_get_fs_root(fs_info, block->owner, true);
2619 	if (IS_ERR(root))
2620 		return PTR_ERR(root);
2621 
2622 	nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
2623 
2624 	num_bytes = (nr_levels << fs_info->nodesize_bits);
2625 	ret = refill_metadata_space(trans, rc, num_bytes);
2626 	if (ret) {
2627 		btrfs_put_root(root);
2628 		return ret;
2629 	}
2630 	path->lowest_level = block->level;
2631 	if (root == root->fs_info->chunk_root)
2632 		btrfs_reserve_chunk_metadata(trans, false);
2633 
2634 	ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
2635 	path->lowest_level = 0;
2636 	btrfs_release_path(path);
2637 
2638 	if (root == root->fs_info->chunk_root)
2639 		btrfs_trans_release_chunk_metadata(trans);
2640 	if (ret > 0)
2641 		ret = 0;
2642 	btrfs_put_root(root);
2643 
2644 	return ret;
2645 }
2646 
2647 /*
2648  * relocate a list of blocks
2649  */
2650 static noinline_for_stack
2651 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2652 			 struct reloc_control *rc, struct rb_root *blocks)
2653 {
2654 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2655 	struct btrfs_backref_node *node;
2656 	struct btrfs_path *path;
2657 	struct tree_block *block;
2658 	struct tree_block *next;
2659 	int ret = 0;
2660 
2661 	path = btrfs_alloc_path();
2662 	if (!path) {
2663 		ret = -ENOMEM;
2664 		goto out_free_blocks;
2665 	}
2666 
2667 	/* Kick in readahead for tree blocks with missing keys */
2668 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2669 		if (!block->key_ready)
2670 			btrfs_readahead_tree_block(fs_info, block->bytenr,
2671 						   block->owner, 0,
2672 						   block->level, NULL);
2673 	}
2674 
2675 	/* Get first keys */
2676 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2677 		if (!block->key_ready) {
2678 			ret = get_tree_block_key(fs_info, block);
2679 			if (ret)
2680 				goto out_free_path;
2681 		}
2682 	}
2683 
2684 	/* Do tree relocation */
2685 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2686 		/*
2687 		 * For COWonly blocks, or the data reloc tree, we only need to
2688 		 * COW down to the block, there's no need to generate a backref
2689 		 * tree.
2690 		 */
2691 		if (block->owner &&
2692 		    (!btrfs_is_fstree(block->owner) ||
2693 		     block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
2694 			ret = relocate_cowonly_block(trans, rc, block, path);
2695 			if (ret)
2696 				break;
2697 			continue;
2698 		}
2699 
2700 		node = build_backref_tree(trans, rc, &block->key,
2701 					  block->level, block->bytenr);
2702 		if (IS_ERR(node)) {
2703 			ret = PTR_ERR(node);
2704 			goto out;
2705 		}
2706 
2707 		ret = relocate_tree_block(trans, rc, node, &block->key,
2708 					  path);
2709 		if (ret < 0)
2710 			break;
2711 	}
2712 out:
2713 	ret = finish_pending_nodes(trans, rc, path, ret);
2714 
2715 out_free_path:
2716 	btrfs_free_path(path);
2717 out_free_blocks:
2718 	free_block_list(blocks);
2719 	return ret;
2720 }
2721 
2722 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
2723 {
2724 	const struct file_extent_cluster *cluster = &rc->cluster;
2725 	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
2726 	u64 alloc_hint = 0;
2727 	u64 start;
2728 	u64 end;
2729 	u64 offset = inode->reloc_block_group_start;
2730 	u64 num_bytes;
2731 	int nr;
2732 	int ret = 0;
2733 	u64 prealloc_start = cluster->start - offset;
2734 	u64 prealloc_end = cluster->end - offset;
2735 	u64 cur_offset = prealloc_start;
2736 
2737 	/*
2738 	 * For blocksize < folio size case (either bs < page size or large folios),
2739 	 * beyond i_size, all blocks are filled with zero.
2740 	 *
2741 	 * If the current cluster covers the above range, btrfs_do_readpage()
2742 	 * will skip the read, and relocate_one_folio() will later writeback
2743 	 * the padding zeros as new data, causing data corruption.
2744 	 *
2745 	 * Here we have to invalidate the cache covering our cluster.
2746 	 */
2747 	ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start,
2748 				       prealloc_end);
2749 	if (ret < 0)
2750 		return ret;
2751 
2752 	BUG_ON(cluster->start != cluster->boundary[0]);
2753 	ret = btrfs_alloc_data_chunk_ondemand(inode,
2754 					      prealloc_end + 1 - prealloc_start);
2755 	if (ret)
2756 		return ret;
2757 
2758 	btrfs_inode_lock(inode, 0);
2759 	for (nr = 0; nr < cluster->nr; nr++) {
2760 		struct extent_state *cached_state = NULL;
2761 
2762 		start = cluster->boundary[nr] - offset;
2763 		if (nr + 1 < cluster->nr)
2764 			end = cluster->boundary[nr + 1] - 1 - offset;
2765 		else
2766 			end = cluster->end - offset;
2767 
2768 		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
2769 		num_bytes = end + 1 - start;
2770 		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2771 						num_bytes, num_bytes,
2772 						end + 1, &alloc_hint);
2773 		cur_offset = end + 1;
2774 		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
2775 		if (ret)
2776 			break;
2777 	}
2778 	btrfs_inode_unlock(inode, 0);
2779 
2780 	if (cur_offset < prealloc_end)
2781 		btrfs_free_reserved_data_space_noquota(inode,
2782 						       prealloc_end + 1 - cur_offset);
2783 	return ret;
2784 }
2785 
2786 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
2787 {
2788 	struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
2789 	struct extent_map *em;
2790 	struct extent_state *cached_state = NULL;
2791 	u64 offset = inode->reloc_block_group_start;
2792 	u64 start = rc->cluster.start - offset;
2793 	u64 end = rc->cluster.end - offset;
2794 	int ret = 0;
2795 
2796 	em = btrfs_alloc_extent_map();
2797 	if (!em)
2798 		return -ENOMEM;
2799 
2800 	em->start = start;
2801 	em->len = end + 1 - start;
2802 	em->disk_bytenr = rc->cluster.start;
2803 	em->disk_num_bytes = em->len;
2804 	em->ram_bytes = em->len;
2805 	em->flags |= EXTENT_FLAG_PINNED;
2806 
2807 	btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
2808 	ret = btrfs_replace_extent_map_range(inode, em, false);
2809 	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
2810 	btrfs_free_extent_map(em);
2811 
2812 	return ret;
2813 }
2814 
2815 /*
2816  * Allow error injection to test balance/relocation cancellation
2817  */
2818 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
2819 {
2820 	return atomic_read(&fs_info->balance_cancel_req) ||
2821 		atomic_read(&fs_info->reloc_cancel_req) ||
2822 		fatal_signal_pending(current);
2823 }
2824 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2825 
2826 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
2827 				    int cluster_nr)
2828 {
2829 	/* Last extent, use cluster end directly */
2830 	if (cluster_nr >= cluster->nr - 1)
2831 		return cluster->end;
2832 
2833 	/* Use next boundary start*/
2834 	return cluster->boundary[cluster_nr + 1] - 1;
2835 }
2836 
2837 static int relocate_one_folio(struct reloc_control *rc,
2838 			      struct file_ra_state *ra,
2839 			      int *cluster_nr, u64 *file_offset_ret)
2840 {
2841 	const struct file_extent_cluster *cluster = &rc->cluster;
2842 	struct inode *inode = rc->data_inode;
2843 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2844 	const u64 orig_file_offset = *file_offset_ret;
2845 	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
2846 	const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
2847 	const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
2848 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2849 	struct folio *folio;
2850 	u64 folio_start;
2851 	u64 folio_end;
2852 	u64 cur;
2853 	int ret;
2854 	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
2855 
2856 	ASSERT(index <= last_index);
2857 again:
2858 	folio = filemap_lock_folio(inode->i_mapping, index);
2859 	if (IS_ERR(folio)) {
2860 
2861 		/*
2862 		 * On relocation we're doing readahead on the relocation inode,
2863 		 * but if the filesystem is backed by a RAID stripe tree we can
2864 		 * get ENOENT (e.g. due to preallocated extents not being
2865 		 * mapped in the RST) from the lookup.
2866 		 *
2867 		 * But readahead doesn't handle the error and submits invalid
2868 		 * reads to the device, causing a assertion failures.
2869 		 */
2870 		if (!use_rst)
2871 			page_cache_sync_readahead(inode->i_mapping, ra, NULL,
2872 						  index, last_index + 1 - index);
2873 		folio = __filemap_get_folio(inode->i_mapping, index,
2874 					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
2875 					    mask);
2876 		if (IS_ERR(folio))
2877 			return PTR_ERR(folio);
2878 	}
2879 
2880 	if (folio_test_readahead(folio) && !use_rst)
2881 		page_cache_async_readahead(inode->i_mapping, ra, NULL,
2882 					   folio, last_index + 1 - index);
2883 
2884 	if (!folio_test_uptodate(folio)) {
2885 		btrfs_read_folio(NULL, folio);
2886 		folio_lock(folio);
2887 		if (unlikely(!folio_test_uptodate(folio))) {
2888 			ret = -EIO;
2889 			goto release_folio;
2890 		}
2891 		if (folio->mapping != inode->i_mapping) {
2892 			folio_unlock(folio);
2893 			folio_put(folio);
2894 			goto again;
2895 		}
2896 	}
2897 
2898 	/*
2899 	 * We could have lost folio private when we dropped the lock to read the
2900 	 * folio above, make sure we set_folio_extent_mapped() here so we have any
2901 	 * of the subpage blocksize stuff we need in place.
2902 	 */
2903 	ret = set_folio_extent_mapped(folio);
2904 	if (ret < 0)
2905 		goto release_folio;
2906 
2907 	folio_start = folio_pos(folio);
2908 	folio_end = folio_start + folio_size(folio) - 1;
2909 
2910 	/*
2911 	 * Start from the cluster, as for subpage case, the cluster can start
2912 	 * inside the folio.
2913 	 */
2914 	cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
2915 	while (cur <= folio_end) {
2916 		struct extent_state *cached_state = NULL;
2917 		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
2918 		u64 extent_end = get_cluster_boundary_end(cluster,
2919 						*cluster_nr) - offset;
2920 		u64 clamped_start = max(folio_start, extent_start);
2921 		u64 clamped_end = min(folio_end, extent_end);
2922 		u32 clamped_len = clamped_end + 1 - clamped_start;
2923 
2924 		/* Reserve metadata for this range */
2925 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2926 						      clamped_len, clamped_len,
2927 						      false);
2928 		if (ret)
2929 			goto release_folio;
2930 
2931 		/* Mark the range delalloc and dirty for later writeback */
2932 		btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start,
2933 				  clamped_end, &cached_state);
2934 		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
2935 						clamped_end, 0, &cached_state);
2936 		if (ret) {
2937 			btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
2938 					       clamped_start, clamped_end,
2939 					       EXTENT_LOCKED | EXTENT_BOUNDARY,
2940 					       &cached_state);
2941 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
2942 							clamped_len, true);
2943 			btrfs_delalloc_release_extents(BTRFS_I(inode),
2944 						       clamped_len);
2945 			goto release_folio;
2946 		}
2947 		btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
2948 
2949 		/*
2950 		 * Set the boundary if it's inside the folio.
2951 		 * Data relocation requires the destination extents to have the
2952 		 * same size as the source.
2953 		 * EXTENT_BOUNDARY bit prevents current extent from being merged
2954 		 * with previous extent.
2955 		 */
2956 		if (in_range(cluster->boundary[*cluster_nr] - offset,
2957 			     folio_start, folio_size(folio))) {
2958 			u64 boundary_start = cluster->boundary[*cluster_nr] -
2959 						offset;
2960 			u64 boundary_end = boundary_start +
2961 					   fs_info->sectorsize - 1;
2962 
2963 			btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree,
2964 					     boundary_start, boundary_end,
2965 					     EXTENT_BOUNDARY, NULL);
2966 		}
2967 		btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
2968 				    &cached_state);
2969 		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
2970 		cur += clamped_len;
2971 
2972 		/* Crossed extent end, go to next extent */
2973 		if (cur >= extent_end) {
2974 			(*cluster_nr)++;
2975 			/* Just finished the last extent of the cluster, exit. */
2976 			if (*cluster_nr >= cluster->nr)
2977 				break;
2978 		}
2979 	}
2980 	folio_unlock(folio);
2981 	folio_put(folio);
2982 
2983 	balance_dirty_pages_ratelimited(inode->i_mapping);
2984 	btrfs_throttle(fs_info);
2985 	if (btrfs_should_cancel_balance(fs_info))
2986 		ret = -ECANCELED;
2987 	*file_offset_ret = folio_end + 1;
2988 	return ret;
2989 
2990 release_folio:
2991 	folio_unlock(folio);
2992 	folio_put(folio);
2993 	return ret;
2994 }
2995 
2996 static int relocate_file_extent_cluster(struct reloc_control *rc)
2997 {
2998 	struct inode *inode = rc->data_inode;
2999 	const struct file_extent_cluster *cluster = &rc->cluster;
3000 	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
3001 	u64 cur_file_offset = cluster->start - offset;
3002 	struct file_ra_state AUTO_KFREE(ra);
3003 	int cluster_nr = 0;
3004 	int ret = 0;
3005 
3006 	if (!cluster->nr)
3007 		return 0;
3008 
3009 	ra = kzalloc_obj(*ra, GFP_NOFS);
3010 	if (!ra)
3011 		return -ENOMEM;
3012 
3013 	ret = prealloc_file_extent_cluster(rc);
3014 	if (ret)
3015 		return ret;
3016 
3017 	file_ra_state_init(ra, inode->i_mapping);
3018 
3019 	ret = setup_relocation_extent_mapping(rc);
3020 	if (ret)
3021 		return ret;
3022 
3023 	while (cur_file_offset < cluster->end - offset) {
3024 		ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset);
3025 		if (ret)
3026 			break;
3027 	}
3028 	if (ret == 0)
3029 		WARN_ON(cluster_nr != cluster->nr);
3030 	return ret;
3031 }
3032 
3033 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc,
3034 					   const struct btrfs_key *extent_key)
3035 {
3036 	struct inode *inode = rc->data_inode;
3037 	struct file_extent_cluster *cluster = &rc->cluster;
3038 	int ret;
3039 	struct btrfs_root *root = BTRFS_I(inode)->root;
3040 
3041 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3042 		ret = relocate_file_extent_cluster(rc);
3043 		if (ret)
3044 			return ret;
3045 		cluster->nr = 0;
3046 	}
3047 
3048 	/*
3049 	 * Under simple quotas, we set root->relocation_src_root when we find
3050 	 * the extent. If adjacent extents have different owners, we can't merge
3051 	 * them while relocating. Handle this by storing the owning root that
3052 	 * started a cluster and if we see an extent from a different root break
3053 	 * cluster formation (just like the above case of non-adjacent extents).
3054 	 *
3055 	 * Without simple quotas, relocation_src_root is always 0, so we should
3056 	 * never see a mismatch, and it should have no effect on relocation
3057 	 * clusters.
3058 	 */
3059 	if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
3060 		u64 tmp = root->relocation_src_root;
3061 
3062 		/*
3063 		 * root->relocation_src_root is the state that actually affects
3064 		 * the preallocation we do here, so set it to the root owning
3065 		 * the cluster we need to relocate.
3066 		 */
3067 		root->relocation_src_root = cluster->owning_root;
3068 		ret = relocate_file_extent_cluster(rc);
3069 		if (ret)
3070 			return ret;
3071 		cluster->nr = 0;
3072 		/* And reset it back for the current extent's owning root. */
3073 		root->relocation_src_root = tmp;
3074 	}
3075 
3076 	if (!cluster->nr) {
3077 		cluster->start = extent_key->objectid;
3078 		cluster->owning_root = root->relocation_src_root;
3079 	}
3080 	else
3081 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3082 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3083 	cluster->boundary[cluster->nr] = extent_key->objectid;
3084 	cluster->nr++;
3085 
3086 	if (cluster->nr >= MAX_EXTENTS) {
3087 		ret = relocate_file_extent_cluster(rc);
3088 		if (ret)
3089 			return ret;
3090 		cluster->nr = 0;
3091 	}
3092 	return 0;
3093 }
3094 
3095 /*
3096  * helper to add a tree block to the list.
3097  * the major work is getting the generation and level of the block
3098  */
3099 static int add_tree_block(struct reloc_control *rc,
3100 			  const struct btrfs_key *extent_key,
3101 			  struct btrfs_path *path,
3102 			  struct rb_root *blocks)
3103 {
3104 	struct extent_buffer *eb;
3105 	struct btrfs_extent_item *ei;
3106 	struct btrfs_tree_block_info *bi;
3107 	struct tree_block *block;
3108 	struct rb_node *rb_node;
3109 	u32 item_size;
3110 	int level = -1;
3111 	u64 generation;
3112 	u64 owner = 0;
3113 
3114 	eb =  path->nodes[0];
3115 	item_size = btrfs_item_size(eb, path->slots[0]);
3116 
3117 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3118 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3119 		unsigned long ptr = 0, end;
3120 
3121 		ei = btrfs_item_ptr(eb, path->slots[0],
3122 				struct btrfs_extent_item);
3123 		end = (unsigned long)ei + item_size;
3124 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3125 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3126 			level = btrfs_tree_block_level(eb, bi);
3127 			ptr = (unsigned long)(bi + 1);
3128 		} else {
3129 			level = (int)extent_key->offset;
3130 			ptr = (unsigned long)(ei + 1);
3131 		}
3132 		generation = btrfs_extent_generation(eb, ei);
3133 
3134 		/*
3135 		 * We're reading random blocks without knowing their owner ahead
3136 		 * of time.  This is ok most of the time, as all reloc roots and
3137 		 * fs roots have the same lock type.  However normal trees do
3138 		 * not, and the only way to know ahead of time is to read the
3139 		 * inline ref offset.  We know it's an fs root if
3140 		 *
3141 		 * 1. There's more than one ref.
3142 		 * 2. There's a SHARED_DATA_REF_KEY set.
3143 		 * 3. FULL_BACKREF is set on the flags.
3144 		 *
3145 		 * Otherwise it's safe to assume that the ref offset == the
3146 		 * owner of this block, so we can use that when calling
3147 		 * read_tree_block.
3148 		 */
3149 		if (btrfs_extent_refs(eb, ei) == 1 &&
3150 		    !(btrfs_extent_flags(eb, ei) &
3151 		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
3152 		    ptr < end) {
3153 			struct btrfs_extent_inline_ref *iref;
3154 			int type;
3155 
3156 			iref = (struct btrfs_extent_inline_ref *)ptr;
3157 			type = btrfs_get_extent_inline_ref_type(eb, iref,
3158 							BTRFS_REF_TYPE_BLOCK);
3159 			if (type == BTRFS_REF_TYPE_INVALID)
3160 				return -EINVAL;
3161 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
3162 				owner = btrfs_extent_inline_ref_offset(eb, iref);
3163 		}
3164 	} else {
3165 		btrfs_print_leaf(eb);
3166 		btrfs_err(rc->block_group->fs_info,
3167 			  "unrecognized tree backref at tree block %llu slot %u",
3168 			  eb->start, path->slots[0]);
3169 		btrfs_release_path(path);
3170 		return -EUCLEAN;
3171 	}
3172 
3173 	btrfs_release_path(path);
3174 
3175 	BUG_ON(level == -1);
3176 
3177 	block = kmalloc_obj(*block, GFP_NOFS);
3178 	if (!block)
3179 		return -ENOMEM;
3180 
3181 	block->bytenr = extent_key->objectid;
3182 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3183 	block->key.offset = generation;
3184 	block->level = level;
3185 	block->key_ready = false;
3186 	block->owner = owner;
3187 
3188 	rb_node = rb_simple_insert(blocks, &block->simple_node);
3189 	if (rb_node)
3190 		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
3191 				    -EEXIST);
3192 
3193 	return 0;
3194 }
3195 
3196 /*
3197  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3198  */
3199 static int __add_tree_block(struct reloc_control *rc,
3200 			    u64 bytenr, u32 blocksize,
3201 			    struct rb_root *blocks)
3202 {
3203 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3204 	BTRFS_PATH_AUTO_FREE(path);
3205 	struct btrfs_key key;
3206 	int ret;
3207 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3208 
3209 	if (tree_block_processed(bytenr, rc))
3210 		return 0;
3211 
3212 	if (rb_simple_search(blocks, bytenr))
3213 		return 0;
3214 
3215 	path = btrfs_alloc_path();
3216 	if (!path)
3217 		return -ENOMEM;
3218 again:
3219 	key.objectid = bytenr;
3220 	if (skinny) {
3221 		key.type = BTRFS_METADATA_ITEM_KEY;
3222 		key.offset = (u64)-1;
3223 	} else {
3224 		key.type = BTRFS_EXTENT_ITEM_KEY;
3225 		key.offset = blocksize;
3226 	}
3227 
3228 	path->search_commit_root = true;
3229 	path->skip_locking = true;
3230 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3231 	if (ret < 0)
3232 		return ret;
3233 
3234 	if (ret > 0 && skinny) {
3235 		if (path->slots[0]) {
3236 			path->slots[0]--;
3237 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3238 					      path->slots[0]);
3239 			if (key.objectid == bytenr &&
3240 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3241 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3242 			      key.offset == blocksize)))
3243 				ret = 0;
3244 		}
3245 
3246 		if (ret) {
3247 			skinny = false;
3248 			btrfs_release_path(path);
3249 			goto again;
3250 		}
3251 	}
3252 	if (WARN_ON(ret)) {
3253 		ASSERT(ret == 1);
3254 		btrfs_print_leaf(path->nodes[0]);
3255 		btrfs_err(fs_info,
3256 	     "tree block extent item (%llu) is not found in extent tree",
3257 		     bytenr);
3258 		return -EINVAL;
3259 	}
3260 
3261 	return add_tree_block(rc, &key, path, blocks);
3262 }
3263 
3264 static int delete_block_group_cache(struct btrfs_block_group *block_group,
3265 				    struct inode *inode,
3266 				    u64 ino)
3267 {
3268 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3269 	struct btrfs_root *root = fs_info->tree_root;
3270 	struct btrfs_trans_handle *trans;
3271 	struct btrfs_inode *btrfs_inode;
3272 	int ret = 0;
3273 
3274 	if (inode)
3275 		goto truncate;
3276 
3277 	btrfs_inode = btrfs_iget(ino, root);
3278 	if (IS_ERR(btrfs_inode))
3279 		return -ENOENT;
3280 	inode = &btrfs_inode->vfs_inode;
3281 
3282 truncate:
3283 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3284 						 &fs_info->global_block_rsv);
3285 	if (ret)
3286 		goto out;
3287 
3288 	trans = btrfs_join_transaction(root);
3289 	if (IS_ERR(trans)) {
3290 		ret = PTR_ERR(trans);
3291 		goto out;
3292 	}
3293 
3294 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3295 
3296 	btrfs_end_transaction(trans);
3297 	btrfs_btree_balance_dirty(fs_info);
3298 out:
3299 	iput(inode);
3300 	return ret;
3301 }
3302 
3303 /*
3304  * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3305  * cache inode, to avoid free space cache data extent blocking data relocation.
3306  */
3307 static int delete_v1_space_cache(struct extent_buffer *leaf,
3308 				 struct btrfs_block_group *block_group,
3309 				 u64 data_bytenr)
3310 {
3311 	u64 space_cache_ino;
3312 	struct btrfs_file_extent_item *ei;
3313 	struct btrfs_key key;
3314 	bool found = false;
3315 	int i;
3316 
3317 	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3318 		return 0;
3319 
3320 	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3321 		u8 type;
3322 
3323 		btrfs_item_key_to_cpu(leaf, &key, i);
3324 		if (key.type != BTRFS_EXTENT_DATA_KEY)
3325 			continue;
3326 		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3327 		type = btrfs_file_extent_type(leaf, ei);
3328 
3329 		if ((type == BTRFS_FILE_EXTENT_REG ||
3330 		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
3331 		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3332 			found = true;
3333 			space_cache_ino = key.objectid;
3334 			break;
3335 		}
3336 	}
3337 	if (!found)
3338 		return -ENOENT;
3339 
3340 	return delete_block_group_cache(block_group, NULL, space_cache_ino);
3341 }
3342 
3343 /*
3344  * helper to find all tree blocks that reference a given data extent
3345  */
3346 static noinline_for_stack int add_data_references(struct reloc_control *rc,
3347 						  const struct btrfs_key *extent_key,
3348 						  struct btrfs_path *path,
3349 						  struct rb_root *blocks)
3350 {
3351 	struct btrfs_backref_walk_ctx ctx = { 0 };
3352 	struct ulist_iterator leaf_uiter;
3353 	struct ulist_node *ref_node = NULL;
3354 	const u32 blocksize = rc->extent_root->fs_info->nodesize;
3355 	int ret = 0;
3356 
3357 	btrfs_release_path(path);
3358 
3359 	ctx.bytenr = extent_key->objectid;
3360 	ctx.skip_inode_ref_list = true;
3361 	ctx.fs_info = rc->extent_root->fs_info;
3362 
3363 	ret = btrfs_find_all_leafs(&ctx);
3364 	if (ret < 0)
3365 		return ret;
3366 
3367 	ULIST_ITER_INIT(&leaf_uiter);
3368 	while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
3369 		struct btrfs_tree_parent_check check = { 0 };
3370 		struct extent_buffer *eb;
3371 
3372 		eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
3373 		if (IS_ERR(eb)) {
3374 			ret = PTR_ERR(eb);
3375 			break;
3376 		}
3377 		ret = delete_v1_space_cache(eb, rc->block_group,
3378 					    extent_key->objectid);
3379 		free_extent_buffer(eb);
3380 		if (ret < 0)
3381 			break;
3382 		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3383 		if (ret < 0)
3384 			break;
3385 	}
3386 	if (ret < 0)
3387 		free_block_list(blocks);
3388 	ulist_free(ctx.refs);
3389 	return ret;
3390 }
3391 
3392 /*
3393  * helper to find next unprocessed extent
3394  */
3395 static noinline_for_stack
3396 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3397 		     struct btrfs_key *extent_key)
3398 {
3399 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3400 	struct btrfs_key key;
3401 	struct extent_buffer *leaf;
3402 	u64 start, end, last;
3403 	int ret;
3404 
3405 	last = rc->block_group->start + rc->block_group->length;
3406 	while (1) {
3407 		bool block_found;
3408 
3409 		cond_resched();
3410 		if (rc->search_start >= last) {
3411 			ret = 1;
3412 			break;
3413 		}
3414 
3415 		key.objectid = rc->search_start;
3416 		key.type = BTRFS_EXTENT_ITEM_KEY;
3417 		key.offset = 0;
3418 
3419 		path->search_commit_root = true;
3420 		path->skip_locking = true;
3421 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3422 					0, 0);
3423 		if (ret < 0)
3424 			break;
3425 next:
3426 		leaf = path->nodes[0];
3427 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3428 			ret = btrfs_next_leaf(rc->extent_root, path);
3429 			if (ret != 0)
3430 				break;
3431 			leaf = path->nodes[0];
3432 		}
3433 
3434 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3435 		if (key.objectid >= last) {
3436 			ret = 1;
3437 			break;
3438 		}
3439 
3440 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3441 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3442 			path->slots[0]++;
3443 			goto next;
3444 		}
3445 
3446 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3447 		    key.objectid + key.offset <= rc->search_start) {
3448 			path->slots[0]++;
3449 			goto next;
3450 		}
3451 
3452 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3453 		    key.objectid + fs_info->nodesize <=
3454 		    rc->search_start) {
3455 			path->slots[0]++;
3456 			goto next;
3457 		}
3458 
3459 		block_found = btrfs_find_first_extent_bit(&rc->processed_blocks,
3460 							  key.objectid, &start, &end,
3461 							  EXTENT_DIRTY, NULL);
3462 
3463 		if (block_found && start <= key.objectid) {
3464 			btrfs_release_path(path);
3465 			rc->search_start = end + 1;
3466 		} else {
3467 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3468 				rc->search_start = key.objectid + key.offset;
3469 			else
3470 				rc->search_start = key.objectid +
3471 					fs_info->nodesize;
3472 			memcpy(extent_key, &key, sizeof(key));
3473 			return 0;
3474 		}
3475 	}
3476 	btrfs_release_path(path);
3477 	return ret;
3478 }
3479 
3480 static void set_reloc_control(struct reloc_control *rc)
3481 {
3482 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3483 
3484 	mutex_lock(&fs_info->reloc_mutex);
3485 	spin_lock(&fs_info->reloc_ctl_lock);
3486 	fs_info->reloc_ctl = rc;
3487 	spin_unlock(&fs_info->reloc_ctl_lock);
3488 	mutex_unlock(&fs_info->reloc_mutex);
3489 }
3490 
3491 static void unset_reloc_control(struct reloc_control *rc)
3492 {
3493 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3494 
3495 	mutex_lock(&fs_info->reloc_mutex);
3496 	spin_lock(&fs_info->reloc_ctl_lock);
3497 	fs_info->reloc_ctl = NULL;
3498 	spin_unlock(&fs_info->reloc_ctl_lock);
3499 	mutex_unlock(&fs_info->reloc_mutex);
3500 }
3501 
3502 static noinline_for_stack
3503 int prepare_to_relocate(struct reloc_control *rc)
3504 {
3505 	struct btrfs_trans_handle *trans;
3506 	int ret;
3507 
3508 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3509 					      BTRFS_BLOCK_RSV_TEMP);
3510 	if (!rc->block_rsv)
3511 		return -ENOMEM;
3512 
3513 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3514 	rc->search_start = rc->block_group->start;
3515 	rc->extents_found = 0;
3516 	rc->nodes_relocated = 0;
3517 	rc->merging_rsv_size = 0;
3518 	rc->reserved_bytes = 0;
3519 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3520 			      RELOCATION_RESERVED_NODES;
3521 	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
3522 				     rc->block_rsv, rc->block_rsv->size,
3523 				     BTRFS_RESERVE_FLUSH_ALL);
3524 	if (ret)
3525 		return ret;
3526 
3527 	rc->create_reloc_tree = true;
3528 	set_reloc_control(rc);
3529 
3530 	trans = btrfs_join_transaction(rc->extent_root);
3531 	if (IS_ERR(trans)) {
3532 		unset_reloc_control(rc);
3533 		/*
3534 		 * extent tree is not a ref_cow tree and has no reloc_root to
3535 		 * cleanup.  And callers are responsible to free the above
3536 		 * block rsv.
3537 		 */
3538 		return PTR_ERR(trans);
3539 	}
3540 
3541 	ret = btrfs_commit_transaction(trans);
3542 	if (ret)
3543 		unset_reloc_control(rc);
3544 
3545 	return ret;
3546 }
3547 
3548 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3549 {
3550 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3551 	struct rb_root blocks = RB_ROOT;
3552 	struct btrfs_key key;
3553 	struct btrfs_trans_handle *trans = NULL;
3554 	BTRFS_PATH_AUTO_FREE(path);
3555 	struct btrfs_extent_item *ei;
3556 	u64 flags;
3557 	int ret;
3558 	int err = 0;
3559 	int progress = 0;
3560 
3561 	path = btrfs_alloc_path();
3562 	if (!path)
3563 		return -ENOMEM;
3564 	path->reada = READA_FORWARD;
3565 
3566 	ret = prepare_to_relocate(rc);
3567 	if (ret) {
3568 		err = ret;
3569 		goto out_free;
3570 	}
3571 
3572 	while (1) {
3573 		rc->reserved_bytes = 0;
3574 		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
3575 					     rc->block_rsv->size,
3576 					     BTRFS_RESERVE_FLUSH_ALL);
3577 		if (ret) {
3578 			err = ret;
3579 			break;
3580 		}
3581 		progress++;
3582 		trans = btrfs_start_transaction(rc->extent_root, 0);
3583 		if (IS_ERR(trans)) {
3584 			err = PTR_ERR(trans);
3585 			trans = NULL;
3586 			break;
3587 		}
3588 restart:
3589 		if (rc->backref_cache.last_trans != trans->transid)
3590 			btrfs_backref_release_cache(&rc->backref_cache);
3591 		rc->backref_cache.last_trans = trans->transid;
3592 
3593 		ret = find_next_extent(rc, path, &key);
3594 		if (ret < 0)
3595 			err = ret;
3596 		if (ret != 0)
3597 			break;
3598 
3599 		rc->extents_found++;
3600 
3601 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3602 				    struct btrfs_extent_item);
3603 		flags = btrfs_extent_flags(path->nodes[0], ei);
3604 
3605 		/*
3606 		 * If we are relocating a simple quota owned extent item, we
3607 		 * need to note the owner on the reloc data root so that when
3608 		 * we allocate the replacement item, we can attribute it to the
3609 		 * correct eventual owner (rather than the reloc data root).
3610 		 */
3611 		if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
3612 			struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
3613 			u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
3614 								 path->nodes[0],
3615 								 path->slots[0]);
3616 
3617 			root->relocation_src_root = owning_root_id;
3618 		}
3619 
3620 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3621 			ret = add_tree_block(rc, &key, path, &blocks);
3622 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3623 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
3624 			ret = add_data_references(rc, &key, path, &blocks);
3625 		} else {
3626 			btrfs_release_path(path);
3627 			ret = 0;
3628 		}
3629 		if (ret < 0) {
3630 			err = ret;
3631 			break;
3632 		}
3633 
3634 		if (!RB_EMPTY_ROOT(&blocks)) {
3635 			ret = relocate_tree_blocks(trans, rc, &blocks);
3636 			if (ret < 0) {
3637 				if (ret != -EAGAIN) {
3638 					err = ret;
3639 					break;
3640 				}
3641 				rc->extents_found--;
3642 				rc->search_start = key.objectid;
3643 			}
3644 		}
3645 
3646 		btrfs_end_transaction_throttle(trans);
3647 		btrfs_btree_balance_dirty(fs_info);
3648 		trans = NULL;
3649 
3650 		if (rc->stage == MOVE_DATA_EXTENTS &&
3651 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3652 			rc->found_file_extent = true;
3653 			ret = relocate_data_extent(rc, &key);
3654 			if (ret < 0) {
3655 				err = ret;
3656 				break;
3657 			}
3658 		}
3659 		if (btrfs_should_cancel_balance(fs_info)) {
3660 			err = -ECANCELED;
3661 			break;
3662 		}
3663 	}
3664 	if (trans && progress && err == -ENOSPC) {
3665 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3666 		if (ret == 1) {
3667 			err = 0;
3668 			progress = 0;
3669 			goto restart;
3670 		}
3671 	}
3672 
3673 	btrfs_release_path(path);
3674 	btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL);
3675 
3676 	if (trans) {
3677 		btrfs_end_transaction_throttle(trans);
3678 		btrfs_btree_balance_dirty(fs_info);
3679 	}
3680 
3681 	if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
3682 		ret = relocate_file_extent_cluster(rc);
3683 		if (ret < 0)
3684 			err = ret;
3685 	}
3686 
3687 	rc->create_reloc_tree = false;
3688 	set_reloc_control(rc);
3689 
3690 	btrfs_backref_release_cache(&rc->backref_cache);
3691 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3692 
3693 	/*
3694 	 * Even in the case when the relocation is cancelled, we should all go
3695 	 * through prepare_to_merge() and merge_reloc_roots().
3696 	 *
3697 	 * For error (including cancelled balance), prepare_to_merge() will
3698 	 * mark all reloc trees orphan, then queue them for cleanup in
3699 	 * merge_reloc_roots()
3700 	 */
3701 	err = prepare_to_merge(rc, err);
3702 
3703 	merge_reloc_roots(rc);
3704 
3705 	rc->merge_reloc_tree = false;
3706 	unset_reloc_control(rc);
3707 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3708 
3709 	/* get rid of pinned extents */
3710 	ret = btrfs_commit_current_transaction(rc->extent_root);
3711 	if (ret && !err)
3712 		err = ret;
3713 out_free:
3714 	ret = clean_dirty_subvols(rc);
3715 	if (ret < 0 && !err)
3716 		err = ret;
3717 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
3718 	return err;
3719 }
3720 
3721 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3722 				 struct btrfs_root *root, u64 objectid)
3723 {
3724 	BTRFS_PATH_AUTO_FREE(path);
3725 	struct btrfs_inode_item *item;
3726 	struct extent_buffer *leaf;
3727 	int ret;
3728 
3729 	path = btrfs_alloc_path();
3730 	if (!path)
3731 		return -ENOMEM;
3732 
3733 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3734 	if (ret)
3735 		return ret;
3736 
3737 	leaf = path->nodes[0];
3738 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3739 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3740 	btrfs_set_inode_generation(leaf, item, 1);
3741 	btrfs_set_inode_size(leaf, item, 0);
3742 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3743 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3744 					  BTRFS_INODE_PREALLOC);
3745 	return 0;
3746 }
3747 
3748 static void delete_orphan_inode(struct btrfs_trans_handle *trans,
3749 				struct btrfs_root *root, u64 objectid)
3750 {
3751 	BTRFS_PATH_AUTO_FREE(path);
3752 	struct btrfs_key key;
3753 	int ret = 0;
3754 
3755 	path = btrfs_alloc_path();
3756 	if (!path) {
3757 		ret = -ENOMEM;
3758 		goto out;
3759 	}
3760 
3761 	key.objectid = objectid;
3762 	key.type = BTRFS_INODE_ITEM_KEY;
3763 	key.offset = 0;
3764 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3765 	if (ret) {
3766 		if (ret > 0)
3767 			ret = -ENOENT;
3768 		goto out;
3769 	}
3770 	ret = btrfs_del_item(trans, root, path);
3771 out:
3772 	if (ret)
3773 		btrfs_abort_transaction(trans, ret);
3774 }
3775 
3776 /*
3777  * helper to create inode for data relocation.
3778  * the inode is in data relocation tree and its link count is 0
3779  */
3780 static noinline_for_stack struct inode *create_reloc_inode(
3781 					const struct btrfs_block_group *group)
3782 {
3783 	struct btrfs_fs_info *fs_info = group->fs_info;
3784 	struct btrfs_inode *inode = NULL;
3785 	struct btrfs_trans_handle *trans;
3786 	struct btrfs_root *root;
3787 	u64 objectid;
3788 	int ret = 0;
3789 
3790 	root = btrfs_grab_root(fs_info->data_reloc_root);
3791 	trans = btrfs_start_transaction(root, 6);
3792 	if (IS_ERR(trans)) {
3793 		btrfs_put_root(root);
3794 		return ERR_CAST(trans);
3795 	}
3796 
3797 	ret = btrfs_get_free_objectid(root, &objectid);
3798 	if (ret)
3799 		goto out;
3800 
3801 	ret = __insert_orphan_inode(trans, root, objectid);
3802 	if (ret)
3803 		goto out;
3804 
3805 	inode = btrfs_iget(objectid, root);
3806 	if (IS_ERR(inode)) {
3807 		delete_orphan_inode(trans, root, objectid);
3808 		ret = PTR_ERR(inode);
3809 		inode = NULL;
3810 		goto out;
3811 	}
3812 	inode->reloc_block_group_start = group->start;
3813 
3814 	ret = btrfs_orphan_add(trans, inode);
3815 out:
3816 	btrfs_put_root(root);
3817 	btrfs_end_transaction(trans);
3818 	btrfs_btree_balance_dirty(fs_info);
3819 	if (ret) {
3820 		if (inode)
3821 			iput(&inode->vfs_inode);
3822 		return ERR_PTR(ret);
3823 	}
3824 	return &inode->vfs_inode;
3825 }
3826 
3827 /*
3828  * Mark start of chunk relocation that is cancellable. Check if the cancellation
3829  * has been requested meanwhile and don't start in that case.
3830  * NOTE: if this returns an error, reloc_chunk_end() must not be called.
3831  *
3832  * Return:
3833  *   0             success
3834  *   -EINPROGRESS  operation is already in progress, that's probably a bug
3835  *   -ECANCELED    cancellation request was set before the operation started
3836  */
3837 static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
3838 {
3839 	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
3840 		/* This should not happen */
3841 		btrfs_err(fs_info, "reloc already running, cannot start");
3842 		return -EINPROGRESS;
3843 	}
3844 
3845 	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
3846 		btrfs_info(fs_info, "chunk relocation canceled on start");
3847 		/* On cancel, clear all requests. */
3848 		clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
3849 		atomic_set(&fs_info->reloc_cancel_req, 0);
3850 		return -ECANCELED;
3851 	}
3852 	return 0;
3853 }
3854 
3855 /*
3856  * Mark end of chunk relocation that is cancellable and wake any waiters.
3857  * NOTE: call only if a previous call to reloc_chunk_start() succeeded.
3858  */
3859 static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
3860 {
3861 	ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags));
3862 	/* Requested after start, clear bit first so any waiters can continue */
3863 	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
3864 		btrfs_info(fs_info, "chunk relocation canceled during operation");
3865 	clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
3866 	atomic_set(&fs_info->reloc_cancel_req, 0);
3867 }
3868 
3869 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3870 {
3871 	struct reloc_control *rc;
3872 
3873 	rc = kzalloc_obj(*rc, GFP_NOFS);
3874 	if (!rc)
3875 		return NULL;
3876 
3877 	INIT_LIST_HEAD(&rc->reloc_roots);
3878 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3879 	btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
3880 	rc->reloc_root_tree.rb_root = RB_ROOT;
3881 	spin_lock_init(&rc->reloc_root_tree.lock);
3882 	btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
3883 	refcount_set(&rc->refs, 1);
3884 
3885 	return rc;
3886 }
3887 
3888 /*
3889  * Print the block group being relocated
3890  */
3891 static void describe_relocation(struct btrfs_block_group *block_group)
3892 {
3893 	char buf[128] = "NONE";
3894 
3895 	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
3896 
3897 	btrfs_info(block_group->fs_info, "relocating block group %llu flags %s",
3898 		   block_group->start, buf);
3899 }
3900 
3901 static const char *stage_to_string(enum reloc_stage stage)
3902 {
3903 	if (stage == MOVE_DATA_EXTENTS)
3904 		return "move data extents";
3905 	if (stage == UPDATE_DATA_PTRS)
3906 		return "update data pointers";
3907 	return "unknown";
3908 }
3909 
3910 static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path,
3911 				  struct btrfs_key *entries, unsigned int num_entries)
3912 {
3913 	int ret;
3914 	struct btrfs_fs_info *fs_info = trans->fs_info;
3915 	struct btrfs_item_batch batch;
3916 	u32 *data_sizes;
3917 	u32 max_items;
3918 
3919 	max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
3920 
3921 	data_sizes = kzalloc_objs(u32, min_t(u32, num_entries, max_items), GFP_NOFS);
3922 	if (!data_sizes)
3923 		return -ENOMEM;
3924 
3925 	while (true) {
3926 		batch.keys = entries;
3927 		batch.data_sizes = data_sizes;
3928 		batch.total_data_size = 0;
3929 		batch.nr = min_t(u32, num_entries, max_items);
3930 
3931 		ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch);
3932 		btrfs_release_path(path);
3933 
3934 		if (ret || num_entries <= max_items)
3935 			break;
3936 
3937 		num_entries -= max_items;
3938 		entries += max_items;
3939 	}
3940 
3941 	kfree(data_sizes);
3942 
3943 	return ret;
3944 }
3945 
3946 struct space_run {
3947 	u64 start;
3948 	u64 end;
3949 };
3950 
3951 static void parse_bitmap(u64 block_size, const unsigned long *bitmap,
3952 			 unsigned long size, u64 address, struct space_run *space_runs,
3953 			 unsigned int *num_space_runs)
3954 {
3955 	unsigned long pos, end;
3956 	u64 run_start, run_length;
3957 
3958 	pos = find_first_bit(bitmap, size);
3959 	if (pos == size)
3960 		return;
3961 
3962 	while (true) {
3963 		end = find_next_zero_bit(bitmap, size, pos);
3964 
3965 		run_start = address + (pos * block_size);
3966 		run_length = (end - pos) * block_size;
3967 
3968 		if (*num_space_runs != 0 &&
3969 		    space_runs[*num_space_runs - 1].end == run_start) {
3970 			space_runs[*num_space_runs - 1].end += run_length;
3971 		} else {
3972 			space_runs[*num_space_runs].start = run_start;
3973 			space_runs[*num_space_runs].end = run_start + run_length;
3974 
3975 			(*num_space_runs)++;
3976 		}
3977 
3978 		if (end == size)
3979 			break;
3980 
3981 		pos = find_next_bit(bitmap, size, end + 1);
3982 		if (pos == size)
3983 			break;
3984 	}
3985 }
3986 
3987 static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
3988 					   struct btrfs_block_group *bg, s64 diff)
3989 {
3990 	struct btrfs_fs_info *fs_info = trans->fs_info;
3991 	bool bg_already_dirty = true;
3992 	bool mark_unused = false;
3993 
3994 	spin_lock(&bg->lock);
3995 	bg->remap_bytes += diff;
3996 	if (bg->used == 0 && bg->remap_bytes == 0)
3997 		mark_unused = true;
3998 	spin_unlock(&bg->lock);
3999 
4000 	if (mark_unused)
4001 		btrfs_mark_bg_unused(bg);
4002 
4003 	spin_lock(&trans->transaction->dirty_bgs_lock);
4004 	if (list_empty(&bg->dirty_list)) {
4005 		list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
4006 		bg_already_dirty = false;
4007 		btrfs_get_block_group(bg);
4008 	}
4009 	spin_unlock(&trans->transaction->dirty_bgs_lock);
4010 
4011 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
4012 	if (!bg_already_dirty)
4013 		btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
4014 }
4015 
4016 /* Private structure for I/O from copy_remapped_data().  */
4017 struct reloc_io_private {
4018 	struct completion done;
4019 	refcount_t pending_refs;
4020 	blk_status_t status;
4021 };
4022 
4023 static void reloc_endio(struct btrfs_bio *bbio)
4024 {
4025 	struct reloc_io_private *priv = bbio->private;
4026 
4027 	if (bbio->bio.bi_status)
4028 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
4029 
4030 	if (refcount_dec_and_test(&priv->pending_refs))
4031 		complete(&priv->done);
4032 
4033 	bio_put(&bbio->bio);
4034 }
4035 
4036 static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
4037 				 struct reloc_io_private *priv,
4038 				 struct page **pages, u64 addr, u64 length,
4039 				 blk_opf_t op)
4040 {
4041 	struct btrfs_bio *bbio;
4042 	int i;
4043 
4044 	init_completion(&priv->done);
4045 	refcount_set(&priv->pending_refs, 1);
4046 	priv->status = 0;
4047 
4048 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode),
4049 			       addr, reloc_endio, priv);
4050 	bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT);
4051 	bbio->is_remap = true;
4052 
4053 	i = 0;
4054 	do {
4055 		size_t bytes = min_t(u64, length, PAGE_SIZE);
4056 
4057 		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
4058 			refcount_inc(&priv->pending_refs);
4059 			btrfs_submit_bbio(bbio, 0);
4060 
4061 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, op,
4062 					       BTRFS_I(fs_info->btree_inode),
4063 					       addr, reloc_endio, priv);
4064 			bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT);
4065 			bbio->is_remap = true;
4066 			continue;
4067 		}
4068 
4069 		i++;
4070 		addr += bytes;
4071 		length -= bytes;
4072 	} while (length);
4073 
4074 	refcount_inc(&priv->pending_refs);
4075 	btrfs_submit_bbio(bbio, 0);
4076 
4077 	if (!refcount_dec_and_test(&priv->pending_refs))
4078 		wait_for_completion_io(&priv->done);
4079 
4080 	return blk_status_to_errno(READ_ONCE(priv->status));
4081 }
4082 
4083 static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
4084 			      u64 new_addr, u64 length)
4085 {
4086 	int ret;
4087 	u64 copy_len = min_t(u64, length, SZ_1M);
4088 	struct page **pages;
4089 	struct reloc_io_private priv;
4090 	unsigned int nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
4091 
4092 	pages = kzalloc_objs(struct page *, nr_pages, GFP_NOFS);
4093 	if (!pages)
4094 		return -ENOMEM;
4095 
4096 	ret = btrfs_alloc_page_array(nr_pages, pages, GFP_NOFS);
4097 	if (ret) {
4098 		ret = -ENOMEM;
4099 		goto end;
4100 	}
4101 
4102 	/* Copy 1MB at a time, to avoid using too much memory. */
4103 	do {
4104 		u64 to_copy = min_t(u64, length, copy_len);
4105 
4106 		/* Limit to one bio. */
4107 		to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT);
4108 
4109 		ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr,
4110 					    to_copy, REQ_OP_READ);
4111 		if (ret)
4112 			goto end;
4113 
4114 		ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr,
4115 					    to_copy, REQ_OP_WRITE);
4116 		if (ret)
4117 			goto end;
4118 
4119 		if (to_copy == length)
4120 			break;
4121 
4122 		old_addr += to_copy;
4123 		new_addr += to_copy;
4124 		length -= to_copy;
4125 	} while (true);
4126 
4127 	ret = 0;
4128 end:
4129 	for (int i = 0; i < nr_pages; i++) {
4130 		if (pages[i])
4131 			__free_page(pages[i]);
4132 	}
4133 	kfree(pages);
4134 
4135 	return ret;
4136 }
4137 
4138 static int add_remap_item(struct btrfs_trans_handle *trans,
4139 			  struct btrfs_path *path, u64 new_addr, u64 length,
4140 			  u64 old_addr)
4141 {
4142 	struct btrfs_fs_info *fs_info = trans->fs_info;
4143 	struct btrfs_remap_item remap = { 0 };
4144 	struct btrfs_key key;
4145 	struct extent_buffer *leaf;
4146 	int ret;
4147 
4148 	key.objectid = old_addr;
4149 	key.type = BTRFS_REMAP_KEY;
4150 	key.offset = length;
4151 
4152 	ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
4153 				      &key, sizeof(struct btrfs_remap_item));
4154 	if (ret)
4155 		return ret;
4156 
4157 	leaf = path->nodes[0];
4158 	btrfs_set_stack_remap_address(&remap, new_addr);
4159 	write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]),
4160 			    sizeof(struct btrfs_remap_item));
4161 
4162 	btrfs_release_path(path);
4163 
4164 	return 0;
4165 }
4166 
4167 static int add_remap_backref_item(struct btrfs_trans_handle *trans,
4168 				  struct btrfs_path *path, u64 new_addr,
4169 				  u64 length, u64 old_addr)
4170 {
4171 	struct btrfs_fs_info *fs_info = trans->fs_info;
4172 	struct btrfs_remap_item remap = { 0 };
4173 	struct btrfs_key key;
4174 	struct extent_buffer *leaf;
4175 	int ret;
4176 
4177 	key.objectid = new_addr;
4178 	key.type = BTRFS_REMAP_BACKREF_KEY;
4179 	key.offset = length;
4180 
4181 	ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, &key,
4182 				      sizeof(struct btrfs_remap_item));
4183 	if (ret)
4184 		return ret;
4185 
4186 	leaf = path->nodes[0];
4187 	btrfs_set_stack_remap_address(&remap, old_addr);
4188 	write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]),
4189 			    sizeof(struct btrfs_remap_item));
4190 
4191 	btrfs_release_path(path);
4192 
4193 	return 0;
4194 }
4195 
4196 static int move_existing_remap(struct btrfs_fs_info *fs_info,
4197 			       struct btrfs_path *path,
4198 			       struct btrfs_block_group *bg, u64 new_addr,
4199 			       u64 length, u64 old_addr)
4200 {
4201 	struct btrfs_trans_handle *trans;
4202 	struct extent_buffer *leaf;
4203 	struct btrfs_remap_item *remap_ptr;
4204 	struct btrfs_remap_item remap = { 0 };
4205 	struct btrfs_key key, ins;
4206 	u64 dest_addr, dest_length, min_size;
4207 	struct btrfs_block_group *dest_bg;
4208 	int ret;
4209 	const bool is_data = (bg->flags & BTRFS_BLOCK_GROUP_DATA);
4210 	struct btrfs_space_info *sinfo = bg->space_info;
4211 	bool mutex_taken = false;
4212 	bool bg_needs_free_space;
4213 
4214 	spin_lock(&sinfo->lock);
4215 	btrfs_space_info_update_bytes_may_use(sinfo, length);
4216 	spin_unlock(&sinfo->lock);
4217 
4218 	if (is_data)
4219 		min_size = fs_info->sectorsize;
4220 	else
4221 		min_size = fs_info->nodesize;
4222 
4223 	ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
4224 				   0, 0, &ins, is_data, false);
4225 	if (unlikely(ret)) {
4226 		spin_lock(&sinfo->lock);
4227 		btrfs_space_info_update_bytes_may_use(sinfo, -length);
4228 		spin_unlock(&sinfo->lock);
4229 		return ret;
4230 	}
4231 
4232 	if (ins.offset < length) {
4233 		spin_lock(&sinfo->lock);
4234 		btrfs_space_info_update_bytes_may_use(sinfo, ins.offset - length);
4235 		spin_unlock(&sinfo->lock);
4236 	}
4237 
4238 	dest_addr = ins.objectid;
4239 	dest_length = ins.offset;
4240 
4241 	dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
4242 
4243 	if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
4244 		u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
4245 
4246 		btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
4247 					   dest_length - new_length, 0);
4248 
4249 		dest_length = new_length;
4250 	}
4251 
4252 	trans = btrfs_join_transaction(fs_info->remap_root);
4253 	if (IS_ERR(trans)) {
4254 		ret = PTR_ERR(trans);
4255 		trans = NULL;
4256 		goto end;
4257 	}
4258 
4259 	mutex_lock(&fs_info->remap_mutex);
4260 	mutex_taken = true;
4261 
4262 	/* Find old remap entry. */
4263 	key.objectid = old_addr;
4264 	key.type = BTRFS_REMAP_KEY;
4265 	key.offset = length;
4266 
4267 	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
4268 	if (ret == 1) {
4269 		/*
4270 		 * Not a problem if the remap entry wasn't found: that means
4271 		 * that another transaction has deallocated the data.
4272 		 * move_existing_remaps() loops until the BG contains no
4273 		 * remaps, so we can just return 0 in this case.
4274 		 */
4275 		btrfs_release_path(path);
4276 		ret = 0;
4277 		goto end;
4278 	} else if (unlikely(ret)) {
4279 		goto end;
4280 	}
4281 
4282 	ret = copy_remapped_data(fs_info, new_addr, dest_addr, dest_length);
4283 	if (unlikely(ret))
4284 		goto end;
4285 
4286 	/* Change data of old remap entry. */
4287 	leaf = path->nodes[0];
4288 	remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
4289 	btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
4290 	btrfs_mark_buffer_dirty(trans, leaf);
4291 
4292 	if (dest_length != length) {
4293 		key.offset = dest_length;
4294 		btrfs_set_item_key_safe(trans, path, &key);
4295 	}
4296 
4297 	btrfs_release_path(path);
4298 
4299 	if (dest_length != length) {
4300 		/* Add remap item for remainder. */
4301 		ret = add_remap_item(trans, path, new_addr + dest_length,
4302 				     length - dest_length, old_addr + dest_length);
4303 		if (unlikely(ret))
4304 			goto end;
4305 	}
4306 
4307 	/* Change or remove old backref. */
4308 	key.objectid = new_addr;
4309 	key.type = BTRFS_REMAP_BACKREF_KEY;
4310 	key.offset = length;
4311 
4312 	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
4313 	if (unlikely(ret)) {
4314 		if (ret == 1) {
4315 			btrfs_release_path(path);
4316 			ret = -ENOENT;
4317 		}
4318 		goto end;
4319 	}
4320 
4321 	leaf = path->nodes[0];
4322 
4323 	if (dest_length == length) {
4324 		ret = btrfs_del_item(trans, fs_info->remap_root, path);
4325 		if (unlikely(ret)) {
4326 			btrfs_release_path(path);
4327 			goto end;
4328 		}
4329 	} else {
4330 		key.objectid += dest_length;
4331 		key.offset -= dest_length;
4332 		btrfs_set_item_key_safe(trans, path, &key);
4333 		btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
4334 
4335 		write_extent_buffer(leaf, &remap,
4336 				    btrfs_item_ptr_offset(leaf, path->slots[0]),
4337 				    sizeof(struct btrfs_remap_item));
4338 	}
4339 
4340 	btrfs_release_path(path);
4341 
4342 	/* Add new backref. */
4343 	ret = add_remap_backref_item(trans, path, dest_addr, dest_length, old_addr);
4344 	if (unlikely(ret))
4345 		goto end;
4346 
4347 	adjust_block_group_remap_bytes(trans, bg, -dest_length);
4348 
4349 	ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
4350 	if (unlikely(ret))
4351 		goto end;
4352 
4353 	adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
4354 
4355 	mutex_lock(&dest_bg->free_space_lock);
4356 	bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
4357 				       &dest_bg->runtime_flags);
4358 	mutex_unlock(&dest_bg->free_space_lock);
4359 
4360 	if (bg_needs_free_space) {
4361 		ret = btrfs_add_block_group_free_space(trans, dest_bg);
4362 		if (unlikely(ret))
4363 			goto end;
4364 	}
4365 
4366 	ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
4367 	if (unlikely(ret)) {
4368 		btrfs_remove_from_free_space_tree(trans, new_addr, dest_length);
4369 		goto end;
4370 	}
4371 
4372 	ret = 0;
4373 
4374 end:
4375 	if (mutex_taken)
4376 		mutex_unlock(&fs_info->remap_mutex);
4377 
4378 	btrfs_dec_block_group_reservations(fs_info, dest_addr);
4379 
4380 	if (unlikely(ret)) {
4381 		btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
4382 
4383 		if (trans) {
4384 			btrfs_abort_transaction(trans, ret);
4385 			btrfs_end_transaction(trans);
4386 		}
4387 	} else {
4388 		btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
4389 
4390 		ret = btrfs_commit_transaction(trans);
4391 	}
4392 
4393 	btrfs_put_block_group(dest_bg);
4394 
4395 	return ret;
4396 }
4397 
4398 static int move_existing_remaps(struct btrfs_fs_info *fs_info,
4399 				struct btrfs_block_group *bg,
4400 				struct btrfs_path *path)
4401 {
4402 	int ret;
4403 	struct btrfs_key key;
4404 	struct extent_buffer *leaf;
4405 	struct btrfs_remap_item *remap;
4406 	u64 old_addr;
4407 
4408 	/* Look for backrefs in remap tree. */
4409 	while (bg->remap_bytes > 0) {
4410 		key.objectid = bg->start;
4411 		key.type = BTRFS_REMAP_BACKREF_KEY;
4412 		key.offset = 0;
4413 
4414 		ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0);
4415 		if (ret < 0)
4416 			return ret;
4417 
4418 		leaf = path->nodes[0];
4419 
4420 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
4421 			ret = btrfs_next_leaf(fs_info->remap_root, path);
4422 			if (ret < 0) {
4423 				btrfs_release_path(path);
4424 				return ret;
4425 			}
4426 
4427 			if (ret) {
4428 				btrfs_release_path(path);
4429 				break;
4430 			}
4431 
4432 			leaf = path->nodes[0];
4433 		}
4434 
4435 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4436 
4437 		if (key.type != BTRFS_REMAP_BACKREF_KEY) {
4438 			path->slots[0]++;
4439 
4440 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
4441 				ret = btrfs_next_leaf(fs_info->remap_root, path);
4442 				if (ret < 0) {
4443 					btrfs_release_path(path);
4444 					return ret;
4445 				}
4446 
4447 				if (ret) {
4448 					btrfs_release_path(path);
4449 					break;
4450 				}
4451 
4452 				leaf = path->nodes[0];
4453 			}
4454 
4455 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4456 		}
4457 
4458 		remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
4459 		old_addr = btrfs_remap_address(leaf, remap);
4460 
4461 		btrfs_release_path(path);
4462 
4463 		ret = move_existing_remap(fs_info, path, bg, key.objectid,
4464 					  key.offset, old_addr);
4465 		if (ret)
4466 			return ret;
4467 	}
4468 
4469 	ASSERT(bg->remap_bytes == 0);
4470 
4471 	return 0;
4472 }
4473 
4474 static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
4475 				     struct btrfs_path *path,
4476 				     struct btrfs_block_group *bg)
4477 {
4478 	struct btrfs_fs_info *fs_info = trans->fs_info;
4479 	struct btrfs_free_space_info *fsi;
4480 	struct btrfs_key key, found_key;
4481 	struct extent_buffer *leaf;
4482 	struct btrfs_root *space_root;
4483 	u32 extent_count;
4484 	struct space_run *space_runs = NULL;
4485 	unsigned int num_space_runs = 0;
4486 	struct btrfs_key *entries = NULL;
4487 	unsigned int max_entries, num_entries;
4488 	int ret;
4489 
4490 	mutex_lock(&bg->free_space_lock);
4491 
4492 	if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
4493 		mutex_unlock(&bg->free_space_lock);
4494 
4495 		ret = btrfs_add_block_group_free_space(trans, bg);
4496 		if (ret)
4497 			return ret;
4498 
4499 		mutex_lock(&bg->free_space_lock);
4500 	}
4501 
4502 	fsi = btrfs_search_free_space_info(trans, bg, path, 0);
4503 	if (IS_ERR(fsi)) {
4504 		mutex_unlock(&bg->free_space_lock);
4505 		return PTR_ERR(fsi);
4506 	}
4507 
4508 	extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
4509 
4510 	btrfs_release_path(path);
4511 
4512 	space_runs = kmalloc_objs(*space_runs, extent_count, GFP_NOFS);
4513 	if (!space_runs) {
4514 		mutex_unlock(&bg->free_space_lock);
4515 		return -ENOMEM;
4516 	}
4517 
4518 	key.objectid = bg->start;
4519 	key.type = 0;
4520 	key.offset = 0;
4521 
4522 	space_root = btrfs_free_space_root(bg);
4523 
4524 	ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
4525 	if (ret < 0) {
4526 		mutex_unlock(&bg->free_space_lock);
4527 		goto out;
4528 	}
4529 
4530 	ret = 0;
4531 
4532 	while (true) {
4533 		leaf = path->nodes[0];
4534 
4535 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4536 
4537 		if (found_key.objectid >= bg->start + bg->length)
4538 			break;
4539 
4540 		if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
4541 			if (num_space_runs != 0 &&
4542 			    space_runs[num_space_runs - 1].end == found_key.objectid) {
4543 				space_runs[num_space_runs - 1].end =
4544 					found_key.objectid + found_key.offset;
4545 			} else {
4546 				ASSERT(num_space_runs < extent_count);
4547 
4548 				space_runs[num_space_runs].start = found_key.objectid;
4549 				space_runs[num_space_runs].end =
4550 					found_key.objectid + found_key.offset;
4551 
4552 				num_space_runs++;
4553 			}
4554 		} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
4555 			void *bitmap;
4556 			unsigned long offset;
4557 			u32 data_size;
4558 
4559 			offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
4560 			data_size = btrfs_item_size(leaf, path->slots[0]);
4561 
4562 			if (data_size != 0) {
4563 				bitmap = kmalloc(data_size, GFP_NOFS);
4564 				if (!bitmap) {
4565 					mutex_unlock(&bg->free_space_lock);
4566 					ret = -ENOMEM;
4567 					goto out;
4568 				}
4569 
4570 				read_extent_buffer(leaf, bitmap, offset, data_size);
4571 
4572 				parse_bitmap(fs_info->sectorsize, bitmap,
4573 					     data_size * BITS_PER_BYTE,
4574 					     found_key.objectid, space_runs,
4575 					     &num_space_runs);
4576 
4577 				ASSERT(num_space_runs <= extent_count);
4578 
4579 				kfree(bitmap);
4580 			}
4581 		}
4582 
4583 		path->slots[0]++;
4584 
4585 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
4586 			ret = btrfs_next_leaf(space_root, path);
4587 			if (ret != 0) {
4588 				if (ret == 1)
4589 					ret = 0;
4590 				break;
4591 			}
4592 			leaf = path->nodes[0];
4593 		}
4594 	}
4595 
4596 	btrfs_release_path(path);
4597 
4598 	mutex_unlock(&bg->free_space_lock);
4599 
4600 	max_entries = extent_count + 2;
4601 	entries = kmalloc_objs(*entries, max_entries, GFP_NOFS);
4602 	if (!entries) {
4603 		ret = -ENOMEM;
4604 		goto out;
4605 	}
4606 
4607 	num_entries = 0;
4608 
4609 	if (num_space_runs == 0) {
4610 		entries[num_entries].objectid = bg->start;
4611 		entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
4612 		entries[num_entries].offset = bg->length;
4613 		num_entries++;
4614 	} else {
4615 		if (space_runs[0].start > bg->start) {
4616 			entries[num_entries].objectid = bg->start;
4617 			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
4618 			entries[num_entries].offset = space_runs[0].start - bg->start;
4619 			num_entries++;
4620 		}
4621 
4622 		for (unsigned int i = 1; i < num_space_runs; i++) {
4623 			entries[num_entries].objectid = space_runs[i - 1].end;
4624 			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
4625 			entries[num_entries].offset =
4626 				space_runs[i].start - space_runs[i - 1].end;
4627 			num_entries++;
4628 		}
4629 
4630 		if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
4631 			entries[num_entries].objectid =
4632 				space_runs[num_space_runs - 1].end;
4633 			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
4634 			entries[num_entries].offset =
4635 				bg->start + bg->length - space_runs[num_space_runs - 1].end;
4636 			num_entries++;
4637 		}
4638 
4639 		if (num_entries == 0)
4640 			goto out;
4641 	}
4642 
4643 	bg->identity_remap_count = num_entries;
4644 
4645 	ret = add_remap_tree_entries(trans, path, entries, num_entries);
4646 
4647 out:
4648 	kfree(entries);
4649 	kfree(space_runs);
4650 
4651 	return ret;
4652 }
4653 
4654 static int find_next_identity_remap(struct btrfs_trans_handle *trans,
4655 				    struct btrfs_path *path, u64 bg_end,
4656 				    u64 last_start, u64 *start, u64 *length)
4657 {
4658 	int ret;
4659 	struct btrfs_key key, found_key;
4660 	struct btrfs_root *remap_root = trans->fs_info->remap_root;
4661 	struct extent_buffer *leaf;
4662 
4663 	key.objectid = last_start;
4664 	key.type = BTRFS_IDENTITY_REMAP_KEY;
4665 	key.offset = 0;
4666 
4667 	ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
4668 	if (ret < 0)
4669 		goto out;
4670 
4671 	leaf = path->nodes[0];
4672 	while (true) {
4673 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
4674 			ret = btrfs_next_leaf(remap_root, path);
4675 
4676 			if (ret != 0) {
4677 				if (ret == 1)
4678 					ret = -ENOENT;
4679 				goto out;
4680 			}
4681 
4682 			leaf = path->nodes[0];
4683 		}
4684 
4685 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4686 
4687 		if (found_key.objectid >= bg_end) {
4688 			ret = -ENOENT;
4689 			goto out;
4690 		}
4691 
4692 		if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
4693 			*start = found_key.objectid;
4694 			*length = found_key.offset;
4695 			ret = 0;
4696 			goto out;
4697 		}
4698 
4699 		path->slots[0]++;
4700 	}
4701 
4702 out:
4703 	btrfs_release_path(path);
4704 
4705 	return ret;
4706 }
4707 
4708 static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
4709 				struct btrfs_chunk_map *chunk_map,
4710 				struct btrfs_path *path)
4711 {
4712 	struct btrfs_fs_info *fs_info = trans->fs_info;
4713 	struct btrfs_key key;
4714 	struct extent_buffer *leaf;
4715 	struct btrfs_chunk *chunk;
4716 	int ret;
4717 
4718 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4719 	key.type = BTRFS_CHUNK_ITEM_KEY;
4720 	key.offset = chunk_map->start;
4721 
4722 	btrfs_reserve_chunk_metadata(trans, false);
4723 
4724 	ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
4725 	if (ret) {
4726 		if (ret == 1) {
4727 			btrfs_release_path(path);
4728 			ret = -ENOENT;
4729 		}
4730 		btrfs_trans_release_chunk_metadata(trans);
4731 		return ret;
4732 	}
4733 
4734 	leaf = path->nodes[0];
4735 
4736 	chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
4737 	btrfs_set_chunk_num_stripes(leaf, chunk, 0);
4738 	btrfs_set_chunk_sub_stripes(leaf, chunk, 0);
4739 
4740 	btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe), 1);
4741 
4742 	btrfs_mark_buffer_dirty(trans, leaf);
4743 
4744 	btrfs_release_path(path);
4745 	btrfs_trans_release_chunk_metadata(trans);
4746 
4747 	return 0;
4748 }
4749 
4750 int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map,
4751 				   struct btrfs_block_group *bg)
4752 {
4753 	struct btrfs_fs_info *fs_info = bg->fs_info;
4754 	struct btrfs_trans_handle *trans;
4755 	int ret;
4756 	unsigned int num_items;
4757 	BTRFS_PATH_AUTO_FREE(path);
4758 
4759 	path = btrfs_alloc_path();
4760 	if (!path)
4761 		return -ENOMEM;
4762 
4763 	/*
4764 	 * One item for each entry we're removing in the dev extents tree, and
4765 	 * another for each device. DUP chunks are all on one device,
4766 	 * everything else has one device per stripe.
4767 	 */
4768 	if (bg->flags & BTRFS_BLOCK_GROUP_DUP)
4769 		num_items = chunk_map->num_stripes + 1;
4770 	else
4771 		num_items = 2 * chunk_map->num_stripes;
4772 
4773 	trans = btrfs_start_transaction_fallback_global_rsv(fs_info->tree_root, num_items);
4774 	if (IS_ERR(trans))
4775 		return PTR_ERR(trans);
4776 
4777 	ret = btrfs_remove_dev_extents(trans, chunk_map);
4778 	if (unlikely(ret)) {
4779 		btrfs_abort_transaction(trans, ret);
4780 		btrfs_end_transaction(trans);
4781 		return ret;
4782 	}
4783 
4784 	mutex_lock(&trans->fs_info->chunk_mutex);
4785 	for (unsigned int i = 0; i < chunk_map->num_stripes; i++) {
4786 		ret = btrfs_update_device(trans, chunk_map->stripes[i].dev);
4787 		if (unlikely(ret)) {
4788 			mutex_unlock(&trans->fs_info->chunk_mutex);
4789 			btrfs_abort_transaction(trans, ret);
4790 			btrfs_end_transaction(trans);
4791 			return ret;
4792 		}
4793 	}
4794 	mutex_unlock(&trans->fs_info->chunk_mutex);
4795 
4796 	write_lock(&trans->fs_info->mapping_tree_lock);
4797 	btrfs_chunk_map_device_clear_bits(chunk_map, CHUNK_ALLOCATED);
4798 	write_unlock(&trans->fs_info->mapping_tree_lock);
4799 
4800 	btrfs_remove_bg_from_sinfo(bg);
4801 
4802 	spin_lock(&bg->lock);
4803 	clear_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags);
4804 	spin_unlock(&bg->lock);
4805 
4806 	ret = remove_chunk_stripes(trans, chunk_map, path);
4807 	if (unlikely(ret)) {
4808 		btrfs_abort_transaction(trans, ret);
4809 		btrfs_end_transaction(trans);
4810 		return ret;
4811 	}
4812 
4813 	ret = btrfs_commit_transaction(trans);
4814 	if (ret)
4815 		return ret;
4816 
4817 	return 0;
4818 }
4819 
4820 static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
4821 				        struct btrfs_block_group *bg, int delta)
4822 {
4823 	struct btrfs_fs_info *fs_info = trans->fs_info;
4824 	bool bg_already_dirty = true;
4825 	bool mark_fully_remapped = false;
4826 
4827 	WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
4828 
4829 	spin_lock(&bg->lock);
4830 
4831 	bg->identity_remap_count += delta;
4832 
4833 	if (bg->identity_remap_count == 0 &&
4834 	    !test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags)) {
4835 		set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags);
4836 		mark_fully_remapped = true;
4837 	}
4838 
4839 	spin_unlock(&bg->lock);
4840 
4841 	spin_lock(&trans->transaction->dirty_bgs_lock);
4842 	if (list_empty(&bg->dirty_list)) {
4843 		list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
4844 		bg_already_dirty = false;
4845 		btrfs_get_block_group(bg);
4846 	}
4847 	spin_unlock(&trans->transaction->dirty_bgs_lock);
4848 
4849 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
4850 	if (!bg_already_dirty)
4851 		btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
4852 
4853 	if (mark_fully_remapped)
4854 		btrfs_mark_bg_fully_remapped(bg, trans);
4855 }
4856 
4857 static int add_remap_entry(struct btrfs_trans_handle *trans,
4858 			   struct btrfs_path *path,
4859 			   struct btrfs_block_group *src_bg, u64 old_addr,
4860 			   u64 new_addr, u64 length)
4861 {
4862 	struct btrfs_fs_info *fs_info = trans->fs_info;
4863 	struct btrfs_key key, new_key;
4864 	int ret;
4865 	int identity_count_delta = 0;
4866 
4867 	key.objectid = old_addr;
4868 	key.type = (u8)-1;
4869 	key.offset = (u64)-1;
4870 
4871 	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
4872 	if (ret < 0)
4873 		goto end;
4874 
4875 	if (path->slots[0] == 0) {
4876 		ret = -ENOENT;
4877 		goto end;
4878 	}
4879 
4880 	path->slots[0]--;
4881 
4882 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4883 
4884 	if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
4885 	    key.objectid > old_addr ||
4886 	    key.objectid + key.offset <= old_addr) {
4887 		ret = -ENOENT;
4888 		goto end;
4889 	}
4890 
4891 	/* Shorten or delete identity mapping entry. */
4892 	if (key.objectid == old_addr) {
4893 		ret = btrfs_del_item(trans, fs_info->remap_root, path);
4894 		if (ret)
4895 			goto end;
4896 
4897 		identity_count_delta--;
4898 	} else {
4899 		new_key.objectid = key.objectid;
4900 		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
4901 		new_key.offset = old_addr - key.objectid;
4902 
4903 		btrfs_set_item_key_safe(trans, path, &new_key);
4904 	}
4905 
4906 	btrfs_release_path(path);
4907 
4908 	/* Create new remap entry. */
4909 	ret = add_remap_item(trans, path, new_addr, length, old_addr);
4910 	if (ret)
4911 		goto end;
4912 
4913 	/* Add entry for remainder of identity mapping, if necessary. */
4914 	if (key.objectid + key.offset != old_addr + length) {
4915 		new_key.objectid = old_addr + length;
4916 		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
4917 		new_key.offset = key.objectid + key.offset - old_addr - length;
4918 
4919 		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
4920 					      path, &new_key, 0);
4921 		if (ret)
4922 			goto end;
4923 
4924 		btrfs_release_path(path);
4925 
4926 		identity_count_delta++;
4927 	}
4928 
4929 	/* Add backref. */
4930 	ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
4931 	if (ret)
4932 		goto end;
4933 
4934 	if (identity_count_delta != 0)
4935 		adjust_identity_remap_count(trans, src_bg, identity_count_delta);
4936 
4937 end:
4938 	btrfs_release_path(path);
4939 
4940 	return ret;
4941 }
4942 
4943 static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
4944 			       struct btrfs_path *path, u64 start)
4945 {
4946 	struct btrfs_fs_info *fs_info = trans->fs_info;
4947 	struct btrfs_chunk_map *chunk_map;
4948 	struct btrfs_key key;
4949 	u64 type;
4950 	int ret;
4951 	struct extent_buffer *leaf;
4952 	struct btrfs_chunk *chunk;
4953 
4954 	read_lock(&fs_info->mapping_tree_lock);
4955 
4956 	chunk_map = btrfs_find_chunk_map_nolock(fs_info, start, 1);
4957 	if (!chunk_map) {
4958 		read_unlock(&fs_info->mapping_tree_lock);
4959 		return -ENOENT;
4960 	}
4961 
4962 	chunk_map->type |= BTRFS_BLOCK_GROUP_REMAPPED;
4963 	type = chunk_map->type;
4964 
4965 	read_unlock(&fs_info->mapping_tree_lock);
4966 
4967 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4968 	key.type = BTRFS_CHUNK_ITEM_KEY;
4969 	key.offset = start;
4970 
4971 	ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
4972 	if (ret == 1) {
4973 		ret = -ENOENT;
4974 		goto end;
4975 	} else if (ret < 0)
4976 		goto end;
4977 
4978 	leaf = path->nodes[0];
4979 
4980 	chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
4981 	btrfs_set_chunk_type(leaf, chunk, type);
4982 	btrfs_mark_buffer_dirty(trans, leaf);
4983 
4984 	ret = 0;
4985 end:
4986 	btrfs_free_chunk_map(chunk_map);
4987 	btrfs_release_path(path);
4988 
4989 	return ret;
4990 }
4991 
4992 static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info,
4993 				struct btrfs_block_group *src_bg,
4994 				struct btrfs_path *path, u64 *last_start)
4995 {
4996 	struct btrfs_trans_handle *trans;
4997 	struct btrfs_root *extent_root;
4998 	struct btrfs_key ins;
4999 	struct btrfs_block_group *dest_bg = NULL;
5000 	u64 start = 0, remap_length = 0;
5001 	u64 length, new_addr, min_size;
5002 	int ret;
5003 	const bool is_data = (src_bg->flags & BTRFS_BLOCK_GROUP_DATA);
5004 	bool no_more = false;
5005 	bool made_reservation = false, bg_needs_free_space;
5006 	struct btrfs_space_info *sinfo = src_bg->space_info;
5007 
5008 	extent_root = btrfs_extent_root(fs_info, src_bg->start);
5009 	if (unlikely(!extent_root)) {
5010 		btrfs_err(fs_info,
5011 			  "missing extent root for block group at offset %llu",
5012 			  src_bg->start);
5013 		return -EUCLEAN;
5014 	}
5015 
5016 	trans = btrfs_start_transaction(extent_root, 0);
5017 	if (IS_ERR(trans))
5018 		return PTR_ERR(trans);
5019 
5020 	mutex_lock(&fs_info->remap_mutex);
5021 
5022 	ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
5023 				       *last_start, &start, &remap_length);
5024 	if (ret == -ENOENT) {
5025 		no_more = true;
5026 		goto next;
5027 	} else if (ret) {
5028 		mutex_unlock(&fs_info->remap_mutex);
5029 		btrfs_end_transaction(trans);
5030 		return ret;
5031 	}
5032 
5033 	/* Try to reserve enough space for block. */
5034 	spin_lock(&sinfo->lock);
5035 	btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
5036 	spin_unlock(&sinfo->lock);
5037 
5038 	if (is_data)
5039 		min_size = fs_info->sectorsize;
5040 	else
5041 		min_size = fs_info->nodesize;
5042 
5043 	/*
5044 	 * We're using btrfs_reserve_extent() to allocate a contiguous
5045 	 * logical address range, but this will become a remap item rather than
5046 	 * an extent in the extent tree.
5047 	 *
5048 	 * Short allocations are fine: it means that we chop off the beginning
5049 	 * of the identity remap that we're processing, and will tackle the
5050 	 * rest of it the next time round.
5051 	 */
5052 	ret = btrfs_reserve_extent(fs_info->fs_root, remap_length, remap_length,
5053 				   min_size, 0, 0, &ins, is_data, false);
5054 	if (ret) {
5055 		spin_lock(&sinfo->lock);
5056 		btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
5057 		spin_unlock(&sinfo->lock);
5058 
5059 		mutex_unlock(&fs_info->remap_mutex);
5060 		btrfs_end_transaction(trans);
5061 		return ret;
5062 	}
5063 
5064 	if (ins.offset < remap_length) {
5065 		spin_lock(&sinfo->lock);
5066 		btrfs_space_info_update_bytes_may_use(sinfo, ins.offset - remap_length);
5067 		spin_unlock(&sinfo->lock);
5068 	}
5069 
5070 	made_reservation = true;
5071 
5072 	new_addr = ins.objectid;
5073 	length = ins.offset;
5074 
5075 	if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
5076 		u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
5077 
5078 		btrfs_free_reserved_extent(fs_info, new_addr + new_length,
5079 					   length - new_length, 0);
5080 
5081 		length = new_length;
5082 	}
5083 
5084 	dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
5085 
5086 	mutex_lock(&dest_bg->free_space_lock);
5087 	bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
5088 				       &dest_bg->runtime_flags);
5089 	mutex_unlock(&dest_bg->free_space_lock);
5090 
5091 	if (bg_needs_free_space) {
5092 		ret = btrfs_add_block_group_free_space(trans, dest_bg);
5093 		if (ret) {
5094 			btrfs_abort_transaction(trans, ret);
5095 			goto fail;
5096 		}
5097 	}
5098 
5099 	ret = copy_remapped_data(fs_info, start, new_addr, length);
5100 	if (ret) {
5101 		btrfs_abort_transaction(trans, ret);
5102 		goto fail;
5103 	}
5104 
5105 	ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
5106 	if (ret) {
5107 		btrfs_abort_transaction(trans, ret);
5108 		goto fail;
5109 	}
5110 
5111 	ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
5112 	if (ret) {
5113 		btrfs_abort_transaction(trans, ret);
5114 		goto fail;
5115 	}
5116 
5117 	adjust_block_group_remap_bytes(trans, dest_bg, length);
5118 	btrfs_free_reserved_bytes(dest_bg, length, 0);
5119 
5120 	spin_lock(&sinfo->lock);
5121 	sinfo->bytes_readonly += length;
5122 	spin_unlock(&sinfo->lock);
5123 
5124 next:
5125 	if (dest_bg)
5126 		btrfs_put_block_group(dest_bg);
5127 
5128 	if (made_reservation)
5129 		btrfs_dec_block_group_reservations(fs_info, new_addr);
5130 
5131 	mutex_unlock(&fs_info->remap_mutex);
5132 
5133 	if (src_bg->identity_remap_count == 0) {
5134 		bool mark_fully_remapped = false;
5135 
5136 		spin_lock(&src_bg->lock);
5137 		if (!test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags)) {
5138 			mark_fully_remapped = true;
5139 			set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags);
5140 		}
5141 		spin_unlock(&src_bg->lock);
5142 
5143 		if (mark_fully_remapped)
5144 			btrfs_mark_bg_fully_remapped(src_bg, trans);
5145 	}
5146 
5147 	ret = btrfs_end_transaction(trans);
5148 	if (ret)
5149 		return ret;
5150 
5151 	if (no_more)
5152 		return 1;
5153 
5154 	*last_start = start;
5155 
5156 	return 0;
5157 
5158 fail:
5159 	if (dest_bg)
5160 		btrfs_put_block_group(dest_bg);
5161 
5162 	btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
5163 
5164 	mutex_unlock(&fs_info->remap_mutex);
5165 	btrfs_end_transaction(trans);
5166 
5167 	return ret;
5168 }
5169 
5170 static int do_remap_reloc(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
5171 			  struct btrfs_block_group *bg)
5172 {
5173 	u64 last_start = bg->start;
5174 	int ret;
5175 
5176 	while (true) {
5177 		ret = do_remap_reloc_trans(fs_info, bg, path, &last_start);
5178 		if (ret) {
5179 			if (ret == 1)
5180 				ret = 0;
5181 			break;
5182 		}
5183 	}
5184 
5185 	return ret;
5186 }
5187 
5188 int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length)
5189 {
5190 	int ret;
5191 	struct btrfs_key key, found_key;
5192 	struct extent_buffer *leaf;
5193 	struct btrfs_remap_item *remap;
5194 	BTRFS_PATH_AUTO_FREE(path);
5195 
5196 	path = btrfs_alloc_path();
5197 	if (!path)
5198 		return -ENOMEM;
5199 
5200 	key.objectid = *logical;
5201 	key.type = (u8)-1;
5202 	key.offset = (u64)-1;
5203 
5204 	ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0);
5205 	if (ret < 0)
5206 		return ret;
5207 
5208 	leaf = path->nodes[0];
5209 	if (path->slots[0] == 0)
5210 		return -ENOENT;
5211 
5212 	path->slots[0]--;
5213 
5214 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5215 
5216 	if (found_key.type != BTRFS_REMAP_KEY &&
5217 	    found_key.type != BTRFS_IDENTITY_REMAP_KEY) {
5218 		return -ENOENT;
5219 	}
5220 
5221 	if (found_key.objectid > *logical ||
5222 	    found_key.objectid + found_key.offset <= *logical) {
5223 		return -ENOENT;
5224 	}
5225 
5226 	if (*logical + *length > found_key.objectid + found_key.offset)
5227 		*length = found_key.objectid + found_key.offset - *logical;
5228 
5229 	if (found_key.type == BTRFS_IDENTITY_REMAP_KEY)
5230 		return 0;
5231 
5232 	remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
5233 	*logical += btrfs_remap_address(leaf, remap) - found_key.objectid;
5234 
5235 	return 0;
5236 }
5237 
5238 static int start_block_group_remapping(struct btrfs_fs_info *fs_info,
5239 				       struct btrfs_path *path,
5240 				       struct btrfs_block_group *bg)
5241 {
5242 	struct btrfs_trans_handle *trans;
5243 	bool bg_already_dirty = true;
5244 	int ret, ret2;
5245 
5246 	ret = btrfs_cache_block_group(bg, true);
5247 	if (ret)
5248 		return ret;
5249 
5250 	trans = btrfs_start_transaction(fs_info->remap_root, 0);
5251 	if (IS_ERR(trans))
5252 		return PTR_ERR(trans);
5253 
5254 	/* We need to run delayed refs, to make sure FST is up to date. */
5255 	ret = btrfs_run_delayed_refs(trans, U64_MAX);
5256 	if (ret) {
5257 		btrfs_end_transaction(trans);
5258 		return ret;
5259 	}
5260 
5261 	mutex_lock(&fs_info->remap_mutex);
5262 
5263 	if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
5264 		ret = 0;
5265 		goto end;
5266 	}
5267 
5268 	ret = create_remap_tree_entries(trans, path, bg);
5269 	if (unlikely(ret)) {
5270 		btrfs_abort_transaction(trans, ret);
5271 		goto end;
5272 	}
5273 
5274 	spin_lock(&bg->lock);
5275 	bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED;
5276 	spin_unlock(&bg->lock);
5277 
5278 	spin_lock(&trans->transaction->dirty_bgs_lock);
5279 	if (list_empty(&bg->dirty_list)) {
5280 		list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
5281 		bg_already_dirty = false;
5282 		btrfs_get_block_group(bg);
5283 	}
5284 	spin_unlock(&trans->transaction->dirty_bgs_lock);
5285 
5286 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
5287 	if (!bg_already_dirty)
5288 		btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
5289 
5290 	ret = mark_chunk_remapped(trans, path, bg->start);
5291 	if (unlikely(ret)) {
5292 		btrfs_abort_transaction(trans, ret);
5293 		goto end;
5294 	}
5295 
5296 	ret = btrfs_remove_block_group_free_space(trans, bg);
5297 	if (unlikely(ret)) {
5298 		btrfs_abort_transaction(trans, ret);
5299 		goto end;
5300 	}
5301 
5302 	btrfs_remove_free_space_cache(bg);
5303 
5304 end:
5305 	mutex_unlock(&fs_info->remap_mutex);
5306 
5307 	ret2 = btrfs_end_transaction(trans);
5308 	if (!ret)
5309 		ret = ret2;
5310 
5311 	return ret;
5312 }
5313 
5314 static int do_nonremap_reloc(struct btrfs_fs_info *fs_info, bool verbose,
5315 			     struct reloc_control *rc)
5316 {
5317 	int ret;
5318 
5319 	while (1) {
5320 		enum reloc_stage finishes_stage;
5321 
5322 		mutex_lock(&fs_info->cleaner_mutex);
5323 		ret = relocate_block_group(rc);
5324 		mutex_unlock(&fs_info->cleaner_mutex);
5325 
5326 		finishes_stage = rc->stage;
5327 		/*
5328 		 * We may have gotten ENOSPC after we already dirtied some
5329 		 * extents.  If writeout happens while we're relocating a
5330 		 * different block group we could end up hitting the
5331 		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
5332 		 * btrfs_reloc_cow_block.  Make sure we write everything out
5333 		 * properly so we don't trip over this problem, and then break
5334 		 * out of the loop if we hit an error.
5335 		 */
5336 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
5337 			int wb_ret;
5338 
5339 			wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode),
5340 							  0, (u64)-1);
5341 			if (wb_ret && ret == 0)
5342 				ret = wb_ret;
5343 			invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1);
5344 			rc->stage = UPDATE_DATA_PTRS;
5345 		}
5346 
5347 		if (ret < 0)
5348 			return ret;
5349 
5350 		if (rc->extents_found == 0)
5351 			break;
5352 
5353 		if (verbose)
5354 			btrfs_info(fs_info, "found %llu extents, stage: %s",
5355 				   rc->extents_found, stage_to_string(finishes_stage));
5356 	}
5357 
5358 	WARN_ON(rc->block_group->pinned > 0);
5359 	WARN_ON(rc->block_group->reserved > 0);
5360 	WARN_ON(rc->block_group->used > 0);
5361 
5362 	return 0;
5363 }
5364 
5365 /*
5366  * function to relocate all extents in a block group.
5367  */
5368 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
5369 			       bool verbose)
5370 {
5371 	struct btrfs_block_group *bg;
5372 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
5373 	struct reloc_control *rc;
5374 	struct inode *inode;
5375 	struct btrfs_path *path = NULL;
5376 	int ret;
5377 	bool bg_is_ro = false;
5378 
5379 	if (unlikely(!extent_root)) {
5380 		btrfs_err(fs_info,
5381 			  "missing extent root for block group at offset %llu",
5382 			  group_start);
5383 		return -EUCLEAN;
5384 	}
5385 
5386 	/*
5387 	 * This only gets set if we had a half-deleted snapshot on mount.  We
5388 	 * cannot allow relocation to start while we're still trying to clean up
5389 	 * these pending deletions.
5390 	 */
5391 	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
5392 	if (ret)
5393 		return ret;
5394 
5395 	/* We may have been woken up by close_ctree, so bail if we're closing. */
5396 	if (btrfs_fs_closing(fs_info))
5397 		return -EINTR;
5398 
5399 	bg = btrfs_lookup_block_group(fs_info, group_start);
5400 	if (!bg)
5401 		return -ENOENT;
5402 
5403 	/*
5404 	 * Relocation of a data block group creates ordered extents.  Without
5405 	 * sb_start_write(), we can freeze the filesystem while unfinished
5406 	 * ordered extents are left. Such ordered extents can cause a deadlock
5407 	 * e.g. when syncfs() is waiting for their completion but they can't
5408 	 * finish because they block when joining a transaction, due to the
5409 	 * fact that the freeze locks are being held in write mode.
5410 	 */
5411 	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
5412 		ASSERT(sb_write_started(fs_info->sb));
5413 
5414 	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
5415 		btrfs_put_block_group(bg);
5416 		return -ETXTBSY;
5417 	}
5418 
5419 	rc = alloc_reloc_control(fs_info);
5420 	if (!rc) {
5421 		btrfs_put_block_group(bg);
5422 		return -ENOMEM;
5423 	}
5424 
5425 	rc->extent_root = extent_root;
5426 	/* Block group ref now owned by rc, put_reloc_control() will drop it. */
5427 	rc->block_group = bg;
5428 
5429 	ret = reloc_chunk_start(fs_info);
5430 	if (ret < 0)
5431 		goto out_put_rc;
5432 
5433 	ret = btrfs_inc_block_group_ro(rc->block_group, true);
5434 	if (ret)
5435 		goto out;
5436 	bg_is_ro = true;
5437 
5438 	path = btrfs_alloc_path();
5439 	if (!path) {
5440 		ret = -ENOMEM;
5441 		goto out;
5442 	}
5443 
5444 	inode = lookup_free_space_inode(rc->block_group, path);
5445 	btrfs_release_path(path);
5446 
5447 	if (!IS_ERR(inode))
5448 		ret = delete_block_group_cache(rc->block_group, inode, 0);
5449 	else
5450 		ret = PTR_ERR(inode);
5451 
5452 	if (ret && ret != -ENOENT)
5453 		goto out;
5454 
5455 	if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) {
5456 		rc->data_inode = create_reloc_inode(rc->block_group);
5457 		if (IS_ERR(rc->data_inode)) {
5458 			ret = PTR_ERR(rc->data_inode);
5459 			rc->data_inode = NULL;
5460 			goto out;
5461 		}
5462 	}
5463 
5464 	if (verbose)
5465 		describe_relocation(rc->block_group);
5466 
5467 	btrfs_wait_block_group_reservations(rc->block_group);
5468 	btrfs_wait_nocow_writers(rc->block_group);
5469 	btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);
5470 
5471 	ret = btrfs_zone_finish(rc->block_group);
5472 	WARN_ON(ret && ret != -EAGAIN);
5473 
5474 	if (should_relocate_using_remap_tree(bg)) {
5475 		if (bg->remap_bytes != 0) {
5476 			ret = move_existing_remaps(fs_info, bg, path);
5477 			if (ret)
5478 				goto out;
5479 		}
5480 		ret = start_block_group_remapping(fs_info, path, bg);
5481 		if (ret)
5482 			goto out;
5483 
5484 		ret = do_remap_reloc(fs_info, path, rc->block_group);
5485 		if (ret)
5486 			goto out;
5487 
5488 		btrfs_delete_unused_bgs(fs_info);
5489 	} else {
5490 		ret = do_nonremap_reloc(fs_info, verbose, rc);
5491 	}
5492 
5493 out:
5494 	if (ret && bg_is_ro)
5495 		btrfs_dec_block_group_ro(rc->block_group);
5496 	if (!btrfs_fs_incompat(fs_info, REMAP_TREE))
5497 		iput(rc->data_inode);
5498 	btrfs_free_path(path);
5499 	reloc_chunk_end(fs_info);
5500 out_put_rc:
5501 	put_reloc_control(rc);
5502 	return ret;
5503 }
5504 
5505 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
5506 {
5507 	struct btrfs_fs_info *fs_info = root->fs_info;
5508 	struct btrfs_trans_handle *trans;
5509 	int ret, err;
5510 
5511 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
5512 	if (IS_ERR(trans))
5513 		return PTR_ERR(trans);
5514 
5515 	memset(&root->root_item.drop_progress, 0,
5516 		sizeof(root->root_item.drop_progress));
5517 	btrfs_set_root_drop_level(&root->root_item, 0);
5518 	btrfs_set_root_refs(&root->root_item, 0);
5519 	ret = btrfs_update_root(trans, fs_info->tree_root,
5520 				&root->root_key, &root->root_item);
5521 
5522 	err = btrfs_end_transaction(trans);
5523 	if (err)
5524 		return err;
5525 	return ret;
5526 }
5527 
5528 /*
5529  * recover relocation interrupted by system crash.
5530  *
5531  * this function resumes merging reloc trees with corresponding fs trees.
5532  * this is important for keeping the sharing of tree blocks
5533  */
5534 int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
5535 {
5536 	LIST_HEAD(reloc_roots);
5537 	struct btrfs_key key;
5538 	struct btrfs_root *fs_root;
5539 	struct btrfs_root *reloc_root;
5540 	struct btrfs_path *path;
5541 	struct extent_buffer *leaf;
5542 	struct reloc_control *rc = NULL;
5543 	struct btrfs_trans_handle *trans;
5544 	int ret2;
5545 	int ret = 0;
5546 
5547 	path = btrfs_alloc_path();
5548 	if (!path)
5549 		return -ENOMEM;
5550 	path->reada = READA_BACK;
5551 
5552 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5553 	key.type = BTRFS_ROOT_ITEM_KEY;
5554 	key.offset = (u64)-1;
5555 
5556 	while (1) {
5557 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
5558 					path, 0, 0);
5559 		if (ret < 0)
5560 			goto out;
5561 		if (ret > 0) {
5562 			if (path->slots[0] == 0)
5563 				break;
5564 			path->slots[0]--;
5565 		}
5566 		ret = 0;
5567 		leaf = path->nodes[0];
5568 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5569 		btrfs_release_path(path);
5570 
5571 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
5572 		    key.type != BTRFS_ROOT_ITEM_KEY)
5573 			break;
5574 
5575 		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
5576 		if (IS_ERR(reloc_root)) {
5577 			ret = PTR_ERR(reloc_root);
5578 			goto out;
5579 		}
5580 
5581 		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
5582 		list_add(&reloc_root->root_list, &reloc_roots);
5583 
5584 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
5585 			fs_root = btrfs_get_fs_root(fs_info,
5586 					reloc_root->root_key.offset, false);
5587 			if (IS_ERR(fs_root)) {
5588 				ret = PTR_ERR(fs_root);
5589 				if (ret != -ENOENT)
5590 					goto out;
5591 				ret = mark_garbage_root(reloc_root);
5592 				if (ret < 0)
5593 					goto out;
5594 				ret = 0;
5595 			} else {
5596 				btrfs_put_root(fs_root);
5597 			}
5598 		}
5599 
5600 		if (key.offset == 0)
5601 			break;
5602 
5603 		key.offset--;
5604 	}
5605 	btrfs_release_path(path);
5606 
5607 	if (list_empty(&reloc_roots))
5608 		goto out;
5609 
5610 	rc = alloc_reloc_control(fs_info);
5611 	if (!rc) {
5612 		ret = -ENOMEM;
5613 		goto out;
5614 	}
5615 
5616 	rc->extent_root = btrfs_extent_root(fs_info, 0);
5617 	if (unlikely(!rc->extent_root)) {
5618 		btrfs_err(fs_info, "missing extent root for extent at bytenr 0");
5619 		ret = -EUCLEAN;
5620 		goto out;
5621 	}
5622 
5623 	ret = reloc_chunk_start(fs_info);
5624 	if (ret < 0)
5625 		goto out_end;
5626 
5627 	set_reloc_control(rc);
5628 
5629 	trans = btrfs_join_transaction(rc->extent_root);
5630 	if (IS_ERR(trans)) {
5631 		ret = PTR_ERR(trans);
5632 		goto out_unset;
5633 	}
5634 
5635 	rc->merge_reloc_tree = true;
5636 
5637 	while (!list_empty(&reloc_roots)) {
5638 		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
5639 		list_del(&reloc_root->root_list);
5640 
5641 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
5642 			list_add_tail(&reloc_root->root_list,
5643 				      &rc->reloc_roots);
5644 			continue;
5645 		}
5646 
5647 		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
5648 					    false);
5649 		if (IS_ERR(fs_root)) {
5650 			ret = PTR_ERR(fs_root);
5651 			list_add_tail(&reloc_root->root_list, &reloc_roots);
5652 			btrfs_end_transaction(trans);
5653 			goto out_unset;
5654 		}
5655 
5656 		ret = __add_reloc_root(reloc_root, rc);
5657 		ASSERT(ret != -EEXIST);
5658 		if (ret) {
5659 			list_add_tail(&reloc_root->root_list, &reloc_roots);
5660 			btrfs_put_root(fs_root);
5661 			btrfs_end_transaction(trans);
5662 			goto out_unset;
5663 		}
5664 		fs_root->reloc_root = btrfs_grab_root(reloc_root);
5665 		btrfs_put_root(fs_root);
5666 	}
5667 
5668 	ret = btrfs_commit_transaction(trans);
5669 	if (ret)
5670 		goto out_unset;
5671 
5672 	merge_reloc_roots(rc);
5673 
5674 	unset_reloc_control(rc);
5675 
5676 	trans = btrfs_join_transaction(rc->extent_root);
5677 	if (IS_ERR(trans)) {
5678 		ret = PTR_ERR(trans);
5679 		goto out_clean;
5680 	}
5681 	ret = btrfs_commit_transaction(trans);
5682 out_clean:
5683 	ret2 = clean_dirty_subvols(rc);
5684 	if (ret2 < 0 && !ret)
5685 		ret = ret2;
5686 out_unset:
5687 	unset_reloc_control(rc);
5688 	reloc_chunk_end(fs_info);
5689 out_end:
5690 	put_reloc_control(rc);
5691 out:
5692 	free_reloc_roots(&reloc_roots);
5693 
5694 	btrfs_free_path(path);
5695 
5696 	if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
5697 		/* cleanup orphan inode in data relocation tree */
5698 		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
5699 		ASSERT(fs_root);
5700 		ret = btrfs_orphan_cleanup(fs_root);
5701 		btrfs_put_root(fs_root);
5702 	}
5703 	return ret;
5704 }
5705 
5706 /*
5707  * helper to add ordered checksum for data relocation.
5708  *
5709  * cloning checksum properly handles the nodatasum extents.
5710  * it also saves CPU time to re-calculate the checksum.
5711  */
5712 int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
5713 {
5714 	struct btrfs_inode *inode = ordered->inode;
5715 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5716 	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
5717 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
5718 	LIST_HEAD(list);
5719 	int ret;
5720 
5721 	if (unlikely(!csum_root)) {
5722 		btrfs_mark_ordered_extent_error(ordered);
5723 		btrfs_err(fs_info,
5724 			  "missing csum root for extent at bytenr %llu",
5725 			  disk_bytenr);
5726 		return -EUCLEAN;
5727 	}
5728 
5729 	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
5730 				      disk_bytenr + ordered->num_bytes - 1,
5731 				      &list, false);
5732 	if (ret < 0) {
5733 		btrfs_mark_ordered_extent_error(ordered);
5734 		return ret;
5735 	}
5736 
5737 	while (!list_empty(&list)) {
5738 		struct btrfs_ordered_sum *sums =
5739 			list_first_entry(&list, struct btrfs_ordered_sum, list);
5740 
5741 		list_del_init(&sums->list);
5742 
5743 		/*
5744 		 * We need to offset the new_bytenr based on where the csum is.
5745 		 * We need to do this because we will read in entire prealloc
5746 		 * extents but we may have written to say the middle of the
5747 		 * prealloc extent, so we need to make sure the csum goes with
5748 		 * the right disk offset.
5749 		 *
5750 		 * We can do this because the data reloc inode refers strictly
5751 		 * to the on disk bytes, so we don't have to worry about
5752 		 * disk_len vs real len like with real inodes since it's all
5753 		 * disk length.
5754 		 */
5755 		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
5756 		btrfs_add_ordered_sum(ordered, sums);
5757 	}
5758 
5759 	return 0;
5760 }
5761 
5762 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
5763 			  struct btrfs_root *root,
5764 			  const struct extent_buffer *buf,
5765 			  struct extent_buffer *cow)
5766 {
5767 	struct btrfs_fs_info *fs_info = root->fs_info;
5768 	struct reloc_control *rc;
5769 	struct btrfs_backref_node *node;
5770 	bool first_cow = false;
5771 	int level;
5772 	int ret = 0;
5773 
5774 	rc = get_reloc_control(fs_info);
5775 	if (!rc)
5776 		return 0;
5777 
5778 	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
5779 
5780 	level = btrfs_header_level(buf);
5781 	if (btrfs_header_generation(buf) <=
5782 	    btrfs_root_last_snapshot(&root->root_item))
5783 		first_cow = true;
5784 
5785 	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
5786 		WARN_ON(!first_cow && level == 0);
5787 
5788 		node = rc->backref_cache.path[level];
5789 
5790 		/*
5791 		 * If node->bytenr != buf->start and node->new_bytenr !=
5792 		 * buf->start then we've got the wrong backref node for what we
5793 		 * expected to see here and the cache is incorrect.
5794 		 */
5795 		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
5796 			btrfs_err(fs_info,
5797 "bytenr %llu was found but our backref cache was expecting %llu or %llu",
5798 				  buf->start, node->bytenr, node->new_bytenr);
5799 			ret = -EUCLEAN;
5800 			goto out;
5801 		}
5802 
5803 		btrfs_backref_drop_node_buffer(node);
5804 		refcount_inc(&cow->refs);
5805 		node->eb = cow;
5806 		node->new_bytenr = cow->start;
5807 
5808 		if (!node->pending) {
5809 			list_move_tail(&node->list,
5810 				       &rc->backref_cache.pending[level]);
5811 			node->pending = 1;
5812 		}
5813 
5814 		if (first_cow)
5815 			mark_block_processed(rc, node);
5816 
5817 		if (first_cow && level > 0)
5818 			rc->nodes_relocated += buf->len;
5819 	}
5820 
5821 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
5822 		ret = replace_file_extents(trans, rc, root, cow);
5823 out:
5824 	put_reloc_control(rc);
5825 
5826 	return ret;
5827 }
5828 
5829 /*
5830  * called before creating snapshot. it calculates metadata reservation
5831  * required for relocating tree blocks in the snapshot
5832  */
5833 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
5834 			      u64 *bytes_to_reserve)
5835 {
5836 	struct btrfs_root *root = pending->root;
5837 	struct reloc_control *rc = root->fs_info->reloc_ctl;
5838 
5839 	if (!rc || !have_reloc_root(root))
5840 		return;
5841 
5842 	if (!rc->merge_reloc_tree)
5843 		return;
5844 
5845 	root = root->reloc_root;
5846 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
5847 	/*
5848 	 * relocation is in the stage of merging trees. the space
5849 	 * used by merging a reloc tree is twice the size of
5850 	 * relocated tree nodes in the worst case. half for cowing
5851 	 * the reloc tree, half for cowing the fs tree. the space
5852 	 * used by cowing the reloc tree will be freed after the
5853 	 * tree is dropped. if we create snapshot, cowing the fs
5854 	 * tree may use more space than it frees. so we need
5855 	 * reserve extra space.
5856 	 */
5857 	*bytes_to_reserve += rc->nodes_relocated;
5858 }
5859 
5860 /*
5861  * called after snapshot is created. migrate block reservation
5862  * and create reloc root for the newly created snapshot
5863  *
5864  * This is similar to btrfs_init_reloc_root(), we come out of here with two
5865  * references held on the reloc_root, one for root->reloc_root and one for
5866  * rc->reloc_roots.
5867  */
5868 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
5869 			       struct btrfs_pending_snapshot *pending)
5870 {
5871 	struct btrfs_root *root = pending->root;
5872 	struct btrfs_root *reloc_root;
5873 	struct btrfs_root *new_root;
5874 	struct reloc_control *rc;
5875 	int ret = 0;
5876 
5877 	rc = get_reloc_control(trans->fs_info);
5878 	if (!rc)
5879 		return 0;
5880 
5881 	if (!have_reloc_root(root))
5882 		goto out;
5883 
5884 	rc->merging_rsv_size += rc->nodes_relocated;
5885 
5886 	if (rc->merge_reloc_tree) {
5887 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
5888 					      rc->block_rsv,
5889 					      rc->nodes_relocated, true);
5890 		if (ret)
5891 			goto out;
5892 	}
5893 
5894 	new_root = pending->snap;
5895 	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
5896 	if (IS_ERR(reloc_root)) {
5897 		ret = PTR_ERR(reloc_root);
5898 		goto out;
5899 	}
5900 
5901 	ret = __add_reloc_root(reloc_root, rc);
5902 	ASSERT(ret != -EEXIST);
5903 	if (ret) {
5904 		/* Pairs with create_reloc_root */
5905 		btrfs_put_root(reloc_root);
5906 		goto out;
5907 	}
5908 	new_root->reloc_root = btrfs_grab_root(reloc_root);
5909 out:
5910 	put_reloc_control(rc);
5911 
5912 	return ret;
5913 }
5914 
5915 /*
5916  * Get the current bytenr for the block group which is being relocated.
5917  *
5918  * Return U64_MAX if no running relocation.
5919  */
5920 u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
5921 {
5922 	u64 logical = U64_MAX;
5923 
5924 	mutex_lock(&fs_info->reloc_mutex);
5925 	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
5926 		logical = fs_info->reloc_ctl->block_group->start;
5927 	mutex_unlock(&fs_info->reloc_mutex);
5928 
5929 	return logical;
5930 }
5931 
5932 static int insert_remap_item(struct btrfs_trans_handle *trans, struct btrfs_path *path,
5933 			     u64 old_addr, u64 length, u64 new_addr)
5934 {
5935 	int ret;
5936 	struct btrfs_fs_info *fs_info = trans->fs_info;
5937 	struct btrfs_key key;
5938 	struct btrfs_remap_item remap = { 0 };
5939 
5940 	if (old_addr == new_addr) {
5941 		/* Add new identity remap item. */
5942 		key.objectid = old_addr;
5943 		key.type = BTRFS_IDENTITY_REMAP_KEY;
5944 		key.offset = length;
5945 
5946 		ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
5947 					      &key, 0);
5948 		if (ret)
5949 			return ret;
5950 	} else {
5951 		/* Add new remap item. */
5952 		key.objectid = old_addr;
5953 		key.type = BTRFS_REMAP_KEY;
5954 		key.offset = length;
5955 
5956 		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
5957 					      path, &key, sizeof(struct btrfs_remap_item));
5958 		if (ret)
5959 			return ret;
5960 
5961 		btrfs_set_stack_remap_address(&remap, new_addr);
5962 
5963 		write_extent_buffer(path->nodes[0], &remap,
5964 			btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
5965 			sizeof(struct btrfs_remap_item));
5966 
5967 		btrfs_release_path(path);
5968 
5969 		/* Add new backref item. */
5970 		key.objectid = new_addr;
5971 		key.type = BTRFS_REMAP_BACKREF_KEY;
5972 		key.offset = length;
5973 
5974 		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
5975 					      path, &key,
5976 					      sizeof(struct btrfs_remap_item));
5977 		if (ret)
5978 			return ret;
5979 
5980 		btrfs_set_stack_remap_address(&remap, old_addr);
5981 
5982 		write_extent_buffer(path->nodes[0], &remap,
5983 			btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
5984 			sizeof(struct btrfs_remap_item));
5985 	}
5986 
5987 	btrfs_release_path(path);
5988 
5989 	return 0;
5990 }
5991 
5992 /*
5993  * Punch a hole in the remap item or identity remap item pointed to by path,
5994  * for the range [hole_start, hole_start + hole_length).
5995  */
5996 static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
5997 					struct btrfs_path *path,
5998 					struct btrfs_block_group *bg,
5999 					u64 hole_start, u64 hole_length)
6000 {
6001 	int ret;
6002 	struct btrfs_fs_info *fs_info = trans->fs_info;
6003 	struct extent_buffer *leaf = path->nodes[0];
6004 	struct btrfs_key key;
6005 	u64 hole_end, new_addr, remap_start, remap_length, remap_end;
6006 	u64 overlap_length;
6007 	bool is_identity_remap;
6008 	int identity_count_delta = 0;
6009 
6010 	hole_end = hole_start + hole_length;
6011 
6012 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6013 
6014 	is_identity_remap = (key.type == BTRFS_IDENTITY_REMAP_KEY);
6015 
6016 	remap_start = key.objectid;
6017 	remap_length = key.offset;
6018 	remap_end = remap_start + remap_length;
6019 
6020 	if (is_identity_remap) {
6021 		new_addr = remap_start;
6022 	} else {
6023 		struct btrfs_remap_item *remap_ptr;
6024 
6025 		remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
6026 		new_addr = btrfs_remap_address(leaf, remap_ptr);
6027 	}
6028 
6029 	/* Delete old item. */
6030 	ret = btrfs_del_item(trans, fs_info->remap_root, path);
6031 	btrfs_release_path(path);
6032 	if (ret)
6033 		return ret;
6034 
6035 	if (is_identity_remap) {
6036 		identity_count_delta = -1;
6037 	} else {
6038 		/* Remove backref. */
6039 		key.objectid = new_addr;
6040 		key.type = BTRFS_REMAP_BACKREF_KEY;
6041 		key.offset = remap_length;
6042 
6043 		ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
6044 		if (ret) {
6045 			if (ret == 1) {
6046 				btrfs_release_path(path);
6047 				ret = -ENOENT;
6048 			}
6049 			return ret;
6050 		}
6051 
6052 		ret = btrfs_del_item(trans, fs_info->remap_root, path);
6053 
6054 		btrfs_release_path(path);
6055 
6056 		if (ret)
6057 			return ret;
6058 	}
6059 
6060 	/* If hole_start > remap_start, re-add the start of the remap item. */
6061 	if (hole_start > remap_start) {
6062 		ret = insert_remap_item(trans, path, remap_start,
6063 					hole_start - remap_start, new_addr);
6064 		if (ret)
6065 			return ret;
6066 
6067 		if (is_identity_remap)
6068 			identity_count_delta++;
6069 	}
6070 
6071 	/* If hole_end < remap_end, re-add the end of the remap item. */
6072 	if (hole_end < remap_end) {
6073 		ret = insert_remap_item(trans, path, hole_end,
6074 					remap_end - hole_end,
6075 					hole_end - remap_start + new_addr);
6076 		if (ret)
6077 			return ret;
6078 
6079 		if (is_identity_remap)
6080 			identity_count_delta++;
6081 	}
6082 
6083 	if (identity_count_delta != 0)
6084 		adjust_identity_remap_count(trans, bg, identity_count_delta);
6085 
6086 	overlap_length = min_t(u64, hole_end, remap_end) -
6087 			 max_t(u64, hole_start, remap_start);
6088 
6089 	if (!is_identity_remap) {
6090 		struct btrfs_block_group *dest_bg;
6091 
6092 		dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
6093 		if (unlikely(!dest_bg))
6094 			return -EUCLEAN;
6095 
6096 		adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length);
6097 		btrfs_put_block_group(dest_bg);
6098 		ret = btrfs_add_to_free_space_tree(trans,
6099 						   hole_start - remap_start + new_addr,
6100 						   overlap_length);
6101 		if (ret)
6102 			return ret;
6103 	}
6104 
6105 	ret = overlap_length;
6106 
6107 	return ret;
6108 }
6109 
6110 /*
6111  * Return 1 if remove_range_from_remap_tree() has been called successfully,
6112  * 0 if block group wasn't remapped, and a negative number on error.
6113  */
6114 int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
6115 					struct btrfs_path *path,
6116 					u64 bytenr, u64 num_bytes)
6117 {
6118 	struct btrfs_fs_info *fs_info = trans->fs_info;
6119 	struct btrfs_key key, found_key;
6120 	struct extent_buffer *leaf;
6121 	struct btrfs_block_group *bg;
6122 	int ret, length;
6123 
6124 	if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
6125 	      BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
6126 		return 0;
6127 
6128 	bg = btrfs_lookup_block_group(fs_info, bytenr);
6129 	if (!bg)
6130 		return 0;
6131 
6132 	mutex_lock(&fs_info->remap_mutex);
6133 
6134 	if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
6135 		mutex_unlock(&fs_info->remap_mutex);
6136 		btrfs_put_block_group(bg);
6137 		return 0;
6138 	}
6139 
6140 	do {
6141 		key.objectid = bytenr;
6142 		key.type = (u8)-1;
6143 		key.offset = (u64)-1;
6144 
6145 		ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
6146 		if (ret < 0)
6147 			goto end;
6148 
6149 		leaf = path->nodes[0];
6150 		if (path->slots[0] == 0) {
6151 			ret = -ENOENT;
6152 			goto end;
6153 		}
6154 
6155 		path->slots[0]--;
6156 
6157 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6158 
6159 		if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
6160 		    found_key.type != BTRFS_REMAP_KEY) {
6161 			ret = -ENOENT;
6162 			goto end;
6163 		}
6164 
6165 		if (bytenr < found_key.objectid ||
6166 		    bytenr >= found_key.objectid + found_key.offset) {
6167 			ret = -ENOENT;
6168 			goto end;
6169 		}
6170 
6171 		length = remove_range_from_remap_tree(trans, path, bg, bytenr, num_bytes);
6172 		if (length < 0) {
6173 			ret = length;
6174 			goto end;
6175 		}
6176 
6177 		bytenr += length;
6178 		num_bytes -= length;
6179 	} while (num_bytes > 0);
6180 
6181 	ret = 1;
6182 
6183 end:
6184 	mutex_unlock(&fs_info->remap_mutex);
6185 
6186 	btrfs_put_block_group(bg);
6187 	btrfs_release_path(path);
6188 
6189 	return ret;
6190 }
6191