xref: /linux/fs/btrfs/relocation.c (revision 98838d95075a5295f3478ceba18bcccf472e30f4)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34 #include "qgroup.h"
35 
36 /*
37  * backref_node, mapping_node and tree_block start with this
38  */
39 struct tree_entry {
40 	struct rb_node rb_node;
41 	u64 bytenr;
42 };
43 
44 /*
45  * present a tree block in the backref cache
46  */
47 struct backref_node {
48 	struct rb_node rb_node;
49 	u64 bytenr;
50 
51 	u64 new_bytenr;
52 	/* objectid of tree block owner, can be not uptodate */
53 	u64 owner;
54 	/* link to pending, changed or detached list */
55 	struct list_head list;
56 	/* list of upper level blocks reference this block */
57 	struct list_head upper;
58 	/* list of child blocks in the cache */
59 	struct list_head lower;
60 	/* NULL if this node is not tree root */
61 	struct btrfs_root *root;
62 	/* extent buffer got by COW the block */
63 	struct extent_buffer *eb;
64 	/* level of tree block */
65 	unsigned int level:8;
66 	/* is the block in non-reference counted tree */
67 	unsigned int cowonly:1;
68 	/* 1 if no child node in the cache */
69 	unsigned int lowest:1;
70 	/* is the extent buffer locked */
71 	unsigned int locked:1;
72 	/* has the block been processed */
73 	unsigned int processed:1;
74 	/* have backrefs of this block been checked */
75 	unsigned int checked:1;
76 	/*
77 	 * 1 if corresponding block has been cowed but some upper
78 	 * level block pointers may not point to the new location
79 	 */
80 	unsigned int pending:1;
81 	/*
82 	 * 1 if the backref node isn't connected to any other
83 	 * backref node.
84 	 */
85 	unsigned int detached:1;
86 };
87 
88 /*
89  * present a block pointer in the backref cache
90  */
91 struct backref_edge {
92 	struct list_head list[2];
93 	struct backref_node *node[2];
94 };
95 
96 #define LOWER	0
97 #define UPPER	1
98 #define RELOCATION_RESERVED_NODES	256
99 
100 struct backref_cache {
101 	/* red black tree of all backref nodes in the cache */
102 	struct rb_root rb_root;
103 	/* for passing backref nodes to btrfs_reloc_cow_block */
104 	struct backref_node *path[BTRFS_MAX_LEVEL];
105 	/*
106 	 * list of blocks that have been cowed but some block
107 	 * pointers in upper level blocks may not reflect the
108 	 * new location
109 	 */
110 	struct list_head pending[BTRFS_MAX_LEVEL];
111 	/* list of backref nodes with no child node */
112 	struct list_head leaves;
113 	/* list of blocks that have been cowed in current transaction */
114 	struct list_head changed;
115 	/* list of detached backref node. */
116 	struct list_head detached;
117 
118 	u64 last_trans;
119 
120 	int nr_nodes;
121 	int nr_edges;
122 };
123 
124 /*
125  * map address of tree root to tree
126  */
127 struct mapping_node {
128 	struct rb_node rb_node;
129 	u64 bytenr;
130 	void *data;
131 };
132 
133 struct mapping_tree {
134 	struct rb_root rb_root;
135 	spinlock_t lock;
136 };
137 
138 /*
139  * present a tree block to process
140  */
141 struct tree_block {
142 	struct rb_node rb_node;
143 	u64 bytenr;
144 	struct btrfs_key key;
145 	unsigned int level:8;
146 	unsigned int key_ready:1;
147 };
148 
149 #define MAX_EXTENTS 128
150 
151 struct file_extent_cluster {
152 	u64 start;
153 	u64 end;
154 	u64 boundary[MAX_EXTENTS];
155 	unsigned int nr;
156 };
157 
158 struct reloc_control {
159 	/* block group to relocate */
160 	struct btrfs_block_group_cache *block_group;
161 	/* extent tree */
162 	struct btrfs_root *extent_root;
163 	/* inode for moving data */
164 	struct inode *data_inode;
165 
166 	struct btrfs_block_rsv *block_rsv;
167 
168 	struct backref_cache backref_cache;
169 
170 	struct file_extent_cluster cluster;
171 	/* tree blocks have been processed */
172 	struct extent_io_tree processed_blocks;
173 	/* map start of tree root to corresponding reloc tree */
174 	struct mapping_tree reloc_root_tree;
175 	/* list of reloc trees */
176 	struct list_head reloc_roots;
177 	/* size of metadata reservation for merging reloc trees */
178 	u64 merging_rsv_size;
179 	/* size of relocated tree nodes */
180 	u64 nodes_relocated;
181 	/* reserved size for block group relocation*/
182 	u64 reserved_bytes;
183 
184 	u64 search_start;
185 	u64 extents_found;
186 
187 	unsigned int stage:8;
188 	unsigned int create_reloc_tree:1;
189 	unsigned int merge_reloc_tree:1;
190 	unsigned int found_file_extent:1;
191 };
192 
193 /* stages of data relocation */
194 #define MOVE_DATA_EXTENTS	0
195 #define UPDATE_DATA_PTRS	1
196 
197 static void remove_backref_node(struct backref_cache *cache,
198 				struct backref_node *node);
199 static void __mark_block_processed(struct reloc_control *rc,
200 				   struct backref_node *node);
201 
202 static void mapping_tree_init(struct mapping_tree *tree)
203 {
204 	tree->rb_root = RB_ROOT;
205 	spin_lock_init(&tree->lock);
206 }
207 
208 static void backref_cache_init(struct backref_cache *cache)
209 {
210 	int i;
211 	cache->rb_root = RB_ROOT;
212 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
213 		INIT_LIST_HEAD(&cache->pending[i]);
214 	INIT_LIST_HEAD(&cache->changed);
215 	INIT_LIST_HEAD(&cache->detached);
216 	INIT_LIST_HEAD(&cache->leaves);
217 }
218 
219 static void backref_cache_cleanup(struct backref_cache *cache)
220 {
221 	struct backref_node *node;
222 	int i;
223 
224 	while (!list_empty(&cache->detached)) {
225 		node = list_entry(cache->detached.next,
226 				  struct backref_node, list);
227 		remove_backref_node(cache, node);
228 	}
229 
230 	while (!list_empty(&cache->leaves)) {
231 		node = list_entry(cache->leaves.next,
232 				  struct backref_node, lower);
233 		remove_backref_node(cache, node);
234 	}
235 
236 	cache->last_trans = 0;
237 
238 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
239 		ASSERT(list_empty(&cache->pending[i]));
240 	ASSERT(list_empty(&cache->changed));
241 	ASSERT(list_empty(&cache->detached));
242 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
243 	ASSERT(!cache->nr_nodes);
244 	ASSERT(!cache->nr_edges);
245 }
246 
247 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
248 {
249 	struct backref_node *node;
250 
251 	node = kzalloc(sizeof(*node), GFP_NOFS);
252 	if (node) {
253 		INIT_LIST_HEAD(&node->list);
254 		INIT_LIST_HEAD(&node->upper);
255 		INIT_LIST_HEAD(&node->lower);
256 		RB_CLEAR_NODE(&node->rb_node);
257 		cache->nr_nodes++;
258 	}
259 	return node;
260 }
261 
262 static void free_backref_node(struct backref_cache *cache,
263 			      struct backref_node *node)
264 {
265 	if (node) {
266 		cache->nr_nodes--;
267 		kfree(node);
268 	}
269 }
270 
271 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
272 {
273 	struct backref_edge *edge;
274 
275 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
276 	if (edge)
277 		cache->nr_edges++;
278 	return edge;
279 }
280 
281 static void free_backref_edge(struct backref_cache *cache,
282 			      struct backref_edge *edge)
283 {
284 	if (edge) {
285 		cache->nr_edges--;
286 		kfree(edge);
287 	}
288 }
289 
290 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
291 				   struct rb_node *node)
292 {
293 	struct rb_node **p = &root->rb_node;
294 	struct rb_node *parent = NULL;
295 	struct tree_entry *entry;
296 
297 	while (*p) {
298 		parent = *p;
299 		entry = rb_entry(parent, struct tree_entry, rb_node);
300 
301 		if (bytenr < entry->bytenr)
302 			p = &(*p)->rb_left;
303 		else if (bytenr > entry->bytenr)
304 			p = &(*p)->rb_right;
305 		else
306 			return parent;
307 	}
308 
309 	rb_link_node(node, parent, p);
310 	rb_insert_color(node, root);
311 	return NULL;
312 }
313 
314 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
315 {
316 	struct rb_node *n = root->rb_node;
317 	struct tree_entry *entry;
318 
319 	while (n) {
320 		entry = rb_entry(n, struct tree_entry, rb_node);
321 
322 		if (bytenr < entry->bytenr)
323 			n = n->rb_left;
324 		else if (bytenr > entry->bytenr)
325 			n = n->rb_right;
326 		else
327 			return n;
328 	}
329 	return NULL;
330 }
331 
332 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
333 {
334 
335 	struct btrfs_fs_info *fs_info = NULL;
336 	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
337 					      rb_node);
338 	if (bnode->root)
339 		fs_info = bnode->root->fs_info;
340 	btrfs_panic(fs_info, errno,
341 		    "Inconsistency in backref cache found at offset %llu",
342 		    bytenr);
343 }
344 
345 /*
346  * walk up backref nodes until reach node presents tree root
347  */
348 static struct backref_node *walk_up_backref(struct backref_node *node,
349 					    struct backref_edge *edges[],
350 					    int *index)
351 {
352 	struct backref_edge *edge;
353 	int idx = *index;
354 
355 	while (!list_empty(&node->upper)) {
356 		edge = list_entry(node->upper.next,
357 				  struct backref_edge, list[LOWER]);
358 		edges[idx++] = edge;
359 		node = edge->node[UPPER];
360 	}
361 	BUG_ON(node->detached);
362 	*index = idx;
363 	return node;
364 }
365 
366 /*
367  * walk down backref nodes to find start of next reference path
368  */
369 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
370 					      int *index)
371 {
372 	struct backref_edge *edge;
373 	struct backref_node *lower;
374 	int idx = *index;
375 
376 	while (idx > 0) {
377 		edge = edges[idx - 1];
378 		lower = edge->node[LOWER];
379 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
380 			idx--;
381 			continue;
382 		}
383 		edge = list_entry(edge->list[LOWER].next,
384 				  struct backref_edge, list[LOWER]);
385 		edges[idx - 1] = edge;
386 		*index = idx;
387 		return edge->node[UPPER];
388 	}
389 	*index = 0;
390 	return NULL;
391 }
392 
393 static void unlock_node_buffer(struct backref_node *node)
394 {
395 	if (node->locked) {
396 		btrfs_tree_unlock(node->eb);
397 		node->locked = 0;
398 	}
399 }
400 
401 static void drop_node_buffer(struct backref_node *node)
402 {
403 	if (node->eb) {
404 		unlock_node_buffer(node);
405 		free_extent_buffer(node->eb);
406 		node->eb = NULL;
407 	}
408 }
409 
410 static void drop_backref_node(struct backref_cache *tree,
411 			      struct backref_node *node)
412 {
413 	BUG_ON(!list_empty(&node->upper));
414 
415 	drop_node_buffer(node);
416 	list_del(&node->list);
417 	list_del(&node->lower);
418 	if (!RB_EMPTY_NODE(&node->rb_node))
419 		rb_erase(&node->rb_node, &tree->rb_root);
420 	free_backref_node(tree, node);
421 }
422 
423 /*
424  * remove a backref node from the backref cache
425  */
426 static void remove_backref_node(struct backref_cache *cache,
427 				struct backref_node *node)
428 {
429 	struct backref_node *upper;
430 	struct backref_edge *edge;
431 
432 	if (!node)
433 		return;
434 
435 	BUG_ON(!node->lowest && !node->detached);
436 	while (!list_empty(&node->upper)) {
437 		edge = list_entry(node->upper.next, struct backref_edge,
438 				  list[LOWER]);
439 		upper = edge->node[UPPER];
440 		list_del(&edge->list[LOWER]);
441 		list_del(&edge->list[UPPER]);
442 		free_backref_edge(cache, edge);
443 
444 		if (RB_EMPTY_NODE(&upper->rb_node)) {
445 			BUG_ON(!list_empty(&node->upper));
446 			drop_backref_node(cache, node);
447 			node = upper;
448 			node->lowest = 1;
449 			continue;
450 		}
451 		/*
452 		 * add the node to leaf node list if no other
453 		 * child block cached.
454 		 */
455 		if (list_empty(&upper->lower)) {
456 			list_add_tail(&upper->lower, &cache->leaves);
457 			upper->lowest = 1;
458 		}
459 	}
460 
461 	drop_backref_node(cache, node);
462 }
463 
464 static void update_backref_node(struct backref_cache *cache,
465 				struct backref_node *node, u64 bytenr)
466 {
467 	struct rb_node *rb_node;
468 	rb_erase(&node->rb_node, &cache->rb_root);
469 	node->bytenr = bytenr;
470 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
471 	if (rb_node)
472 		backref_tree_panic(rb_node, -EEXIST, bytenr);
473 }
474 
475 /*
476  * update backref cache after a transaction commit
477  */
478 static int update_backref_cache(struct btrfs_trans_handle *trans,
479 				struct backref_cache *cache)
480 {
481 	struct backref_node *node;
482 	int level = 0;
483 
484 	if (cache->last_trans == 0) {
485 		cache->last_trans = trans->transid;
486 		return 0;
487 	}
488 
489 	if (cache->last_trans == trans->transid)
490 		return 0;
491 
492 	/*
493 	 * detached nodes are used to avoid unnecessary backref
494 	 * lookup. transaction commit changes the extent tree.
495 	 * so the detached nodes are no longer useful.
496 	 */
497 	while (!list_empty(&cache->detached)) {
498 		node = list_entry(cache->detached.next,
499 				  struct backref_node, list);
500 		remove_backref_node(cache, node);
501 	}
502 
503 	while (!list_empty(&cache->changed)) {
504 		node = list_entry(cache->changed.next,
505 				  struct backref_node, list);
506 		list_del_init(&node->list);
507 		BUG_ON(node->pending);
508 		update_backref_node(cache, node, node->new_bytenr);
509 	}
510 
511 	/*
512 	 * some nodes can be left in the pending list if there were
513 	 * errors during processing the pending nodes.
514 	 */
515 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
516 		list_for_each_entry(node, &cache->pending[level], list) {
517 			BUG_ON(!node->pending);
518 			if (node->bytenr == node->new_bytenr)
519 				continue;
520 			update_backref_node(cache, node, node->new_bytenr);
521 		}
522 	}
523 
524 	cache->last_trans = 0;
525 	return 1;
526 }
527 
528 
529 static int should_ignore_root(struct btrfs_root *root)
530 {
531 	struct btrfs_root *reloc_root;
532 
533 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
534 		return 0;
535 
536 	reloc_root = root->reloc_root;
537 	if (!reloc_root)
538 		return 0;
539 
540 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
541 	    root->fs_info->running_transaction->transid - 1)
542 		return 0;
543 	/*
544 	 * if there is reloc tree and it was created in previous
545 	 * transaction backref lookup can find the reloc tree,
546 	 * so backref node for the fs tree root is useless for
547 	 * relocation.
548 	 */
549 	return 1;
550 }
551 /*
552  * find reloc tree by address of tree root
553  */
554 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
555 					  u64 bytenr)
556 {
557 	struct rb_node *rb_node;
558 	struct mapping_node *node;
559 	struct btrfs_root *root = NULL;
560 
561 	spin_lock(&rc->reloc_root_tree.lock);
562 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
563 	if (rb_node) {
564 		node = rb_entry(rb_node, struct mapping_node, rb_node);
565 		root = (struct btrfs_root *)node->data;
566 	}
567 	spin_unlock(&rc->reloc_root_tree.lock);
568 	return root;
569 }
570 
571 static int is_cowonly_root(u64 root_objectid)
572 {
573 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
574 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
575 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
576 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
577 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
578 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
579 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
580 	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
581 	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
582 		return 1;
583 	return 0;
584 }
585 
586 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
587 					u64 root_objectid)
588 {
589 	struct btrfs_key key;
590 
591 	key.objectid = root_objectid;
592 	key.type = BTRFS_ROOT_ITEM_KEY;
593 	if (is_cowonly_root(root_objectid))
594 		key.offset = 0;
595 	else
596 		key.offset = (u64)-1;
597 
598 	return btrfs_get_fs_root(fs_info, &key, false);
599 }
600 
601 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
602 static noinline_for_stack
603 struct btrfs_root *find_tree_root(struct reloc_control *rc,
604 				  struct extent_buffer *leaf,
605 				  struct btrfs_extent_ref_v0 *ref0)
606 {
607 	struct btrfs_root *root;
608 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
609 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
610 
611 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
612 
613 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
614 	BUG_ON(IS_ERR(root));
615 
616 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
617 	    generation != btrfs_root_generation(&root->root_item))
618 		return NULL;
619 
620 	return root;
621 }
622 #endif
623 
624 static noinline_for_stack
625 int find_inline_backref(struct extent_buffer *leaf, int slot,
626 			unsigned long *ptr, unsigned long *end)
627 {
628 	struct btrfs_key key;
629 	struct btrfs_extent_item *ei;
630 	struct btrfs_tree_block_info *bi;
631 	u32 item_size;
632 
633 	btrfs_item_key_to_cpu(leaf, &key, slot);
634 
635 	item_size = btrfs_item_size_nr(leaf, slot);
636 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
637 	if (item_size < sizeof(*ei)) {
638 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
639 		return 1;
640 	}
641 #endif
642 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
643 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
644 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
645 
646 	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
647 	    item_size <= sizeof(*ei) + sizeof(*bi)) {
648 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
649 		return 1;
650 	}
651 	if (key.type == BTRFS_METADATA_ITEM_KEY &&
652 	    item_size <= sizeof(*ei)) {
653 		WARN_ON(item_size < sizeof(*ei));
654 		return 1;
655 	}
656 
657 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
658 		bi = (struct btrfs_tree_block_info *)(ei + 1);
659 		*ptr = (unsigned long)(bi + 1);
660 	} else {
661 		*ptr = (unsigned long)(ei + 1);
662 	}
663 	*end = (unsigned long)ei + item_size;
664 	return 0;
665 }
666 
667 /*
668  * build backref tree for a given tree block. root of the backref tree
669  * corresponds the tree block, leaves of the backref tree correspond
670  * roots of b-trees that reference the tree block.
671  *
672  * the basic idea of this function is check backrefs of a given block
673  * to find upper level blocks that reference the block, and then check
674  * backrefs of these upper level blocks recursively. the recursion stop
675  * when tree root is reached or backrefs for the block is cached.
676  *
677  * NOTE: if we find backrefs for a block are cached, we know backrefs
678  * for all upper level blocks that directly/indirectly reference the
679  * block are also cached.
680  */
681 static noinline_for_stack
682 struct backref_node *build_backref_tree(struct reloc_control *rc,
683 					struct btrfs_key *node_key,
684 					int level, u64 bytenr)
685 {
686 	struct backref_cache *cache = &rc->backref_cache;
687 	struct btrfs_path *path1;
688 	struct btrfs_path *path2;
689 	struct extent_buffer *eb;
690 	struct btrfs_root *root;
691 	struct backref_node *cur;
692 	struct backref_node *upper;
693 	struct backref_node *lower;
694 	struct backref_node *node = NULL;
695 	struct backref_node *exist = NULL;
696 	struct backref_edge *edge;
697 	struct rb_node *rb_node;
698 	struct btrfs_key key;
699 	unsigned long end;
700 	unsigned long ptr;
701 	LIST_HEAD(list);
702 	LIST_HEAD(useless);
703 	int cowonly;
704 	int ret;
705 	int err = 0;
706 	bool need_check = true;
707 
708 	path1 = btrfs_alloc_path();
709 	path2 = btrfs_alloc_path();
710 	if (!path1 || !path2) {
711 		err = -ENOMEM;
712 		goto out;
713 	}
714 	path1->reada = READA_FORWARD;
715 	path2->reada = READA_FORWARD;
716 
717 	node = alloc_backref_node(cache);
718 	if (!node) {
719 		err = -ENOMEM;
720 		goto out;
721 	}
722 
723 	node->bytenr = bytenr;
724 	node->level = level;
725 	node->lowest = 1;
726 	cur = node;
727 again:
728 	end = 0;
729 	ptr = 0;
730 	key.objectid = cur->bytenr;
731 	key.type = BTRFS_METADATA_ITEM_KEY;
732 	key.offset = (u64)-1;
733 
734 	path1->search_commit_root = 1;
735 	path1->skip_locking = 1;
736 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
737 				0, 0);
738 	if (ret < 0) {
739 		err = ret;
740 		goto out;
741 	}
742 	ASSERT(ret);
743 	ASSERT(path1->slots[0]);
744 
745 	path1->slots[0]--;
746 
747 	WARN_ON(cur->checked);
748 	if (!list_empty(&cur->upper)) {
749 		/*
750 		 * the backref was added previously when processing
751 		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
752 		 */
753 		ASSERT(list_is_singular(&cur->upper));
754 		edge = list_entry(cur->upper.next, struct backref_edge,
755 				  list[LOWER]);
756 		ASSERT(list_empty(&edge->list[UPPER]));
757 		exist = edge->node[UPPER];
758 		/*
759 		 * add the upper level block to pending list if we need
760 		 * check its backrefs
761 		 */
762 		if (!exist->checked)
763 			list_add_tail(&edge->list[UPPER], &list);
764 	} else {
765 		exist = NULL;
766 	}
767 
768 	while (1) {
769 		cond_resched();
770 		eb = path1->nodes[0];
771 
772 		if (ptr >= end) {
773 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
774 				ret = btrfs_next_leaf(rc->extent_root, path1);
775 				if (ret < 0) {
776 					err = ret;
777 					goto out;
778 				}
779 				if (ret > 0)
780 					break;
781 				eb = path1->nodes[0];
782 			}
783 
784 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
785 			if (key.objectid != cur->bytenr) {
786 				WARN_ON(exist);
787 				break;
788 			}
789 
790 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
791 			    key.type == BTRFS_METADATA_ITEM_KEY) {
792 				ret = find_inline_backref(eb, path1->slots[0],
793 							  &ptr, &end);
794 				if (ret)
795 					goto next;
796 			}
797 		}
798 
799 		if (ptr < end) {
800 			/* update key for inline back ref */
801 			struct btrfs_extent_inline_ref *iref;
802 			iref = (struct btrfs_extent_inline_ref *)ptr;
803 			key.type = btrfs_extent_inline_ref_type(eb, iref);
804 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
805 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
806 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
807 		}
808 
809 		if (exist &&
810 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
811 		      exist->owner == key.offset) ||
812 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
813 		      exist->bytenr == key.offset))) {
814 			exist = NULL;
815 			goto next;
816 		}
817 
818 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
819 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
820 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
821 			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
822 				struct btrfs_extent_ref_v0 *ref0;
823 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
824 						struct btrfs_extent_ref_v0);
825 				if (key.objectid == key.offset) {
826 					root = find_tree_root(rc, eb, ref0);
827 					if (root && !should_ignore_root(root))
828 						cur->root = root;
829 					else
830 						list_add(&cur->list, &useless);
831 					break;
832 				}
833 				if (is_cowonly_root(btrfs_ref_root_v0(eb,
834 								      ref0)))
835 					cur->cowonly = 1;
836 			}
837 #else
838 		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
839 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
840 #endif
841 			if (key.objectid == key.offset) {
842 				/*
843 				 * only root blocks of reloc trees use
844 				 * backref of this type.
845 				 */
846 				root = find_reloc_root(rc, cur->bytenr);
847 				ASSERT(root);
848 				cur->root = root;
849 				break;
850 			}
851 
852 			edge = alloc_backref_edge(cache);
853 			if (!edge) {
854 				err = -ENOMEM;
855 				goto out;
856 			}
857 			rb_node = tree_search(&cache->rb_root, key.offset);
858 			if (!rb_node) {
859 				upper = alloc_backref_node(cache);
860 				if (!upper) {
861 					free_backref_edge(cache, edge);
862 					err = -ENOMEM;
863 					goto out;
864 				}
865 				upper->bytenr = key.offset;
866 				upper->level = cur->level + 1;
867 				/*
868 				 *  backrefs for the upper level block isn't
869 				 *  cached, add the block to pending list
870 				 */
871 				list_add_tail(&edge->list[UPPER], &list);
872 			} else {
873 				upper = rb_entry(rb_node, struct backref_node,
874 						 rb_node);
875 				ASSERT(upper->checked);
876 				INIT_LIST_HEAD(&edge->list[UPPER]);
877 			}
878 			list_add_tail(&edge->list[LOWER], &cur->upper);
879 			edge->node[LOWER] = cur;
880 			edge->node[UPPER] = upper;
881 
882 			goto next;
883 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
884 			goto next;
885 		}
886 
887 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
888 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
889 		if (IS_ERR(root)) {
890 			err = PTR_ERR(root);
891 			goto out;
892 		}
893 
894 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
895 			cur->cowonly = 1;
896 
897 		if (btrfs_root_level(&root->root_item) == cur->level) {
898 			/* tree root */
899 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
900 			       cur->bytenr);
901 			if (should_ignore_root(root))
902 				list_add(&cur->list, &useless);
903 			else
904 				cur->root = root;
905 			break;
906 		}
907 
908 		level = cur->level + 1;
909 
910 		/*
911 		 * searching the tree to find upper level blocks
912 		 * reference the block.
913 		 */
914 		path2->search_commit_root = 1;
915 		path2->skip_locking = 1;
916 		path2->lowest_level = level;
917 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
918 		path2->lowest_level = 0;
919 		if (ret < 0) {
920 			err = ret;
921 			goto out;
922 		}
923 		if (ret > 0 && path2->slots[level] > 0)
924 			path2->slots[level]--;
925 
926 		eb = path2->nodes[level];
927 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
928 		    cur->bytenr) {
929 			btrfs_err(root->fs_info,
930 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
931 				  cur->bytenr, level - 1, root->objectid,
932 				  node_key->objectid, node_key->type,
933 				  node_key->offset);
934 			err = -ENOENT;
935 			goto out;
936 		}
937 		lower = cur;
938 		need_check = true;
939 		for (; level < BTRFS_MAX_LEVEL; level++) {
940 			if (!path2->nodes[level]) {
941 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
942 				       lower->bytenr);
943 				if (should_ignore_root(root))
944 					list_add(&lower->list, &useless);
945 				else
946 					lower->root = root;
947 				break;
948 			}
949 
950 			edge = alloc_backref_edge(cache);
951 			if (!edge) {
952 				err = -ENOMEM;
953 				goto out;
954 			}
955 
956 			eb = path2->nodes[level];
957 			rb_node = tree_search(&cache->rb_root, eb->start);
958 			if (!rb_node) {
959 				upper = alloc_backref_node(cache);
960 				if (!upper) {
961 					free_backref_edge(cache, edge);
962 					err = -ENOMEM;
963 					goto out;
964 				}
965 				upper->bytenr = eb->start;
966 				upper->owner = btrfs_header_owner(eb);
967 				upper->level = lower->level + 1;
968 				if (!test_bit(BTRFS_ROOT_REF_COWS,
969 					      &root->state))
970 					upper->cowonly = 1;
971 
972 				/*
973 				 * if we know the block isn't shared
974 				 * we can void checking its backrefs.
975 				 */
976 				if (btrfs_block_can_be_shared(root, eb))
977 					upper->checked = 0;
978 				else
979 					upper->checked = 1;
980 
981 				/*
982 				 * add the block to pending list if we
983 				 * need check its backrefs, we only do this once
984 				 * while walking up a tree as we will catch
985 				 * anything else later on.
986 				 */
987 				if (!upper->checked && need_check) {
988 					need_check = false;
989 					list_add_tail(&edge->list[UPPER],
990 						      &list);
991 				} else {
992 					if (upper->checked)
993 						need_check = true;
994 					INIT_LIST_HEAD(&edge->list[UPPER]);
995 				}
996 			} else {
997 				upper = rb_entry(rb_node, struct backref_node,
998 						 rb_node);
999 				ASSERT(upper->checked);
1000 				INIT_LIST_HEAD(&edge->list[UPPER]);
1001 				if (!upper->owner)
1002 					upper->owner = btrfs_header_owner(eb);
1003 			}
1004 			list_add_tail(&edge->list[LOWER], &lower->upper);
1005 			edge->node[LOWER] = lower;
1006 			edge->node[UPPER] = upper;
1007 
1008 			if (rb_node)
1009 				break;
1010 			lower = upper;
1011 			upper = NULL;
1012 		}
1013 		btrfs_release_path(path2);
1014 next:
1015 		if (ptr < end) {
1016 			ptr += btrfs_extent_inline_ref_size(key.type);
1017 			if (ptr >= end) {
1018 				WARN_ON(ptr > end);
1019 				ptr = 0;
1020 				end = 0;
1021 			}
1022 		}
1023 		if (ptr >= end)
1024 			path1->slots[0]++;
1025 	}
1026 	btrfs_release_path(path1);
1027 
1028 	cur->checked = 1;
1029 	WARN_ON(exist);
1030 
1031 	/* the pending list isn't empty, take the first block to process */
1032 	if (!list_empty(&list)) {
1033 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034 		list_del_init(&edge->list[UPPER]);
1035 		cur = edge->node[UPPER];
1036 		goto again;
1037 	}
1038 
1039 	/*
1040 	 * everything goes well, connect backref nodes and insert backref nodes
1041 	 * into the cache.
1042 	 */
1043 	ASSERT(node->checked);
1044 	cowonly = node->cowonly;
1045 	if (!cowonly) {
1046 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1047 				      &node->rb_node);
1048 		if (rb_node)
1049 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1050 		list_add_tail(&node->lower, &cache->leaves);
1051 	}
1052 
1053 	list_for_each_entry(edge, &node->upper, list[LOWER])
1054 		list_add_tail(&edge->list[UPPER], &list);
1055 
1056 	while (!list_empty(&list)) {
1057 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1058 		list_del_init(&edge->list[UPPER]);
1059 		upper = edge->node[UPPER];
1060 		if (upper->detached) {
1061 			list_del(&edge->list[LOWER]);
1062 			lower = edge->node[LOWER];
1063 			free_backref_edge(cache, edge);
1064 			if (list_empty(&lower->upper))
1065 				list_add(&lower->list, &useless);
1066 			continue;
1067 		}
1068 
1069 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1070 			if (upper->lowest) {
1071 				list_del_init(&upper->lower);
1072 				upper->lowest = 0;
1073 			}
1074 
1075 			list_add_tail(&edge->list[UPPER], &upper->lower);
1076 			continue;
1077 		}
1078 
1079 		if (!upper->checked) {
1080 			/*
1081 			 * Still want to blow up for developers since this is a
1082 			 * logic bug.
1083 			 */
1084 			ASSERT(0);
1085 			err = -EINVAL;
1086 			goto out;
1087 		}
1088 		if (cowonly != upper->cowonly) {
1089 			ASSERT(0);
1090 			err = -EINVAL;
1091 			goto out;
1092 		}
1093 
1094 		if (!cowonly) {
1095 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1096 					      &upper->rb_node);
1097 			if (rb_node)
1098 				backref_tree_panic(rb_node, -EEXIST,
1099 						   upper->bytenr);
1100 		}
1101 
1102 		list_add_tail(&edge->list[UPPER], &upper->lower);
1103 
1104 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1105 			list_add_tail(&edge->list[UPPER], &list);
1106 	}
1107 	/*
1108 	 * process useless backref nodes. backref nodes for tree leaves
1109 	 * are deleted from the cache. backref nodes for upper level
1110 	 * tree blocks are left in the cache to avoid unnecessary backref
1111 	 * lookup.
1112 	 */
1113 	while (!list_empty(&useless)) {
1114 		upper = list_entry(useless.next, struct backref_node, list);
1115 		list_del_init(&upper->list);
1116 		ASSERT(list_empty(&upper->upper));
1117 		if (upper == node)
1118 			node = NULL;
1119 		if (upper->lowest) {
1120 			list_del_init(&upper->lower);
1121 			upper->lowest = 0;
1122 		}
1123 		while (!list_empty(&upper->lower)) {
1124 			edge = list_entry(upper->lower.next,
1125 					  struct backref_edge, list[UPPER]);
1126 			list_del(&edge->list[UPPER]);
1127 			list_del(&edge->list[LOWER]);
1128 			lower = edge->node[LOWER];
1129 			free_backref_edge(cache, edge);
1130 
1131 			if (list_empty(&lower->upper))
1132 				list_add(&lower->list, &useless);
1133 		}
1134 		__mark_block_processed(rc, upper);
1135 		if (upper->level > 0) {
1136 			list_add(&upper->list, &cache->detached);
1137 			upper->detached = 1;
1138 		} else {
1139 			rb_erase(&upper->rb_node, &cache->rb_root);
1140 			free_backref_node(cache, upper);
1141 		}
1142 	}
1143 out:
1144 	btrfs_free_path(path1);
1145 	btrfs_free_path(path2);
1146 	if (err) {
1147 		while (!list_empty(&useless)) {
1148 			lower = list_entry(useless.next,
1149 					   struct backref_node, list);
1150 			list_del_init(&lower->list);
1151 		}
1152 		while (!list_empty(&list)) {
1153 			edge = list_first_entry(&list, struct backref_edge,
1154 						list[UPPER]);
1155 			list_del(&edge->list[UPPER]);
1156 			list_del(&edge->list[LOWER]);
1157 			lower = edge->node[LOWER];
1158 			upper = edge->node[UPPER];
1159 			free_backref_edge(cache, edge);
1160 
1161 			/*
1162 			 * Lower is no longer linked to any upper backref nodes
1163 			 * and isn't in the cache, we can free it ourselves.
1164 			 */
1165 			if (list_empty(&lower->upper) &&
1166 			    RB_EMPTY_NODE(&lower->rb_node))
1167 				list_add(&lower->list, &useless);
1168 
1169 			if (!RB_EMPTY_NODE(&upper->rb_node))
1170 				continue;
1171 
1172 			/* Add this guy's upper edges to the list to process */
1173 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1174 				list_add_tail(&edge->list[UPPER], &list);
1175 			if (list_empty(&upper->upper))
1176 				list_add(&upper->list, &useless);
1177 		}
1178 
1179 		while (!list_empty(&useless)) {
1180 			lower = list_entry(useless.next,
1181 					   struct backref_node, list);
1182 			list_del_init(&lower->list);
1183 			if (lower == node)
1184 				node = NULL;
1185 			free_backref_node(cache, lower);
1186 		}
1187 
1188 		free_backref_node(cache, node);
1189 		return ERR_PTR(err);
1190 	}
1191 	ASSERT(!node || !node->detached);
1192 	return node;
1193 }
1194 
1195 /*
1196  * helper to add backref node for the newly created snapshot.
1197  * the backref node is created by cloning backref node that
1198  * corresponds to root of source tree
1199  */
1200 static int clone_backref_node(struct btrfs_trans_handle *trans,
1201 			      struct reloc_control *rc,
1202 			      struct btrfs_root *src,
1203 			      struct btrfs_root *dest)
1204 {
1205 	struct btrfs_root *reloc_root = src->reloc_root;
1206 	struct backref_cache *cache = &rc->backref_cache;
1207 	struct backref_node *node = NULL;
1208 	struct backref_node *new_node;
1209 	struct backref_edge *edge;
1210 	struct backref_edge *new_edge;
1211 	struct rb_node *rb_node;
1212 
1213 	if (cache->last_trans > 0)
1214 		update_backref_cache(trans, cache);
1215 
1216 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1217 	if (rb_node) {
1218 		node = rb_entry(rb_node, struct backref_node, rb_node);
1219 		if (node->detached)
1220 			node = NULL;
1221 		else
1222 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1223 	}
1224 
1225 	if (!node) {
1226 		rb_node = tree_search(&cache->rb_root,
1227 				      reloc_root->commit_root->start);
1228 		if (rb_node) {
1229 			node = rb_entry(rb_node, struct backref_node,
1230 					rb_node);
1231 			BUG_ON(node->detached);
1232 		}
1233 	}
1234 
1235 	if (!node)
1236 		return 0;
1237 
1238 	new_node = alloc_backref_node(cache);
1239 	if (!new_node)
1240 		return -ENOMEM;
1241 
1242 	new_node->bytenr = dest->node->start;
1243 	new_node->level = node->level;
1244 	new_node->lowest = node->lowest;
1245 	new_node->checked = 1;
1246 	new_node->root = dest;
1247 
1248 	if (!node->lowest) {
1249 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1250 			new_edge = alloc_backref_edge(cache);
1251 			if (!new_edge)
1252 				goto fail;
1253 
1254 			new_edge->node[UPPER] = new_node;
1255 			new_edge->node[LOWER] = edge->node[LOWER];
1256 			list_add_tail(&new_edge->list[UPPER],
1257 				      &new_node->lower);
1258 		}
1259 	} else {
1260 		list_add_tail(&new_node->lower, &cache->leaves);
1261 	}
1262 
1263 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1264 			      &new_node->rb_node);
1265 	if (rb_node)
1266 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1267 
1268 	if (!new_node->lowest) {
1269 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1270 			list_add_tail(&new_edge->list[LOWER],
1271 				      &new_edge->node[LOWER]->upper);
1272 		}
1273 	}
1274 	return 0;
1275 fail:
1276 	while (!list_empty(&new_node->lower)) {
1277 		new_edge = list_entry(new_node->lower.next,
1278 				      struct backref_edge, list[UPPER]);
1279 		list_del(&new_edge->list[UPPER]);
1280 		free_backref_edge(cache, new_edge);
1281 	}
1282 	free_backref_node(cache, new_node);
1283 	return -ENOMEM;
1284 }
1285 
1286 /*
1287  * helper to add 'address of tree root -> reloc tree' mapping
1288  */
1289 static int __must_check __add_reloc_root(struct btrfs_root *root)
1290 {
1291 	struct rb_node *rb_node;
1292 	struct mapping_node *node;
1293 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1294 
1295 	node = kmalloc(sizeof(*node), GFP_NOFS);
1296 	if (!node)
1297 		return -ENOMEM;
1298 
1299 	node->bytenr = root->node->start;
1300 	node->data = root;
1301 
1302 	spin_lock(&rc->reloc_root_tree.lock);
1303 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1304 			      node->bytenr, &node->rb_node);
1305 	spin_unlock(&rc->reloc_root_tree.lock);
1306 	if (rb_node) {
1307 		btrfs_panic(root->fs_info, -EEXIST,
1308 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1309 			    node->bytenr);
1310 		kfree(node);
1311 		return -EEXIST;
1312 	}
1313 
1314 	list_add_tail(&root->root_list, &rc->reloc_roots);
1315 	return 0;
1316 }
1317 
1318 /*
1319  * helper to delete the 'address of tree root -> reloc tree'
1320  * mapping
1321  */
1322 static void __del_reloc_root(struct btrfs_root *root)
1323 {
1324 	struct rb_node *rb_node;
1325 	struct mapping_node *node = NULL;
1326 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1327 
1328 	spin_lock(&rc->reloc_root_tree.lock);
1329 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1330 			      root->node->start);
1331 	if (rb_node) {
1332 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1333 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1334 	}
1335 	spin_unlock(&rc->reloc_root_tree.lock);
1336 
1337 	if (!node)
1338 		return;
1339 	BUG_ON((struct btrfs_root *)node->data != root);
1340 
1341 	spin_lock(&root->fs_info->trans_lock);
1342 	list_del_init(&root->root_list);
1343 	spin_unlock(&root->fs_info->trans_lock);
1344 	kfree(node);
1345 }
1346 
1347 /*
1348  * helper to update the 'address of tree root -> reloc tree'
1349  * mapping
1350  */
1351 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1352 {
1353 	struct rb_node *rb_node;
1354 	struct mapping_node *node = NULL;
1355 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1356 
1357 	spin_lock(&rc->reloc_root_tree.lock);
1358 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1359 			      root->node->start);
1360 	if (rb_node) {
1361 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1362 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1363 	}
1364 	spin_unlock(&rc->reloc_root_tree.lock);
1365 
1366 	if (!node)
1367 		return 0;
1368 	BUG_ON((struct btrfs_root *)node->data != root);
1369 
1370 	spin_lock(&rc->reloc_root_tree.lock);
1371 	node->bytenr = new_bytenr;
1372 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1373 			      node->bytenr, &node->rb_node);
1374 	spin_unlock(&rc->reloc_root_tree.lock);
1375 	if (rb_node)
1376 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1377 	return 0;
1378 }
1379 
1380 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1381 					struct btrfs_root *root, u64 objectid)
1382 {
1383 	struct btrfs_root *reloc_root;
1384 	struct extent_buffer *eb;
1385 	struct btrfs_root_item *root_item;
1386 	struct btrfs_key root_key;
1387 	u64 last_snap = 0;
1388 	int ret;
1389 
1390 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1391 	BUG_ON(!root_item);
1392 
1393 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1394 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1395 	root_key.offset = objectid;
1396 
1397 	if (root->root_key.objectid == objectid) {
1398 		/* called by btrfs_init_reloc_root */
1399 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1400 				      BTRFS_TREE_RELOC_OBJECTID);
1401 		BUG_ON(ret);
1402 
1403 		last_snap = btrfs_root_last_snapshot(&root->root_item);
1404 		btrfs_set_root_last_snapshot(&root->root_item,
1405 					     trans->transid - 1);
1406 	} else {
1407 		/*
1408 		 * called by btrfs_reloc_post_snapshot_hook.
1409 		 * the source tree is a reloc tree, all tree blocks
1410 		 * modified after it was created have RELOC flag
1411 		 * set in their headers. so it's OK to not update
1412 		 * the 'last_snapshot'.
1413 		 */
1414 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1415 				      BTRFS_TREE_RELOC_OBJECTID);
1416 		BUG_ON(ret);
1417 	}
1418 
1419 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1420 	btrfs_set_root_bytenr(root_item, eb->start);
1421 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1422 	btrfs_set_root_generation(root_item, trans->transid);
1423 
1424 	if (root->root_key.objectid == objectid) {
1425 		btrfs_set_root_refs(root_item, 0);
1426 		memset(&root_item->drop_progress, 0,
1427 		       sizeof(struct btrfs_disk_key));
1428 		root_item->drop_level = 0;
1429 		/*
1430 		 * abuse rtransid, it is safe because it is impossible to
1431 		 * receive data into a relocation tree.
1432 		 */
1433 		btrfs_set_root_rtransid(root_item, last_snap);
1434 		btrfs_set_root_otransid(root_item, trans->transid);
1435 	}
1436 
1437 	btrfs_tree_unlock(eb);
1438 	free_extent_buffer(eb);
1439 
1440 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1441 				&root_key, root_item);
1442 	BUG_ON(ret);
1443 	kfree(root_item);
1444 
1445 	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1446 	BUG_ON(IS_ERR(reloc_root));
1447 	reloc_root->last_trans = trans->transid;
1448 	return reloc_root;
1449 }
1450 
1451 /*
1452  * create reloc tree for a given fs tree. reloc tree is just a
1453  * snapshot of the fs tree with special root objectid.
1454  */
1455 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1456 			  struct btrfs_root *root)
1457 {
1458 	struct btrfs_root *reloc_root;
1459 	struct reloc_control *rc = root->fs_info->reloc_ctl;
1460 	struct btrfs_block_rsv *rsv;
1461 	int clear_rsv = 0;
1462 	int ret;
1463 
1464 	if (root->reloc_root) {
1465 		reloc_root = root->reloc_root;
1466 		reloc_root->last_trans = trans->transid;
1467 		return 0;
1468 	}
1469 
1470 	if (!rc || !rc->create_reloc_tree ||
1471 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1472 		return 0;
1473 
1474 	if (!trans->reloc_reserved) {
1475 		rsv = trans->block_rsv;
1476 		trans->block_rsv = rc->block_rsv;
1477 		clear_rsv = 1;
1478 	}
1479 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1480 	if (clear_rsv)
1481 		trans->block_rsv = rsv;
1482 
1483 	ret = __add_reloc_root(reloc_root);
1484 	BUG_ON(ret < 0);
1485 	root->reloc_root = reloc_root;
1486 	return 0;
1487 }
1488 
1489 /*
1490  * update root item of reloc tree
1491  */
1492 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1493 			    struct btrfs_root *root)
1494 {
1495 	struct btrfs_root *reloc_root;
1496 	struct btrfs_root_item *root_item;
1497 	int ret;
1498 
1499 	if (!root->reloc_root)
1500 		goto out;
1501 
1502 	reloc_root = root->reloc_root;
1503 	root_item = &reloc_root->root_item;
1504 
1505 	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1506 	    btrfs_root_refs(root_item) == 0) {
1507 		root->reloc_root = NULL;
1508 		__del_reloc_root(reloc_root);
1509 	}
1510 
1511 	if (reloc_root->commit_root != reloc_root->node) {
1512 		btrfs_set_root_node(root_item, reloc_root->node);
1513 		free_extent_buffer(reloc_root->commit_root);
1514 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1515 	}
1516 
1517 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
1518 				&reloc_root->root_key, root_item);
1519 	BUG_ON(ret);
1520 
1521 out:
1522 	return 0;
1523 }
1524 
1525 /*
1526  * helper to find first cached inode with inode number >= objectid
1527  * in a subvolume
1528  */
1529 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1530 {
1531 	struct rb_node *node;
1532 	struct rb_node *prev;
1533 	struct btrfs_inode *entry;
1534 	struct inode *inode;
1535 
1536 	spin_lock(&root->inode_lock);
1537 again:
1538 	node = root->inode_tree.rb_node;
1539 	prev = NULL;
1540 	while (node) {
1541 		prev = node;
1542 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1543 
1544 		if (objectid < btrfs_ino(&entry->vfs_inode))
1545 			node = node->rb_left;
1546 		else if (objectid > btrfs_ino(&entry->vfs_inode))
1547 			node = node->rb_right;
1548 		else
1549 			break;
1550 	}
1551 	if (!node) {
1552 		while (prev) {
1553 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1554 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1555 				node = prev;
1556 				break;
1557 			}
1558 			prev = rb_next(prev);
1559 		}
1560 	}
1561 	while (node) {
1562 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1563 		inode = igrab(&entry->vfs_inode);
1564 		if (inode) {
1565 			spin_unlock(&root->inode_lock);
1566 			return inode;
1567 		}
1568 
1569 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
1570 		if (cond_resched_lock(&root->inode_lock))
1571 			goto again;
1572 
1573 		node = rb_next(node);
1574 	}
1575 	spin_unlock(&root->inode_lock);
1576 	return NULL;
1577 }
1578 
1579 static int in_block_group(u64 bytenr,
1580 			  struct btrfs_block_group_cache *block_group)
1581 {
1582 	if (bytenr >= block_group->key.objectid &&
1583 	    bytenr < block_group->key.objectid + block_group->key.offset)
1584 		return 1;
1585 	return 0;
1586 }
1587 
1588 /*
1589  * get new location of data
1590  */
1591 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1592 			    u64 bytenr, u64 num_bytes)
1593 {
1594 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1595 	struct btrfs_path *path;
1596 	struct btrfs_file_extent_item *fi;
1597 	struct extent_buffer *leaf;
1598 	int ret;
1599 
1600 	path = btrfs_alloc_path();
1601 	if (!path)
1602 		return -ENOMEM;
1603 
1604 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1605 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1606 				       bytenr, 0);
1607 	if (ret < 0)
1608 		goto out;
1609 	if (ret > 0) {
1610 		ret = -ENOENT;
1611 		goto out;
1612 	}
1613 
1614 	leaf = path->nodes[0];
1615 	fi = btrfs_item_ptr(leaf, path->slots[0],
1616 			    struct btrfs_file_extent_item);
1617 
1618 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1619 	       btrfs_file_extent_compression(leaf, fi) ||
1620 	       btrfs_file_extent_encryption(leaf, fi) ||
1621 	       btrfs_file_extent_other_encoding(leaf, fi));
1622 
1623 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1624 		ret = -EINVAL;
1625 		goto out;
1626 	}
1627 
1628 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1629 	ret = 0;
1630 out:
1631 	btrfs_free_path(path);
1632 	return ret;
1633 }
1634 
1635 /*
1636  * update file extent items in the tree leaf to point to
1637  * the new locations.
1638  */
1639 static noinline_for_stack
1640 int replace_file_extents(struct btrfs_trans_handle *trans,
1641 			 struct reloc_control *rc,
1642 			 struct btrfs_root *root,
1643 			 struct extent_buffer *leaf)
1644 {
1645 	struct btrfs_key key;
1646 	struct btrfs_file_extent_item *fi;
1647 	struct inode *inode = NULL;
1648 	u64 parent;
1649 	u64 bytenr;
1650 	u64 new_bytenr = 0;
1651 	u64 num_bytes;
1652 	u64 end;
1653 	u32 nritems;
1654 	u32 i;
1655 	int ret = 0;
1656 	int first = 1;
1657 	int dirty = 0;
1658 
1659 	if (rc->stage != UPDATE_DATA_PTRS)
1660 		return 0;
1661 
1662 	/* reloc trees always use full backref */
1663 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1664 		parent = leaf->start;
1665 	else
1666 		parent = 0;
1667 
1668 	nritems = btrfs_header_nritems(leaf);
1669 	for (i = 0; i < nritems; i++) {
1670 		cond_resched();
1671 		btrfs_item_key_to_cpu(leaf, &key, i);
1672 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1673 			continue;
1674 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1675 		if (btrfs_file_extent_type(leaf, fi) ==
1676 		    BTRFS_FILE_EXTENT_INLINE)
1677 			continue;
1678 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1679 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1680 		if (bytenr == 0)
1681 			continue;
1682 		if (!in_block_group(bytenr, rc->block_group))
1683 			continue;
1684 
1685 		/*
1686 		 * if we are modifying block in fs tree, wait for readpage
1687 		 * to complete and drop the extent cache
1688 		 */
1689 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1690 			if (first) {
1691 				inode = find_next_inode(root, key.objectid);
1692 				first = 0;
1693 			} else if (inode && btrfs_ino(inode) < key.objectid) {
1694 				btrfs_add_delayed_iput(inode);
1695 				inode = find_next_inode(root, key.objectid);
1696 			}
1697 			if (inode && btrfs_ino(inode) == key.objectid) {
1698 				end = key.offset +
1699 				      btrfs_file_extent_num_bytes(leaf, fi);
1700 				WARN_ON(!IS_ALIGNED(key.offset,
1701 						    root->sectorsize));
1702 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1703 				end--;
1704 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1705 						      key.offset, end);
1706 				if (!ret)
1707 					continue;
1708 
1709 				btrfs_drop_extent_cache(inode, key.offset, end,
1710 							1);
1711 				unlock_extent(&BTRFS_I(inode)->io_tree,
1712 					      key.offset, end);
1713 			}
1714 		}
1715 
1716 		ret = get_new_location(rc->data_inode, &new_bytenr,
1717 				       bytenr, num_bytes);
1718 		if (ret) {
1719 			/*
1720 			 * Don't have to abort since we've not changed anything
1721 			 * in the file extent yet.
1722 			 */
1723 			break;
1724 		}
1725 
1726 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1727 		dirty = 1;
1728 
1729 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1730 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1731 					   num_bytes, parent,
1732 					   btrfs_header_owner(leaf),
1733 					   key.objectid, key.offset);
1734 		if (ret) {
1735 			btrfs_abort_transaction(trans, ret);
1736 			break;
1737 		}
1738 
1739 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1740 					parent, btrfs_header_owner(leaf),
1741 					key.objectid, key.offset);
1742 		if (ret) {
1743 			btrfs_abort_transaction(trans, ret);
1744 			break;
1745 		}
1746 	}
1747 	if (dirty)
1748 		btrfs_mark_buffer_dirty(leaf);
1749 	if (inode)
1750 		btrfs_add_delayed_iput(inode);
1751 	return ret;
1752 }
1753 
1754 static noinline_for_stack
1755 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1756 		     struct btrfs_path *path, int level)
1757 {
1758 	struct btrfs_disk_key key1;
1759 	struct btrfs_disk_key key2;
1760 	btrfs_node_key(eb, &key1, slot);
1761 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1762 	return memcmp(&key1, &key2, sizeof(key1));
1763 }
1764 
1765 /*
1766  * try to replace tree blocks in fs tree with the new blocks
1767  * in reloc tree. tree blocks haven't been modified since the
1768  * reloc tree was create can be replaced.
1769  *
1770  * if a block was replaced, level of the block + 1 is returned.
1771  * if no block got replaced, 0 is returned. if there are other
1772  * errors, a negative error number is returned.
1773  */
1774 static noinline_for_stack
1775 int replace_path(struct btrfs_trans_handle *trans,
1776 		 struct btrfs_root *dest, struct btrfs_root *src,
1777 		 struct btrfs_path *path, struct btrfs_key *next_key,
1778 		 int lowest_level, int max_level)
1779 {
1780 	struct extent_buffer *eb;
1781 	struct extent_buffer *parent;
1782 	struct btrfs_key key;
1783 	u64 old_bytenr;
1784 	u64 new_bytenr;
1785 	u64 old_ptr_gen;
1786 	u64 new_ptr_gen;
1787 	u64 last_snapshot;
1788 	u32 blocksize;
1789 	int cow = 0;
1790 	int level;
1791 	int ret;
1792 	int slot;
1793 
1794 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1795 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1796 
1797 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1798 again:
1799 	slot = path->slots[lowest_level];
1800 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1801 
1802 	eb = btrfs_lock_root_node(dest);
1803 	btrfs_set_lock_blocking(eb);
1804 	level = btrfs_header_level(eb);
1805 
1806 	if (level < lowest_level) {
1807 		btrfs_tree_unlock(eb);
1808 		free_extent_buffer(eb);
1809 		return 0;
1810 	}
1811 
1812 	if (cow) {
1813 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1814 		BUG_ON(ret);
1815 	}
1816 	btrfs_set_lock_blocking(eb);
1817 
1818 	if (next_key) {
1819 		next_key->objectid = (u64)-1;
1820 		next_key->type = (u8)-1;
1821 		next_key->offset = (u64)-1;
1822 	}
1823 
1824 	parent = eb;
1825 	while (1) {
1826 		level = btrfs_header_level(parent);
1827 		BUG_ON(level < lowest_level);
1828 
1829 		ret = btrfs_bin_search(parent, &key, level, &slot);
1830 		if (ret && slot > 0)
1831 			slot--;
1832 
1833 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1834 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1835 
1836 		old_bytenr = btrfs_node_blockptr(parent, slot);
1837 		blocksize = dest->nodesize;
1838 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1839 
1840 		if (level <= max_level) {
1841 			eb = path->nodes[level];
1842 			new_bytenr = btrfs_node_blockptr(eb,
1843 							path->slots[level]);
1844 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1845 							path->slots[level]);
1846 		} else {
1847 			new_bytenr = 0;
1848 			new_ptr_gen = 0;
1849 		}
1850 
1851 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1852 			ret = level;
1853 			break;
1854 		}
1855 
1856 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1857 		    memcmp_node_keys(parent, slot, path, level)) {
1858 			if (level <= lowest_level) {
1859 				ret = 0;
1860 				break;
1861 			}
1862 
1863 			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1864 			if (IS_ERR(eb)) {
1865 				ret = PTR_ERR(eb);
1866 				break;
1867 			} else if (!extent_buffer_uptodate(eb)) {
1868 				ret = -EIO;
1869 				free_extent_buffer(eb);
1870 				break;
1871 			}
1872 			btrfs_tree_lock(eb);
1873 			if (cow) {
1874 				ret = btrfs_cow_block(trans, dest, eb, parent,
1875 						      slot, &eb);
1876 				BUG_ON(ret);
1877 			}
1878 			btrfs_set_lock_blocking(eb);
1879 
1880 			btrfs_tree_unlock(parent);
1881 			free_extent_buffer(parent);
1882 
1883 			parent = eb;
1884 			continue;
1885 		}
1886 
1887 		if (!cow) {
1888 			btrfs_tree_unlock(parent);
1889 			free_extent_buffer(parent);
1890 			cow = 1;
1891 			goto again;
1892 		}
1893 
1894 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1895 				      path->slots[level]);
1896 		btrfs_release_path(path);
1897 
1898 		path->lowest_level = level;
1899 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1900 		path->lowest_level = 0;
1901 		BUG_ON(ret);
1902 
1903 		/*
1904 		 * swap blocks in fs tree and reloc tree.
1905 		 */
1906 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1907 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1908 		btrfs_mark_buffer_dirty(parent);
1909 
1910 		btrfs_set_node_blockptr(path->nodes[level],
1911 					path->slots[level], old_bytenr);
1912 		btrfs_set_node_ptr_generation(path->nodes[level],
1913 					      path->slots[level], old_ptr_gen);
1914 		btrfs_mark_buffer_dirty(path->nodes[level]);
1915 
1916 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1917 					path->nodes[level]->start,
1918 					src->root_key.objectid, level - 1, 0);
1919 		BUG_ON(ret);
1920 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1921 					0, dest->root_key.objectid, level - 1,
1922 					0);
1923 		BUG_ON(ret);
1924 
1925 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1926 					path->nodes[level]->start,
1927 					src->root_key.objectid, level - 1, 0);
1928 		BUG_ON(ret);
1929 
1930 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1931 					0, dest->root_key.objectid, level - 1,
1932 					0);
1933 		BUG_ON(ret);
1934 
1935 		btrfs_unlock_up_safe(path, 0);
1936 
1937 		ret = level;
1938 		break;
1939 	}
1940 	btrfs_tree_unlock(parent);
1941 	free_extent_buffer(parent);
1942 	return ret;
1943 }
1944 
1945 /*
1946  * helper to find next relocated block in reloc tree
1947  */
1948 static noinline_for_stack
1949 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1950 		       int *level)
1951 {
1952 	struct extent_buffer *eb;
1953 	int i;
1954 	u64 last_snapshot;
1955 	u32 nritems;
1956 
1957 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1958 
1959 	for (i = 0; i < *level; i++) {
1960 		free_extent_buffer(path->nodes[i]);
1961 		path->nodes[i] = NULL;
1962 	}
1963 
1964 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1965 		eb = path->nodes[i];
1966 		nritems = btrfs_header_nritems(eb);
1967 		while (path->slots[i] + 1 < nritems) {
1968 			path->slots[i]++;
1969 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1970 			    last_snapshot)
1971 				continue;
1972 
1973 			*level = i;
1974 			return 0;
1975 		}
1976 		free_extent_buffer(path->nodes[i]);
1977 		path->nodes[i] = NULL;
1978 	}
1979 	return 1;
1980 }
1981 
1982 /*
1983  * walk down reloc tree to find relocated block of lowest level
1984  */
1985 static noinline_for_stack
1986 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1987 			 int *level)
1988 {
1989 	struct extent_buffer *eb = NULL;
1990 	int i;
1991 	u64 bytenr;
1992 	u64 ptr_gen = 0;
1993 	u64 last_snapshot;
1994 	u32 nritems;
1995 
1996 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1997 
1998 	for (i = *level; i > 0; i--) {
1999 		eb = path->nodes[i];
2000 		nritems = btrfs_header_nritems(eb);
2001 		while (path->slots[i] < nritems) {
2002 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2003 			if (ptr_gen > last_snapshot)
2004 				break;
2005 			path->slots[i]++;
2006 		}
2007 		if (path->slots[i] >= nritems) {
2008 			if (i == *level)
2009 				break;
2010 			*level = i + 1;
2011 			return 0;
2012 		}
2013 		if (i == 1) {
2014 			*level = i;
2015 			return 0;
2016 		}
2017 
2018 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2019 		eb = read_tree_block(root, bytenr, ptr_gen);
2020 		if (IS_ERR(eb)) {
2021 			return PTR_ERR(eb);
2022 		} else if (!extent_buffer_uptodate(eb)) {
2023 			free_extent_buffer(eb);
2024 			return -EIO;
2025 		}
2026 		BUG_ON(btrfs_header_level(eb) != i - 1);
2027 		path->nodes[i - 1] = eb;
2028 		path->slots[i - 1] = 0;
2029 	}
2030 	return 1;
2031 }
2032 
2033 /*
2034  * invalidate extent cache for file extents whose key in range of
2035  * [min_key, max_key)
2036  */
2037 static int invalidate_extent_cache(struct btrfs_root *root,
2038 				   struct btrfs_key *min_key,
2039 				   struct btrfs_key *max_key)
2040 {
2041 	struct inode *inode = NULL;
2042 	u64 objectid;
2043 	u64 start, end;
2044 	u64 ino;
2045 
2046 	objectid = min_key->objectid;
2047 	while (1) {
2048 		cond_resched();
2049 		iput(inode);
2050 
2051 		if (objectid > max_key->objectid)
2052 			break;
2053 
2054 		inode = find_next_inode(root, objectid);
2055 		if (!inode)
2056 			break;
2057 		ino = btrfs_ino(inode);
2058 
2059 		if (ino > max_key->objectid) {
2060 			iput(inode);
2061 			break;
2062 		}
2063 
2064 		objectid = ino + 1;
2065 		if (!S_ISREG(inode->i_mode))
2066 			continue;
2067 
2068 		if (unlikely(min_key->objectid == ino)) {
2069 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2070 				continue;
2071 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2072 				start = 0;
2073 			else {
2074 				start = min_key->offset;
2075 				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
2076 			}
2077 		} else {
2078 			start = 0;
2079 		}
2080 
2081 		if (unlikely(max_key->objectid == ino)) {
2082 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2083 				continue;
2084 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2085 				end = (u64)-1;
2086 			} else {
2087 				if (max_key->offset == 0)
2088 					continue;
2089 				end = max_key->offset;
2090 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2091 				end--;
2092 			}
2093 		} else {
2094 			end = (u64)-1;
2095 		}
2096 
2097 		/* the lock_extent waits for readpage to complete */
2098 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2099 		btrfs_drop_extent_cache(inode, start, end, 1);
2100 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2101 	}
2102 	return 0;
2103 }
2104 
2105 static int find_next_key(struct btrfs_path *path, int level,
2106 			 struct btrfs_key *key)
2107 
2108 {
2109 	while (level < BTRFS_MAX_LEVEL) {
2110 		if (!path->nodes[level])
2111 			break;
2112 		if (path->slots[level] + 1 <
2113 		    btrfs_header_nritems(path->nodes[level])) {
2114 			btrfs_node_key_to_cpu(path->nodes[level], key,
2115 					      path->slots[level] + 1);
2116 			return 0;
2117 		}
2118 		level++;
2119 	}
2120 	return 1;
2121 }
2122 
2123 /*
2124  * merge the relocated tree blocks in reloc tree with corresponding
2125  * fs tree.
2126  */
2127 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2128 					       struct btrfs_root *root)
2129 {
2130 	LIST_HEAD(inode_list);
2131 	struct btrfs_key key;
2132 	struct btrfs_key next_key;
2133 	struct btrfs_trans_handle *trans = NULL;
2134 	struct btrfs_root *reloc_root;
2135 	struct btrfs_root_item *root_item;
2136 	struct btrfs_path *path;
2137 	struct extent_buffer *leaf;
2138 	int level;
2139 	int max_level;
2140 	int replaced = 0;
2141 	int ret;
2142 	int err = 0;
2143 	u32 min_reserved;
2144 
2145 	path = btrfs_alloc_path();
2146 	if (!path)
2147 		return -ENOMEM;
2148 	path->reada = READA_FORWARD;
2149 
2150 	reloc_root = root->reloc_root;
2151 	root_item = &reloc_root->root_item;
2152 
2153 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2154 		level = btrfs_root_level(root_item);
2155 		extent_buffer_get(reloc_root->node);
2156 		path->nodes[level] = reloc_root->node;
2157 		path->slots[level] = 0;
2158 	} else {
2159 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2160 
2161 		level = root_item->drop_level;
2162 		BUG_ON(level == 0);
2163 		path->lowest_level = level;
2164 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2165 		path->lowest_level = 0;
2166 		if (ret < 0) {
2167 			btrfs_free_path(path);
2168 			return ret;
2169 		}
2170 
2171 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2172 				      path->slots[level]);
2173 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2174 
2175 		btrfs_unlock_up_safe(path, 0);
2176 	}
2177 
2178 	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2179 	memset(&next_key, 0, sizeof(next_key));
2180 
2181 	while (1) {
2182 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2183 					     BTRFS_RESERVE_FLUSH_ALL);
2184 		if (ret) {
2185 			err = ret;
2186 			goto out;
2187 		}
2188 		trans = btrfs_start_transaction(root, 0);
2189 		if (IS_ERR(trans)) {
2190 			err = PTR_ERR(trans);
2191 			trans = NULL;
2192 			goto out;
2193 		}
2194 		trans->block_rsv = rc->block_rsv;
2195 
2196 		replaced = 0;
2197 		max_level = level;
2198 
2199 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2200 		if (ret < 0) {
2201 			err = ret;
2202 			goto out;
2203 		}
2204 		if (ret > 0)
2205 			break;
2206 
2207 		if (!find_next_key(path, level, &key) &&
2208 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2209 			ret = 0;
2210 		} else {
2211 			ret = replace_path(trans, root, reloc_root, path,
2212 					   &next_key, level, max_level);
2213 		}
2214 		if (ret < 0) {
2215 			err = ret;
2216 			goto out;
2217 		}
2218 
2219 		if (ret > 0) {
2220 			level = ret;
2221 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2222 					      path->slots[level]);
2223 			replaced = 1;
2224 		}
2225 
2226 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2227 		if (ret > 0)
2228 			break;
2229 
2230 		BUG_ON(level == 0);
2231 		/*
2232 		 * save the merging progress in the drop_progress.
2233 		 * this is OK since root refs == 1 in this case.
2234 		 */
2235 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2236 			       path->slots[level]);
2237 		root_item->drop_level = level;
2238 
2239 		btrfs_end_transaction_throttle(trans, root);
2240 		trans = NULL;
2241 
2242 		btrfs_btree_balance_dirty(root);
2243 
2244 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2245 			invalidate_extent_cache(root, &key, &next_key);
2246 	}
2247 
2248 	/*
2249 	 * handle the case only one block in the fs tree need to be
2250 	 * relocated and the block is tree root.
2251 	 */
2252 	leaf = btrfs_lock_root_node(root);
2253 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2254 	btrfs_tree_unlock(leaf);
2255 	free_extent_buffer(leaf);
2256 	if (ret < 0)
2257 		err = ret;
2258 out:
2259 	btrfs_free_path(path);
2260 
2261 	if (err == 0) {
2262 		memset(&root_item->drop_progress, 0,
2263 		       sizeof(root_item->drop_progress));
2264 		root_item->drop_level = 0;
2265 		btrfs_set_root_refs(root_item, 0);
2266 		btrfs_update_reloc_root(trans, root);
2267 	}
2268 
2269 	if (trans)
2270 		btrfs_end_transaction_throttle(trans, root);
2271 
2272 	btrfs_btree_balance_dirty(root);
2273 
2274 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2275 		invalidate_extent_cache(root, &key, &next_key);
2276 
2277 	return err;
2278 }
2279 
2280 static noinline_for_stack
2281 int prepare_to_merge(struct reloc_control *rc, int err)
2282 {
2283 	struct btrfs_root *root = rc->extent_root;
2284 	struct btrfs_root *reloc_root;
2285 	struct btrfs_trans_handle *trans;
2286 	LIST_HEAD(reloc_roots);
2287 	u64 num_bytes = 0;
2288 	int ret;
2289 
2290 	mutex_lock(&root->fs_info->reloc_mutex);
2291 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2292 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2293 	mutex_unlock(&root->fs_info->reloc_mutex);
2294 
2295 again:
2296 	if (!err) {
2297 		num_bytes = rc->merging_rsv_size;
2298 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2299 					  BTRFS_RESERVE_FLUSH_ALL);
2300 		if (ret)
2301 			err = ret;
2302 	}
2303 
2304 	trans = btrfs_join_transaction(rc->extent_root);
2305 	if (IS_ERR(trans)) {
2306 		if (!err)
2307 			btrfs_block_rsv_release(rc->extent_root,
2308 						rc->block_rsv, num_bytes);
2309 		return PTR_ERR(trans);
2310 	}
2311 
2312 	if (!err) {
2313 		if (num_bytes != rc->merging_rsv_size) {
2314 			btrfs_end_transaction(trans, rc->extent_root);
2315 			btrfs_block_rsv_release(rc->extent_root,
2316 						rc->block_rsv, num_bytes);
2317 			goto again;
2318 		}
2319 	}
2320 
2321 	rc->merge_reloc_tree = 1;
2322 
2323 	while (!list_empty(&rc->reloc_roots)) {
2324 		reloc_root = list_entry(rc->reloc_roots.next,
2325 					struct btrfs_root, root_list);
2326 		list_del_init(&reloc_root->root_list);
2327 
2328 		root = read_fs_root(reloc_root->fs_info,
2329 				    reloc_root->root_key.offset);
2330 		BUG_ON(IS_ERR(root));
2331 		BUG_ON(root->reloc_root != reloc_root);
2332 
2333 		/*
2334 		 * set reference count to 1, so btrfs_recover_relocation
2335 		 * knows it should resumes merging
2336 		 */
2337 		if (!err)
2338 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2339 		btrfs_update_reloc_root(trans, root);
2340 
2341 		list_add(&reloc_root->root_list, &reloc_roots);
2342 	}
2343 
2344 	list_splice(&reloc_roots, &rc->reloc_roots);
2345 
2346 	if (!err)
2347 		btrfs_commit_transaction(trans, rc->extent_root);
2348 	else
2349 		btrfs_end_transaction(trans, rc->extent_root);
2350 	return err;
2351 }
2352 
2353 static noinline_for_stack
2354 void free_reloc_roots(struct list_head *list)
2355 {
2356 	struct btrfs_root *reloc_root;
2357 
2358 	while (!list_empty(list)) {
2359 		reloc_root = list_entry(list->next, struct btrfs_root,
2360 					root_list);
2361 		free_extent_buffer(reloc_root->node);
2362 		free_extent_buffer(reloc_root->commit_root);
2363 		reloc_root->node = NULL;
2364 		reloc_root->commit_root = NULL;
2365 		__del_reloc_root(reloc_root);
2366 	}
2367 }
2368 
2369 static noinline_for_stack
2370 void merge_reloc_roots(struct reloc_control *rc)
2371 {
2372 	struct btrfs_root *root;
2373 	struct btrfs_root *reloc_root;
2374 	u64 last_snap;
2375 	u64 otransid;
2376 	u64 objectid;
2377 	LIST_HEAD(reloc_roots);
2378 	int found = 0;
2379 	int ret = 0;
2380 again:
2381 	root = rc->extent_root;
2382 
2383 	/*
2384 	 * this serializes us with btrfs_record_root_in_transaction,
2385 	 * we have to make sure nobody is in the middle of
2386 	 * adding their roots to the list while we are
2387 	 * doing this splice
2388 	 */
2389 	mutex_lock(&root->fs_info->reloc_mutex);
2390 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2391 	mutex_unlock(&root->fs_info->reloc_mutex);
2392 
2393 	while (!list_empty(&reloc_roots)) {
2394 		found = 1;
2395 		reloc_root = list_entry(reloc_roots.next,
2396 					struct btrfs_root, root_list);
2397 
2398 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2399 			root = read_fs_root(reloc_root->fs_info,
2400 					    reloc_root->root_key.offset);
2401 			BUG_ON(IS_ERR(root));
2402 			BUG_ON(root->reloc_root != reloc_root);
2403 
2404 			ret = merge_reloc_root(rc, root);
2405 			if (ret) {
2406 				if (list_empty(&reloc_root->root_list))
2407 					list_add_tail(&reloc_root->root_list,
2408 						      &reloc_roots);
2409 				goto out;
2410 			}
2411 		} else {
2412 			list_del_init(&reloc_root->root_list);
2413 		}
2414 
2415 		/*
2416 		 * we keep the old last snapshot transid in rtranid when we
2417 		 * created the relocation tree.
2418 		 */
2419 		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2420 		otransid = btrfs_root_otransid(&reloc_root->root_item);
2421 		objectid = reloc_root->root_key.offset;
2422 
2423 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2424 		if (ret < 0) {
2425 			if (list_empty(&reloc_root->root_list))
2426 				list_add_tail(&reloc_root->root_list,
2427 					      &reloc_roots);
2428 			goto out;
2429 		}
2430 	}
2431 
2432 	if (found) {
2433 		found = 0;
2434 		goto again;
2435 	}
2436 out:
2437 	if (ret) {
2438 		btrfs_handle_fs_error(root->fs_info, ret, NULL);
2439 		if (!list_empty(&reloc_roots))
2440 			free_reloc_roots(&reloc_roots);
2441 
2442 		/* new reloc root may be added */
2443 		mutex_lock(&root->fs_info->reloc_mutex);
2444 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2445 		mutex_unlock(&root->fs_info->reloc_mutex);
2446 		if (!list_empty(&reloc_roots))
2447 			free_reloc_roots(&reloc_roots);
2448 	}
2449 
2450 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2451 }
2452 
2453 static void free_block_list(struct rb_root *blocks)
2454 {
2455 	struct tree_block *block;
2456 	struct rb_node *rb_node;
2457 	while ((rb_node = rb_first(blocks))) {
2458 		block = rb_entry(rb_node, struct tree_block, rb_node);
2459 		rb_erase(rb_node, blocks);
2460 		kfree(block);
2461 	}
2462 }
2463 
2464 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2465 				      struct btrfs_root *reloc_root)
2466 {
2467 	struct btrfs_root *root;
2468 
2469 	if (reloc_root->last_trans == trans->transid)
2470 		return 0;
2471 
2472 	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2473 	BUG_ON(IS_ERR(root));
2474 	BUG_ON(root->reloc_root != reloc_root);
2475 
2476 	return btrfs_record_root_in_trans(trans, root);
2477 }
2478 
2479 static noinline_for_stack
2480 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2481 				     struct reloc_control *rc,
2482 				     struct backref_node *node,
2483 				     struct backref_edge *edges[])
2484 {
2485 	struct backref_node *next;
2486 	struct btrfs_root *root;
2487 	int index = 0;
2488 
2489 	next = node;
2490 	while (1) {
2491 		cond_resched();
2492 		next = walk_up_backref(next, edges, &index);
2493 		root = next->root;
2494 		BUG_ON(!root);
2495 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2496 
2497 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2498 			record_reloc_root_in_trans(trans, root);
2499 			break;
2500 		}
2501 
2502 		btrfs_record_root_in_trans(trans, root);
2503 		root = root->reloc_root;
2504 
2505 		if (next->new_bytenr != root->node->start) {
2506 			BUG_ON(next->new_bytenr);
2507 			BUG_ON(!list_empty(&next->list));
2508 			next->new_bytenr = root->node->start;
2509 			next->root = root;
2510 			list_add_tail(&next->list,
2511 				      &rc->backref_cache.changed);
2512 			__mark_block_processed(rc, next);
2513 			break;
2514 		}
2515 
2516 		WARN_ON(1);
2517 		root = NULL;
2518 		next = walk_down_backref(edges, &index);
2519 		if (!next || next->level <= node->level)
2520 			break;
2521 	}
2522 	if (!root)
2523 		return NULL;
2524 
2525 	next = node;
2526 	/* setup backref node path for btrfs_reloc_cow_block */
2527 	while (1) {
2528 		rc->backref_cache.path[next->level] = next;
2529 		if (--index < 0)
2530 			break;
2531 		next = edges[index]->node[UPPER];
2532 	}
2533 	return root;
2534 }
2535 
2536 /*
2537  * select a tree root for relocation. return NULL if the block
2538  * is reference counted. we should use do_relocation() in this
2539  * case. return a tree root pointer if the block isn't reference
2540  * counted. return -ENOENT if the block is root of reloc tree.
2541  */
2542 static noinline_for_stack
2543 struct btrfs_root *select_one_root(struct backref_node *node)
2544 {
2545 	struct backref_node *next;
2546 	struct btrfs_root *root;
2547 	struct btrfs_root *fs_root = NULL;
2548 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2549 	int index = 0;
2550 
2551 	next = node;
2552 	while (1) {
2553 		cond_resched();
2554 		next = walk_up_backref(next, edges, &index);
2555 		root = next->root;
2556 		BUG_ON(!root);
2557 
2558 		/* no other choice for non-references counted tree */
2559 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2560 			return root;
2561 
2562 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2563 			fs_root = root;
2564 
2565 		if (next != node)
2566 			return NULL;
2567 
2568 		next = walk_down_backref(edges, &index);
2569 		if (!next || next->level <= node->level)
2570 			break;
2571 	}
2572 
2573 	if (!fs_root)
2574 		return ERR_PTR(-ENOENT);
2575 	return fs_root;
2576 }
2577 
2578 static noinline_for_stack
2579 u64 calcu_metadata_size(struct reloc_control *rc,
2580 			struct backref_node *node, int reserve)
2581 {
2582 	struct backref_node *next = node;
2583 	struct backref_edge *edge;
2584 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2585 	u64 num_bytes = 0;
2586 	int index = 0;
2587 
2588 	BUG_ON(reserve && node->processed);
2589 
2590 	while (next) {
2591 		cond_resched();
2592 		while (1) {
2593 			if (next->processed && (reserve || next != node))
2594 				break;
2595 
2596 			num_bytes += rc->extent_root->nodesize;
2597 
2598 			if (list_empty(&next->upper))
2599 				break;
2600 
2601 			edge = list_entry(next->upper.next,
2602 					  struct backref_edge, list[LOWER]);
2603 			edges[index++] = edge;
2604 			next = edge->node[UPPER];
2605 		}
2606 		next = walk_down_backref(edges, &index);
2607 	}
2608 	return num_bytes;
2609 }
2610 
2611 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2612 				  struct reloc_control *rc,
2613 				  struct backref_node *node)
2614 {
2615 	struct btrfs_root *root = rc->extent_root;
2616 	u64 num_bytes;
2617 	int ret;
2618 	u64 tmp;
2619 
2620 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2621 
2622 	trans->block_rsv = rc->block_rsv;
2623 	rc->reserved_bytes += num_bytes;
2624 
2625 	/*
2626 	 * We are under a transaction here so we can only do limited flushing.
2627 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2628 	 * transaction and try to refill when we can flush all the things.
2629 	 */
2630 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2631 				BTRFS_RESERVE_FLUSH_LIMIT);
2632 	if (ret) {
2633 		tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
2634 		while (tmp <= rc->reserved_bytes)
2635 			tmp <<= 1;
2636 		/*
2637 		 * only one thread can access block_rsv at this point,
2638 		 * so we don't need hold lock to protect block_rsv.
2639 		 * we expand more reservation size here to allow enough
2640 		 * space for relocation and we will return eailer in
2641 		 * enospc case.
2642 		 */
2643 		rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2644 			RELOCATION_RESERVED_NODES;
2645 		return -EAGAIN;
2646 	}
2647 
2648 	return 0;
2649 }
2650 
2651 /*
2652  * relocate a block tree, and then update pointers in upper level
2653  * blocks that reference the block to point to the new location.
2654  *
2655  * if called by link_to_upper, the block has already been relocated.
2656  * in that case this function just updates pointers.
2657  */
2658 static int do_relocation(struct btrfs_trans_handle *trans,
2659 			 struct reloc_control *rc,
2660 			 struct backref_node *node,
2661 			 struct btrfs_key *key,
2662 			 struct btrfs_path *path, int lowest)
2663 {
2664 	struct backref_node *upper;
2665 	struct backref_edge *edge;
2666 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2667 	struct btrfs_root *root;
2668 	struct extent_buffer *eb;
2669 	u32 blocksize;
2670 	u64 bytenr;
2671 	u64 generation;
2672 	int slot;
2673 	int ret;
2674 	int err = 0;
2675 
2676 	BUG_ON(lowest && node->eb);
2677 
2678 	path->lowest_level = node->level + 1;
2679 	rc->backref_cache.path[node->level] = node;
2680 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2681 		cond_resched();
2682 
2683 		upper = edge->node[UPPER];
2684 		root = select_reloc_root(trans, rc, upper, edges);
2685 		BUG_ON(!root);
2686 
2687 		if (upper->eb && !upper->locked) {
2688 			if (!lowest) {
2689 				ret = btrfs_bin_search(upper->eb, key,
2690 						       upper->level, &slot);
2691 				BUG_ON(ret);
2692 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2693 				if (node->eb->start == bytenr)
2694 					goto next;
2695 			}
2696 			drop_node_buffer(upper);
2697 		}
2698 
2699 		if (!upper->eb) {
2700 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2701 			if (ret) {
2702 				if (ret < 0)
2703 					err = ret;
2704 				else
2705 					err = -ENOENT;
2706 
2707 				btrfs_release_path(path);
2708 				break;
2709 			}
2710 
2711 			if (!upper->eb) {
2712 				upper->eb = path->nodes[upper->level];
2713 				path->nodes[upper->level] = NULL;
2714 			} else {
2715 				BUG_ON(upper->eb != path->nodes[upper->level]);
2716 			}
2717 
2718 			upper->locked = 1;
2719 			path->locks[upper->level] = 0;
2720 
2721 			slot = path->slots[upper->level];
2722 			btrfs_release_path(path);
2723 		} else {
2724 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2725 					       &slot);
2726 			BUG_ON(ret);
2727 		}
2728 
2729 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2730 		if (lowest) {
2731 			BUG_ON(bytenr != node->bytenr);
2732 		} else {
2733 			if (node->eb->start == bytenr)
2734 				goto next;
2735 		}
2736 
2737 		blocksize = root->nodesize;
2738 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2739 		eb = read_tree_block(root, bytenr, generation);
2740 		if (IS_ERR(eb)) {
2741 			err = PTR_ERR(eb);
2742 			goto next;
2743 		} else if (!extent_buffer_uptodate(eb)) {
2744 			free_extent_buffer(eb);
2745 			err = -EIO;
2746 			goto next;
2747 		}
2748 		btrfs_tree_lock(eb);
2749 		btrfs_set_lock_blocking(eb);
2750 
2751 		if (!node->eb) {
2752 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2753 					      slot, &eb);
2754 			btrfs_tree_unlock(eb);
2755 			free_extent_buffer(eb);
2756 			if (ret < 0) {
2757 				err = ret;
2758 				goto next;
2759 			}
2760 			BUG_ON(node->eb != eb);
2761 		} else {
2762 			btrfs_set_node_blockptr(upper->eb, slot,
2763 						node->eb->start);
2764 			btrfs_set_node_ptr_generation(upper->eb, slot,
2765 						      trans->transid);
2766 			btrfs_mark_buffer_dirty(upper->eb);
2767 
2768 			ret = btrfs_inc_extent_ref(trans, root,
2769 						node->eb->start, blocksize,
2770 						upper->eb->start,
2771 						btrfs_header_owner(upper->eb),
2772 						node->level, 0);
2773 			BUG_ON(ret);
2774 
2775 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2776 			BUG_ON(ret);
2777 		}
2778 next:
2779 		if (!upper->pending)
2780 			drop_node_buffer(upper);
2781 		else
2782 			unlock_node_buffer(upper);
2783 		if (err)
2784 			break;
2785 	}
2786 
2787 	if (!err && node->pending) {
2788 		drop_node_buffer(node);
2789 		list_move_tail(&node->list, &rc->backref_cache.changed);
2790 		node->pending = 0;
2791 	}
2792 
2793 	path->lowest_level = 0;
2794 	BUG_ON(err == -ENOSPC);
2795 	return err;
2796 }
2797 
2798 static int link_to_upper(struct btrfs_trans_handle *trans,
2799 			 struct reloc_control *rc,
2800 			 struct backref_node *node,
2801 			 struct btrfs_path *path)
2802 {
2803 	struct btrfs_key key;
2804 
2805 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2806 	return do_relocation(trans, rc, node, &key, path, 0);
2807 }
2808 
2809 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2810 				struct reloc_control *rc,
2811 				struct btrfs_path *path, int err)
2812 {
2813 	LIST_HEAD(list);
2814 	struct backref_cache *cache = &rc->backref_cache;
2815 	struct backref_node *node;
2816 	int level;
2817 	int ret;
2818 
2819 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2820 		while (!list_empty(&cache->pending[level])) {
2821 			node = list_entry(cache->pending[level].next,
2822 					  struct backref_node, list);
2823 			list_move_tail(&node->list, &list);
2824 			BUG_ON(!node->pending);
2825 
2826 			if (!err) {
2827 				ret = link_to_upper(trans, rc, node, path);
2828 				if (ret < 0)
2829 					err = ret;
2830 			}
2831 		}
2832 		list_splice_init(&list, &cache->pending[level]);
2833 	}
2834 	return err;
2835 }
2836 
2837 static void mark_block_processed(struct reloc_control *rc,
2838 				 u64 bytenr, u32 blocksize)
2839 {
2840 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2841 			EXTENT_DIRTY);
2842 }
2843 
2844 static void __mark_block_processed(struct reloc_control *rc,
2845 				   struct backref_node *node)
2846 {
2847 	u32 blocksize;
2848 	if (node->level == 0 ||
2849 	    in_block_group(node->bytenr, rc->block_group)) {
2850 		blocksize = rc->extent_root->nodesize;
2851 		mark_block_processed(rc, node->bytenr, blocksize);
2852 	}
2853 	node->processed = 1;
2854 }
2855 
2856 /*
2857  * mark a block and all blocks directly/indirectly reference the block
2858  * as processed.
2859  */
2860 static void update_processed_blocks(struct reloc_control *rc,
2861 				    struct backref_node *node)
2862 {
2863 	struct backref_node *next = node;
2864 	struct backref_edge *edge;
2865 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2866 	int index = 0;
2867 
2868 	while (next) {
2869 		cond_resched();
2870 		while (1) {
2871 			if (next->processed)
2872 				break;
2873 
2874 			__mark_block_processed(rc, next);
2875 
2876 			if (list_empty(&next->upper))
2877 				break;
2878 
2879 			edge = list_entry(next->upper.next,
2880 					  struct backref_edge, list[LOWER]);
2881 			edges[index++] = edge;
2882 			next = edge->node[UPPER];
2883 		}
2884 		next = walk_down_backref(edges, &index);
2885 	}
2886 }
2887 
2888 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2889 {
2890 	u32 blocksize = rc->extent_root->nodesize;
2891 
2892 	if (test_range_bit(&rc->processed_blocks, bytenr,
2893 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2894 		return 1;
2895 	return 0;
2896 }
2897 
2898 static int get_tree_block_key(struct reloc_control *rc,
2899 			      struct tree_block *block)
2900 {
2901 	struct extent_buffer *eb;
2902 
2903 	BUG_ON(block->key_ready);
2904 	eb = read_tree_block(rc->extent_root, block->bytenr,
2905 			     block->key.offset);
2906 	if (IS_ERR(eb)) {
2907 		return PTR_ERR(eb);
2908 	} else if (!extent_buffer_uptodate(eb)) {
2909 		free_extent_buffer(eb);
2910 		return -EIO;
2911 	}
2912 	WARN_ON(btrfs_header_level(eb) != block->level);
2913 	if (block->level == 0)
2914 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2915 	else
2916 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2917 	free_extent_buffer(eb);
2918 	block->key_ready = 1;
2919 	return 0;
2920 }
2921 
2922 /*
2923  * helper function to relocate a tree block
2924  */
2925 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2926 				struct reloc_control *rc,
2927 				struct backref_node *node,
2928 				struct btrfs_key *key,
2929 				struct btrfs_path *path)
2930 {
2931 	struct btrfs_root *root;
2932 	int ret = 0;
2933 
2934 	if (!node)
2935 		return 0;
2936 
2937 	BUG_ON(node->processed);
2938 	root = select_one_root(node);
2939 	if (root == ERR_PTR(-ENOENT)) {
2940 		update_processed_blocks(rc, node);
2941 		goto out;
2942 	}
2943 
2944 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2945 		ret = reserve_metadata_space(trans, rc, node);
2946 		if (ret)
2947 			goto out;
2948 	}
2949 
2950 	if (root) {
2951 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2952 			BUG_ON(node->new_bytenr);
2953 			BUG_ON(!list_empty(&node->list));
2954 			btrfs_record_root_in_trans(trans, root);
2955 			root = root->reloc_root;
2956 			node->new_bytenr = root->node->start;
2957 			node->root = root;
2958 			list_add_tail(&node->list, &rc->backref_cache.changed);
2959 		} else {
2960 			path->lowest_level = node->level;
2961 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2962 			btrfs_release_path(path);
2963 			if (ret > 0)
2964 				ret = 0;
2965 		}
2966 		if (!ret)
2967 			update_processed_blocks(rc, node);
2968 	} else {
2969 		ret = do_relocation(trans, rc, node, key, path, 1);
2970 	}
2971 out:
2972 	if (ret || node->level == 0 || node->cowonly)
2973 		remove_backref_node(&rc->backref_cache, node);
2974 	return ret;
2975 }
2976 
2977 /*
2978  * relocate a list of blocks
2979  */
2980 static noinline_for_stack
2981 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2982 			 struct reloc_control *rc, struct rb_root *blocks)
2983 {
2984 	struct backref_node *node;
2985 	struct btrfs_path *path;
2986 	struct tree_block *block;
2987 	struct rb_node *rb_node;
2988 	int ret;
2989 	int err = 0;
2990 
2991 	path = btrfs_alloc_path();
2992 	if (!path) {
2993 		err = -ENOMEM;
2994 		goto out_free_blocks;
2995 	}
2996 
2997 	rb_node = rb_first(blocks);
2998 	while (rb_node) {
2999 		block = rb_entry(rb_node, struct tree_block, rb_node);
3000 		if (!block->key_ready)
3001 			readahead_tree_block(rc->extent_root, block->bytenr);
3002 		rb_node = rb_next(rb_node);
3003 	}
3004 
3005 	rb_node = rb_first(blocks);
3006 	while (rb_node) {
3007 		block = rb_entry(rb_node, struct tree_block, rb_node);
3008 		if (!block->key_ready) {
3009 			err = get_tree_block_key(rc, block);
3010 			if (err)
3011 				goto out_free_path;
3012 		}
3013 		rb_node = rb_next(rb_node);
3014 	}
3015 
3016 	rb_node = rb_first(blocks);
3017 	while (rb_node) {
3018 		block = rb_entry(rb_node, struct tree_block, rb_node);
3019 
3020 		node = build_backref_tree(rc, &block->key,
3021 					  block->level, block->bytenr);
3022 		if (IS_ERR(node)) {
3023 			err = PTR_ERR(node);
3024 			goto out;
3025 		}
3026 
3027 		ret = relocate_tree_block(trans, rc, node, &block->key,
3028 					  path);
3029 		if (ret < 0) {
3030 			if (ret != -EAGAIN || rb_node == rb_first(blocks))
3031 				err = ret;
3032 			goto out;
3033 		}
3034 		rb_node = rb_next(rb_node);
3035 	}
3036 out:
3037 	err = finish_pending_nodes(trans, rc, path, err);
3038 
3039 out_free_path:
3040 	btrfs_free_path(path);
3041 out_free_blocks:
3042 	free_block_list(blocks);
3043 	return err;
3044 }
3045 
3046 static noinline_for_stack
3047 int prealloc_file_extent_cluster(struct inode *inode,
3048 				 struct file_extent_cluster *cluster)
3049 {
3050 	u64 alloc_hint = 0;
3051 	u64 start;
3052 	u64 end;
3053 	u64 offset = BTRFS_I(inode)->index_cnt;
3054 	u64 num_bytes;
3055 	int nr = 0;
3056 	int ret = 0;
3057 	u64 prealloc_start = cluster->start - offset;
3058 	u64 prealloc_end = cluster->end - offset;
3059 	u64 cur_offset;
3060 
3061 	BUG_ON(cluster->start != cluster->boundary[0]);
3062 	inode_lock(inode);
3063 
3064 	ret = btrfs_check_data_free_space(inode, prealloc_start,
3065 					  prealloc_end + 1 - prealloc_start);
3066 	if (ret)
3067 		goto out;
3068 
3069 	cur_offset = prealloc_start;
3070 	while (nr < cluster->nr) {
3071 		start = cluster->boundary[nr] - offset;
3072 		if (nr + 1 < cluster->nr)
3073 			end = cluster->boundary[nr + 1] - 1 - offset;
3074 		else
3075 			end = cluster->end - offset;
3076 
3077 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3078 		num_bytes = end + 1 - start;
3079 		if (cur_offset < start)
3080 			btrfs_free_reserved_data_space(inode, cur_offset,
3081 					start - cur_offset);
3082 		ret = btrfs_prealloc_file_range(inode, 0, start,
3083 						num_bytes, num_bytes,
3084 						end + 1, &alloc_hint);
3085 		cur_offset = end + 1;
3086 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3087 		if (ret)
3088 			break;
3089 		nr++;
3090 	}
3091 	if (cur_offset < prealloc_end)
3092 		btrfs_free_reserved_data_space(inode, cur_offset,
3093 				       prealloc_end + 1 - cur_offset);
3094 out:
3095 	inode_unlock(inode);
3096 	return ret;
3097 }
3098 
3099 static noinline_for_stack
3100 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3101 			 u64 block_start)
3102 {
3103 	struct btrfs_root *root = BTRFS_I(inode)->root;
3104 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3105 	struct extent_map *em;
3106 	int ret = 0;
3107 
3108 	em = alloc_extent_map();
3109 	if (!em)
3110 		return -ENOMEM;
3111 
3112 	em->start = start;
3113 	em->len = end + 1 - start;
3114 	em->block_len = em->len;
3115 	em->block_start = block_start;
3116 	em->bdev = root->fs_info->fs_devices->latest_bdev;
3117 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3118 
3119 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3120 	while (1) {
3121 		write_lock(&em_tree->lock);
3122 		ret = add_extent_mapping(em_tree, em, 0);
3123 		write_unlock(&em_tree->lock);
3124 		if (ret != -EEXIST) {
3125 			free_extent_map(em);
3126 			break;
3127 		}
3128 		btrfs_drop_extent_cache(inode, start, end, 0);
3129 	}
3130 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3131 	return ret;
3132 }
3133 
3134 static int relocate_file_extent_cluster(struct inode *inode,
3135 					struct file_extent_cluster *cluster)
3136 {
3137 	u64 page_start;
3138 	u64 page_end;
3139 	u64 offset = BTRFS_I(inode)->index_cnt;
3140 	unsigned long index;
3141 	unsigned long last_index;
3142 	struct page *page;
3143 	struct file_ra_state *ra;
3144 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3145 	int nr = 0;
3146 	int ret = 0;
3147 
3148 	if (!cluster->nr)
3149 		return 0;
3150 
3151 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3152 	if (!ra)
3153 		return -ENOMEM;
3154 
3155 	ret = prealloc_file_extent_cluster(inode, cluster);
3156 	if (ret)
3157 		goto out;
3158 
3159 	file_ra_state_init(ra, inode->i_mapping);
3160 
3161 	ret = setup_extent_mapping(inode, cluster->start - offset,
3162 				   cluster->end - offset, cluster->start);
3163 	if (ret)
3164 		goto out;
3165 
3166 	index = (cluster->start - offset) >> PAGE_SHIFT;
3167 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3168 	while (index <= last_index) {
3169 		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
3170 		if (ret)
3171 			goto out;
3172 
3173 		page = find_lock_page(inode->i_mapping, index);
3174 		if (!page) {
3175 			page_cache_sync_readahead(inode->i_mapping,
3176 						  ra, NULL, index,
3177 						  last_index + 1 - index);
3178 			page = find_or_create_page(inode->i_mapping, index,
3179 						   mask);
3180 			if (!page) {
3181 				btrfs_delalloc_release_metadata(inode,
3182 							PAGE_SIZE);
3183 				ret = -ENOMEM;
3184 				goto out;
3185 			}
3186 		}
3187 
3188 		if (PageReadahead(page)) {
3189 			page_cache_async_readahead(inode->i_mapping,
3190 						   ra, NULL, page, index,
3191 						   last_index + 1 - index);
3192 		}
3193 
3194 		if (!PageUptodate(page)) {
3195 			btrfs_readpage(NULL, page);
3196 			lock_page(page);
3197 			if (!PageUptodate(page)) {
3198 				unlock_page(page);
3199 				put_page(page);
3200 				btrfs_delalloc_release_metadata(inode,
3201 							PAGE_SIZE);
3202 				ret = -EIO;
3203 				goto out;
3204 			}
3205 		}
3206 
3207 		page_start = page_offset(page);
3208 		page_end = page_start + PAGE_SIZE - 1;
3209 
3210 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3211 
3212 		set_page_extent_mapped(page);
3213 
3214 		if (nr < cluster->nr &&
3215 		    page_start + offset == cluster->boundary[nr]) {
3216 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3217 					page_start, page_end,
3218 					EXTENT_BOUNDARY);
3219 			nr++;
3220 		}
3221 
3222 		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
3223 		set_page_dirty(page);
3224 
3225 		unlock_extent(&BTRFS_I(inode)->io_tree,
3226 			      page_start, page_end);
3227 		unlock_page(page);
3228 		put_page(page);
3229 
3230 		index++;
3231 		balance_dirty_pages_ratelimited(inode->i_mapping);
3232 		btrfs_throttle(BTRFS_I(inode)->root);
3233 	}
3234 	WARN_ON(nr != cluster->nr);
3235 out:
3236 	kfree(ra);
3237 	return ret;
3238 }
3239 
3240 static noinline_for_stack
3241 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3242 			 struct file_extent_cluster *cluster)
3243 {
3244 	int ret;
3245 
3246 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3247 		ret = relocate_file_extent_cluster(inode, cluster);
3248 		if (ret)
3249 			return ret;
3250 		cluster->nr = 0;
3251 	}
3252 
3253 	if (!cluster->nr)
3254 		cluster->start = extent_key->objectid;
3255 	else
3256 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3257 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3258 	cluster->boundary[cluster->nr] = extent_key->objectid;
3259 	cluster->nr++;
3260 
3261 	if (cluster->nr >= MAX_EXTENTS) {
3262 		ret = relocate_file_extent_cluster(inode, cluster);
3263 		if (ret)
3264 			return ret;
3265 		cluster->nr = 0;
3266 	}
3267 	return 0;
3268 }
3269 
3270 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3271 static int get_ref_objectid_v0(struct reloc_control *rc,
3272 			       struct btrfs_path *path,
3273 			       struct btrfs_key *extent_key,
3274 			       u64 *ref_objectid, int *path_change)
3275 {
3276 	struct btrfs_key key;
3277 	struct extent_buffer *leaf;
3278 	struct btrfs_extent_ref_v0 *ref0;
3279 	int ret;
3280 	int slot;
3281 
3282 	leaf = path->nodes[0];
3283 	slot = path->slots[0];
3284 	while (1) {
3285 		if (slot >= btrfs_header_nritems(leaf)) {
3286 			ret = btrfs_next_leaf(rc->extent_root, path);
3287 			if (ret < 0)
3288 				return ret;
3289 			BUG_ON(ret > 0);
3290 			leaf = path->nodes[0];
3291 			slot = path->slots[0];
3292 			if (path_change)
3293 				*path_change = 1;
3294 		}
3295 		btrfs_item_key_to_cpu(leaf, &key, slot);
3296 		if (key.objectid != extent_key->objectid)
3297 			return -ENOENT;
3298 
3299 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3300 			slot++;
3301 			continue;
3302 		}
3303 		ref0 = btrfs_item_ptr(leaf, slot,
3304 				struct btrfs_extent_ref_v0);
3305 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3306 		break;
3307 	}
3308 	return 0;
3309 }
3310 #endif
3311 
3312 /*
3313  * helper to add a tree block to the list.
3314  * the major work is getting the generation and level of the block
3315  */
3316 static int add_tree_block(struct reloc_control *rc,
3317 			  struct btrfs_key *extent_key,
3318 			  struct btrfs_path *path,
3319 			  struct rb_root *blocks)
3320 {
3321 	struct extent_buffer *eb;
3322 	struct btrfs_extent_item *ei;
3323 	struct btrfs_tree_block_info *bi;
3324 	struct tree_block *block;
3325 	struct rb_node *rb_node;
3326 	u32 item_size;
3327 	int level = -1;
3328 	u64 generation;
3329 
3330 	eb =  path->nodes[0];
3331 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3332 
3333 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3334 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3335 		ei = btrfs_item_ptr(eb, path->slots[0],
3336 				struct btrfs_extent_item);
3337 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3338 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3339 			level = btrfs_tree_block_level(eb, bi);
3340 		} else {
3341 			level = (int)extent_key->offset;
3342 		}
3343 		generation = btrfs_extent_generation(eb, ei);
3344 	} else {
3345 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3346 		u64 ref_owner;
3347 		int ret;
3348 
3349 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3350 		ret = get_ref_objectid_v0(rc, path, extent_key,
3351 					  &ref_owner, NULL);
3352 		if (ret < 0)
3353 			return ret;
3354 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3355 		level = (int)ref_owner;
3356 		/* FIXME: get real generation */
3357 		generation = 0;
3358 #else
3359 		BUG();
3360 #endif
3361 	}
3362 
3363 	btrfs_release_path(path);
3364 
3365 	BUG_ON(level == -1);
3366 
3367 	block = kmalloc(sizeof(*block), GFP_NOFS);
3368 	if (!block)
3369 		return -ENOMEM;
3370 
3371 	block->bytenr = extent_key->objectid;
3372 	block->key.objectid = rc->extent_root->nodesize;
3373 	block->key.offset = generation;
3374 	block->level = level;
3375 	block->key_ready = 0;
3376 
3377 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3378 	if (rb_node)
3379 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3380 
3381 	return 0;
3382 }
3383 
3384 /*
3385  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3386  */
3387 static int __add_tree_block(struct reloc_control *rc,
3388 			    u64 bytenr, u32 blocksize,
3389 			    struct rb_root *blocks)
3390 {
3391 	struct btrfs_path *path;
3392 	struct btrfs_key key;
3393 	int ret;
3394 	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3395 					SKINNY_METADATA);
3396 
3397 	if (tree_block_processed(bytenr, rc))
3398 		return 0;
3399 
3400 	if (tree_search(blocks, bytenr))
3401 		return 0;
3402 
3403 	path = btrfs_alloc_path();
3404 	if (!path)
3405 		return -ENOMEM;
3406 again:
3407 	key.objectid = bytenr;
3408 	if (skinny) {
3409 		key.type = BTRFS_METADATA_ITEM_KEY;
3410 		key.offset = (u64)-1;
3411 	} else {
3412 		key.type = BTRFS_EXTENT_ITEM_KEY;
3413 		key.offset = blocksize;
3414 	}
3415 
3416 	path->search_commit_root = 1;
3417 	path->skip_locking = 1;
3418 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3419 	if (ret < 0)
3420 		goto out;
3421 
3422 	if (ret > 0 && skinny) {
3423 		if (path->slots[0]) {
3424 			path->slots[0]--;
3425 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3426 					      path->slots[0]);
3427 			if (key.objectid == bytenr &&
3428 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3429 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3430 			      key.offset == blocksize)))
3431 				ret = 0;
3432 		}
3433 
3434 		if (ret) {
3435 			skinny = false;
3436 			btrfs_release_path(path);
3437 			goto again;
3438 		}
3439 	}
3440 	BUG_ON(ret);
3441 
3442 	ret = add_tree_block(rc, &key, path, blocks);
3443 out:
3444 	btrfs_free_path(path);
3445 	return ret;
3446 }
3447 
3448 /*
3449  * helper to check if the block use full backrefs for pointers in it
3450  */
3451 static int block_use_full_backref(struct reloc_control *rc,
3452 				  struct extent_buffer *eb)
3453 {
3454 	u64 flags;
3455 	int ret;
3456 
3457 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3458 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3459 		return 1;
3460 
3461 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3462 				       eb->start, btrfs_header_level(eb), 1,
3463 				       NULL, &flags);
3464 	BUG_ON(ret);
3465 
3466 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3467 		ret = 1;
3468 	else
3469 		ret = 0;
3470 	return ret;
3471 }
3472 
3473 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3474 				    struct btrfs_block_group_cache *block_group,
3475 				    struct inode *inode,
3476 				    u64 ino)
3477 {
3478 	struct btrfs_key key;
3479 	struct btrfs_root *root = fs_info->tree_root;
3480 	struct btrfs_trans_handle *trans;
3481 	int ret = 0;
3482 
3483 	if (inode)
3484 		goto truncate;
3485 
3486 	key.objectid = ino;
3487 	key.type = BTRFS_INODE_ITEM_KEY;
3488 	key.offset = 0;
3489 
3490 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3491 	if (IS_ERR(inode) || is_bad_inode(inode)) {
3492 		if (!IS_ERR(inode))
3493 			iput(inode);
3494 		return -ENOENT;
3495 	}
3496 
3497 truncate:
3498 	ret = btrfs_check_trunc_cache_free_space(root,
3499 						 &fs_info->global_block_rsv);
3500 	if (ret)
3501 		goto out;
3502 
3503 	trans = btrfs_join_transaction(root);
3504 	if (IS_ERR(trans)) {
3505 		ret = PTR_ERR(trans);
3506 		goto out;
3507 	}
3508 
3509 	ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3510 
3511 	btrfs_end_transaction(trans, root);
3512 	btrfs_btree_balance_dirty(root);
3513 out:
3514 	iput(inode);
3515 	return ret;
3516 }
3517 
3518 /*
3519  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3520  * this function scans fs tree to find blocks reference the data extent
3521  */
3522 static int find_data_references(struct reloc_control *rc,
3523 				struct btrfs_key *extent_key,
3524 				struct extent_buffer *leaf,
3525 				struct btrfs_extent_data_ref *ref,
3526 				struct rb_root *blocks)
3527 {
3528 	struct btrfs_path *path;
3529 	struct tree_block *block;
3530 	struct btrfs_root *root;
3531 	struct btrfs_file_extent_item *fi;
3532 	struct rb_node *rb_node;
3533 	struct btrfs_key key;
3534 	u64 ref_root;
3535 	u64 ref_objectid;
3536 	u64 ref_offset;
3537 	u32 ref_count;
3538 	u32 nritems;
3539 	int err = 0;
3540 	int added = 0;
3541 	int counted;
3542 	int ret;
3543 
3544 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3545 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3546 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3547 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3548 
3549 	/*
3550 	 * This is an extent belonging to the free space cache, lets just delete
3551 	 * it and redo the search.
3552 	 */
3553 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3554 		ret = delete_block_group_cache(rc->extent_root->fs_info,
3555 					       rc->block_group,
3556 					       NULL, ref_objectid);
3557 		if (ret != -ENOENT)
3558 			return ret;
3559 		ret = 0;
3560 	}
3561 
3562 	path = btrfs_alloc_path();
3563 	if (!path)
3564 		return -ENOMEM;
3565 	path->reada = READA_FORWARD;
3566 
3567 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
3568 	if (IS_ERR(root)) {
3569 		err = PTR_ERR(root);
3570 		goto out;
3571 	}
3572 
3573 	key.objectid = ref_objectid;
3574 	key.type = BTRFS_EXTENT_DATA_KEY;
3575 	if (ref_offset > ((u64)-1 << 32))
3576 		key.offset = 0;
3577 	else
3578 		key.offset = ref_offset;
3579 
3580 	path->search_commit_root = 1;
3581 	path->skip_locking = 1;
3582 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3583 	if (ret < 0) {
3584 		err = ret;
3585 		goto out;
3586 	}
3587 
3588 	leaf = path->nodes[0];
3589 	nritems = btrfs_header_nritems(leaf);
3590 	/*
3591 	 * the references in tree blocks that use full backrefs
3592 	 * are not counted in
3593 	 */
3594 	if (block_use_full_backref(rc, leaf))
3595 		counted = 0;
3596 	else
3597 		counted = 1;
3598 	rb_node = tree_search(blocks, leaf->start);
3599 	if (rb_node) {
3600 		if (counted)
3601 			added = 1;
3602 		else
3603 			path->slots[0] = nritems;
3604 	}
3605 
3606 	while (ref_count > 0) {
3607 		while (path->slots[0] >= nritems) {
3608 			ret = btrfs_next_leaf(root, path);
3609 			if (ret < 0) {
3610 				err = ret;
3611 				goto out;
3612 			}
3613 			if (WARN_ON(ret > 0))
3614 				goto out;
3615 
3616 			leaf = path->nodes[0];
3617 			nritems = btrfs_header_nritems(leaf);
3618 			added = 0;
3619 
3620 			if (block_use_full_backref(rc, leaf))
3621 				counted = 0;
3622 			else
3623 				counted = 1;
3624 			rb_node = tree_search(blocks, leaf->start);
3625 			if (rb_node) {
3626 				if (counted)
3627 					added = 1;
3628 				else
3629 					path->slots[0] = nritems;
3630 			}
3631 		}
3632 
3633 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3634 		if (WARN_ON(key.objectid != ref_objectid ||
3635 		    key.type != BTRFS_EXTENT_DATA_KEY))
3636 			break;
3637 
3638 		fi = btrfs_item_ptr(leaf, path->slots[0],
3639 				    struct btrfs_file_extent_item);
3640 
3641 		if (btrfs_file_extent_type(leaf, fi) ==
3642 		    BTRFS_FILE_EXTENT_INLINE)
3643 			goto next;
3644 
3645 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3646 		    extent_key->objectid)
3647 			goto next;
3648 
3649 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3650 		if (key.offset != ref_offset)
3651 			goto next;
3652 
3653 		if (counted)
3654 			ref_count--;
3655 		if (added)
3656 			goto next;
3657 
3658 		if (!tree_block_processed(leaf->start, rc)) {
3659 			block = kmalloc(sizeof(*block), GFP_NOFS);
3660 			if (!block) {
3661 				err = -ENOMEM;
3662 				break;
3663 			}
3664 			block->bytenr = leaf->start;
3665 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3666 			block->level = 0;
3667 			block->key_ready = 1;
3668 			rb_node = tree_insert(blocks, block->bytenr,
3669 					      &block->rb_node);
3670 			if (rb_node)
3671 				backref_tree_panic(rb_node, -EEXIST,
3672 						   block->bytenr);
3673 		}
3674 		if (counted)
3675 			added = 1;
3676 		else
3677 			path->slots[0] = nritems;
3678 next:
3679 		path->slots[0]++;
3680 
3681 	}
3682 out:
3683 	btrfs_free_path(path);
3684 	return err;
3685 }
3686 
3687 /*
3688  * helper to find all tree blocks that reference a given data extent
3689  */
3690 static noinline_for_stack
3691 int add_data_references(struct reloc_control *rc,
3692 			struct btrfs_key *extent_key,
3693 			struct btrfs_path *path,
3694 			struct rb_root *blocks)
3695 {
3696 	struct btrfs_key key;
3697 	struct extent_buffer *eb;
3698 	struct btrfs_extent_data_ref *dref;
3699 	struct btrfs_extent_inline_ref *iref;
3700 	unsigned long ptr;
3701 	unsigned long end;
3702 	u32 blocksize = rc->extent_root->nodesize;
3703 	int ret = 0;
3704 	int err = 0;
3705 
3706 	eb = path->nodes[0];
3707 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3708 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3709 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3710 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3711 		ptr = end;
3712 	else
3713 #endif
3714 		ptr += sizeof(struct btrfs_extent_item);
3715 
3716 	while (ptr < end) {
3717 		iref = (struct btrfs_extent_inline_ref *)ptr;
3718 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3719 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3720 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3721 			ret = __add_tree_block(rc, key.offset, blocksize,
3722 					       blocks);
3723 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3724 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3725 			ret = find_data_references(rc, extent_key,
3726 						   eb, dref, blocks);
3727 		} else {
3728 			BUG();
3729 		}
3730 		if (ret) {
3731 			err = ret;
3732 			goto out;
3733 		}
3734 		ptr += btrfs_extent_inline_ref_size(key.type);
3735 	}
3736 	WARN_ON(ptr > end);
3737 
3738 	while (1) {
3739 		cond_resched();
3740 		eb = path->nodes[0];
3741 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3742 			ret = btrfs_next_leaf(rc->extent_root, path);
3743 			if (ret < 0) {
3744 				err = ret;
3745 				break;
3746 			}
3747 			if (ret > 0)
3748 				break;
3749 			eb = path->nodes[0];
3750 		}
3751 
3752 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3753 		if (key.objectid != extent_key->objectid)
3754 			break;
3755 
3756 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3757 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3758 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3759 #else
3760 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3761 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3762 #endif
3763 			ret = __add_tree_block(rc, key.offset, blocksize,
3764 					       blocks);
3765 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3766 			dref = btrfs_item_ptr(eb, path->slots[0],
3767 					      struct btrfs_extent_data_ref);
3768 			ret = find_data_references(rc, extent_key,
3769 						   eb, dref, blocks);
3770 		} else {
3771 			ret = 0;
3772 		}
3773 		if (ret) {
3774 			err = ret;
3775 			break;
3776 		}
3777 		path->slots[0]++;
3778 	}
3779 out:
3780 	btrfs_release_path(path);
3781 	if (err)
3782 		free_block_list(blocks);
3783 	return err;
3784 }
3785 
3786 /*
3787  * helper to find next unprocessed extent
3788  */
3789 static noinline_for_stack
3790 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3791 		     struct btrfs_key *extent_key)
3792 {
3793 	struct btrfs_key key;
3794 	struct extent_buffer *leaf;
3795 	u64 start, end, last;
3796 	int ret;
3797 
3798 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3799 	while (1) {
3800 		cond_resched();
3801 		if (rc->search_start >= last) {
3802 			ret = 1;
3803 			break;
3804 		}
3805 
3806 		key.objectid = rc->search_start;
3807 		key.type = BTRFS_EXTENT_ITEM_KEY;
3808 		key.offset = 0;
3809 
3810 		path->search_commit_root = 1;
3811 		path->skip_locking = 1;
3812 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3813 					0, 0);
3814 		if (ret < 0)
3815 			break;
3816 next:
3817 		leaf = path->nodes[0];
3818 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3819 			ret = btrfs_next_leaf(rc->extent_root, path);
3820 			if (ret != 0)
3821 				break;
3822 			leaf = path->nodes[0];
3823 		}
3824 
3825 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3826 		if (key.objectid >= last) {
3827 			ret = 1;
3828 			break;
3829 		}
3830 
3831 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3832 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3833 			path->slots[0]++;
3834 			goto next;
3835 		}
3836 
3837 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3838 		    key.objectid + key.offset <= rc->search_start) {
3839 			path->slots[0]++;
3840 			goto next;
3841 		}
3842 
3843 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3844 		    key.objectid + rc->extent_root->nodesize <=
3845 		    rc->search_start) {
3846 			path->slots[0]++;
3847 			goto next;
3848 		}
3849 
3850 		ret = find_first_extent_bit(&rc->processed_blocks,
3851 					    key.objectid, &start, &end,
3852 					    EXTENT_DIRTY, NULL);
3853 
3854 		if (ret == 0 && start <= key.objectid) {
3855 			btrfs_release_path(path);
3856 			rc->search_start = end + 1;
3857 		} else {
3858 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3859 				rc->search_start = key.objectid + key.offset;
3860 			else
3861 				rc->search_start = key.objectid +
3862 					rc->extent_root->nodesize;
3863 			memcpy(extent_key, &key, sizeof(key));
3864 			return 0;
3865 		}
3866 	}
3867 	btrfs_release_path(path);
3868 	return ret;
3869 }
3870 
3871 static void set_reloc_control(struct reloc_control *rc)
3872 {
3873 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3874 
3875 	mutex_lock(&fs_info->reloc_mutex);
3876 	fs_info->reloc_ctl = rc;
3877 	mutex_unlock(&fs_info->reloc_mutex);
3878 }
3879 
3880 static void unset_reloc_control(struct reloc_control *rc)
3881 {
3882 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3883 
3884 	mutex_lock(&fs_info->reloc_mutex);
3885 	fs_info->reloc_ctl = NULL;
3886 	mutex_unlock(&fs_info->reloc_mutex);
3887 }
3888 
3889 static int check_extent_flags(u64 flags)
3890 {
3891 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3892 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3893 		return 1;
3894 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3895 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3896 		return 1;
3897 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3898 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3899 		return 1;
3900 	return 0;
3901 }
3902 
3903 static noinline_for_stack
3904 int prepare_to_relocate(struct reloc_control *rc)
3905 {
3906 	struct btrfs_trans_handle *trans;
3907 	int ret;
3908 
3909 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3910 					      BTRFS_BLOCK_RSV_TEMP);
3911 	if (!rc->block_rsv)
3912 		return -ENOMEM;
3913 
3914 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3915 	rc->search_start = rc->block_group->key.objectid;
3916 	rc->extents_found = 0;
3917 	rc->nodes_relocated = 0;
3918 	rc->merging_rsv_size = 0;
3919 	rc->reserved_bytes = 0;
3920 	rc->block_rsv->size = rc->extent_root->nodesize *
3921 			      RELOCATION_RESERVED_NODES;
3922 	ret = btrfs_block_rsv_refill(rc->extent_root,
3923 				     rc->block_rsv, rc->block_rsv->size,
3924 				     BTRFS_RESERVE_FLUSH_ALL);
3925 	if (ret)
3926 		return ret;
3927 
3928 	rc->create_reloc_tree = 1;
3929 	set_reloc_control(rc);
3930 
3931 	trans = btrfs_join_transaction(rc->extent_root);
3932 	if (IS_ERR(trans)) {
3933 		unset_reloc_control(rc);
3934 		/*
3935 		 * extent tree is not a ref_cow tree and has no reloc_root to
3936 		 * cleanup.  And callers are responsible to free the above
3937 		 * block rsv.
3938 		 */
3939 		return PTR_ERR(trans);
3940 	}
3941 	btrfs_commit_transaction(trans, rc->extent_root);
3942 	return 0;
3943 }
3944 
3945 /*
3946  * Qgroup fixer for data chunk relocation.
3947  * The data relocation is done in the following steps
3948  * 1) Copy data extents into data reloc tree
3949  * 2) Create tree reloc tree(special snapshot) for related subvolumes
3950  * 3) Modify file extents in tree reloc tree
3951  * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
3952  *
3953  * The problem is, data and tree reloc tree are not accounted to qgroup,
3954  * and 4) will only info qgroup to track tree blocks change, not file extents
3955  * in the tree blocks.
3956  *
3957  * The good news is, related data extents are all in data reloc tree, so we
3958  * only need to info qgroup to track all file extents in data reloc tree
3959  * before commit trans.
3960  */
3961 static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
3962 					     struct reloc_control *rc)
3963 {
3964 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3965 	struct inode *inode = rc->data_inode;
3966 	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
3967 	struct btrfs_path *path;
3968 	struct btrfs_key key;
3969 	int ret = 0;
3970 
3971 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3972 		return 0;
3973 
3974 	/*
3975 	 * Only for stage where we update data pointers the qgroup fix is
3976 	 * valid.
3977 	 * For MOVING_DATA stage, we will miss the timing of swapping tree
3978 	 * blocks, and won't fix it.
3979 	 */
3980 	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
3981 		return 0;
3982 
3983 	path = btrfs_alloc_path();
3984 	if (!path)
3985 		return -ENOMEM;
3986 	key.objectid = btrfs_ino(inode);
3987 	key.type = BTRFS_EXTENT_DATA_KEY;
3988 	key.offset = 0;
3989 
3990 	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
3991 	if (ret < 0)
3992 		goto out;
3993 
3994 	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
3995 	while (1) {
3996 		struct btrfs_file_extent_item *fi;
3997 
3998 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3999 		if (key.objectid > btrfs_ino(inode))
4000 			break;
4001 		if (key.type != BTRFS_EXTENT_DATA_KEY)
4002 			goto next;
4003 		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4004 				    struct btrfs_file_extent_item);
4005 		if (btrfs_file_extent_type(path->nodes[0], fi) !=
4006 				BTRFS_FILE_EXTENT_REG)
4007 			goto next;
4008 		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
4009 			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
4010 			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
4011 			GFP_NOFS);
4012 		if (ret < 0)
4013 			break;
4014 next:
4015 		ret = btrfs_next_item(data_reloc_root, path);
4016 		if (ret < 0)
4017 			break;
4018 		if (ret > 0) {
4019 			ret = 0;
4020 			break;
4021 		}
4022 	}
4023 	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
4024 out:
4025 	btrfs_free_path(path);
4026 	return ret;
4027 }
4028 
4029 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4030 {
4031 	struct rb_root blocks = RB_ROOT;
4032 	struct btrfs_key key;
4033 	struct btrfs_trans_handle *trans = NULL;
4034 	struct btrfs_path *path;
4035 	struct btrfs_extent_item *ei;
4036 	u64 flags;
4037 	u32 item_size;
4038 	int ret;
4039 	int err = 0;
4040 	int progress = 0;
4041 
4042 	path = btrfs_alloc_path();
4043 	if (!path)
4044 		return -ENOMEM;
4045 	path->reada = READA_FORWARD;
4046 
4047 	ret = prepare_to_relocate(rc);
4048 	if (ret) {
4049 		err = ret;
4050 		goto out_free;
4051 	}
4052 
4053 	while (1) {
4054 		rc->reserved_bytes = 0;
4055 		ret = btrfs_block_rsv_refill(rc->extent_root,
4056 					rc->block_rsv, rc->block_rsv->size,
4057 					BTRFS_RESERVE_FLUSH_ALL);
4058 		if (ret) {
4059 			err = ret;
4060 			break;
4061 		}
4062 		progress++;
4063 		trans = btrfs_start_transaction(rc->extent_root, 0);
4064 		if (IS_ERR(trans)) {
4065 			err = PTR_ERR(trans);
4066 			trans = NULL;
4067 			break;
4068 		}
4069 restart:
4070 		if (update_backref_cache(trans, &rc->backref_cache)) {
4071 			btrfs_end_transaction(trans, rc->extent_root);
4072 			continue;
4073 		}
4074 
4075 		ret = find_next_extent(rc, path, &key);
4076 		if (ret < 0)
4077 			err = ret;
4078 		if (ret != 0)
4079 			break;
4080 
4081 		rc->extents_found++;
4082 
4083 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4084 				    struct btrfs_extent_item);
4085 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4086 		if (item_size >= sizeof(*ei)) {
4087 			flags = btrfs_extent_flags(path->nodes[0], ei);
4088 			ret = check_extent_flags(flags);
4089 			BUG_ON(ret);
4090 
4091 		} else {
4092 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4093 			u64 ref_owner;
4094 			int path_change = 0;
4095 
4096 			BUG_ON(item_size !=
4097 			       sizeof(struct btrfs_extent_item_v0));
4098 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
4099 						  &path_change);
4100 			if (ret < 0) {
4101 				err = ret;
4102 				break;
4103 			}
4104 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
4105 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
4106 			else
4107 				flags = BTRFS_EXTENT_FLAG_DATA;
4108 
4109 			if (path_change) {
4110 				btrfs_release_path(path);
4111 
4112 				path->search_commit_root = 1;
4113 				path->skip_locking = 1;
4114 				ret = btrfs_search_slot(NULL, rc->extent_root,
4115 							&key, path, 0, 0);
4116 				if (ret < 0) {
4117 					err = ret;
4118 					break;
4119 				}
4120 				BUG_ON(ret > 0);
4121 			}
4122 #else
4123 			BUG();
4124 #endif
4125 		}
4126 
4127 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4128 			ret = add_tree_block(rc, &key, path, &blocks);
4129 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4130 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4131 			ret = add_data_references(rc, &key, path, &blocks);
4132 		} else {
4133 			btrfs_release_path(path);
4134 			ret = 0;
4135 		}
4136 		if (ret < 0) {
4137 			err = ret;
4138 			break;
4139 		}
4140 
4141 		if (!RB_EMPTY_ROOT(&blocks)) {
4142 			ret = relocate_tree_blocks(trans, rc, &blocks);
4143 			if (ret < 0) {
4144 				/*
4145 				 * if we fail to relocate tree blocks, force to update
4146 				 * backref cache when committing transaction.
4147 				 */
4148 				rc->backref_cache.last_trans = trans->transid - 1;
4149 
4150 				if (ret != -EAGAIN) {
4151 					err = ret;
4152 					break;
4153 				}
4154 				rc->extents_found--;
4155 				rc->search_start = key.objectid;
4156 			}
4157 		}
4158 
4159 		btrfs_end_transaction_throttle(trans, rc->extent_root);
4160 		btrfs_btree_balance_dirty(rc->extent_root);
4161 		trans = NULL;
4162 
4163 		if (rc->stage == MOVE_DATA_EXTENTS &&
4164 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4165 			rc->found_file_extent = 1;
4166 			ret = relocate_data_extent(rc->data_inode,
4167 						   &key, &rc->cluster);
4168 			if (ret < 0) {
4169 				err = ret;
4170 				break;
4171 			}
4172 		}
4173 	}
4174 	if (trans && progress && err == -ENOSPC) {
4175 		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
4176 					      rc->block_group->flags);
4177 		if (ret == 1) {
4178 			err = 0;
4179 			progress = 0;
4180 			goto restart;
4181 		}
4182 	}
4183 
4184 	btrfs_release_path(path);
4185 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4186 
4187 	if (trans) {
4188 		btrfs_end_transaction_throttle(trans, rc->extent_root);
4189 		btrfs_btree_balance_dirty(rc->extent_root);
4190 	}
4191 
4192 	if (!err) {
4193 		ret = relocate_file_extent_cluster(rc->data_inode,
4194 						   &rc->cluster);
4195 		if (ret < 0)
4196 			err = ret;
4197 	}
4198 
4199 	rc->create_reloc_tree = 0;
4200 	set_reloc_control(rc);
4201 
4202 	backref_cache_cleanup(&rc->backref_cache);
4203 	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4204 
4205 	err = prepare_to_merge(rc, err);
4206 
4207 	merge_reloc_roots(rc);
4208 
4209 	rc->merge_reloc_tree = 0;
4210 	unset_reloc_control(rc);
4211 	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4212 
4213 	/* get rid of pinned extents */
4214 	trans = btrfs_join_transaction(rc->extent_root);
4215 	if (IS_ERR(trans)) {
4216 		err = PTR_ERR(trans);
4217 		goto out_free;
4218 	}
4219 	ret = qgroup_fix_relocated_data_extents(trans, rc);
4220 	if (ret < 0) {
4221 		btrfs_abort_transaction(trans, ret);
4222 		if (!err)
4223 			err = ret;
4224 		goto out_free;
4225 	}
4226 	btrfs_commit_transaction(trans, rc->extent_root);
4227 out_free:
4228 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4229 	btrfs_free_path(path);
4230 	return err;
4231 }
4232 
4233 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4234 				 struct btrfs_root *root, u64 objectid)
4235 {
4236 	struct btrfs_path *path;
4237 	struct btrfs_inode_item *item;
4238 	struct extent_buffer *leaf;
4239 	int ret;
4240 
4241 	path = btrfs_alloc_path();
4242 	if (!path)
4243 		return -ENOMEM;
4244 
4245 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4246 	if (ret)
4247 		goto out;
4248 
4249 	leaf = path->nodes[0];
4250 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4251 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4252 	btrfs_set_inode_generation(leaf, item, 1);
4253 	btrfs_set_inode_size(leaf, item, 0);
4254 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4255 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4256 					  BTRFS_INODE_PREALLOC);
4257 	btrfs_mark_buffer_dirty(leaf);
4258 out:
4259 	btrfs_free_path(path);
4260 	return ret;
4261 }
4262 
4263 /*
4264  * helper to create inode for data relocation.
4265  * the inode is in data relocation tree and its link count is 0
4266  */
4267 static noinline_for_stack
4268 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4269 				 struct btrfs_block_group_cache *group)
4270 {
4271 	struct inode *inode = NULL;
4272 	struct btrfs_trans_handle *trans;
4273 	struct btrfs_root *root;
4274 	struct btrfs_key key;
4275 	u64 objectid;
4276 	int err = 0;
4277 
4278 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4279 	if (IS_ERR(root))
4280 		return ERR_CAST(root);
4281 
4282 	trans = btrfs_start_transaction(root, 6);
4283 	if (IS_ERR(trans))
4284 		return ERR_CAST(trans);
4285 
4286 	err = btrfs_find_free_objectid(root, &objectid);
4287 	if (err)
4288 		goto out;
4289 
4290 	err = __insert_orphan_inode(trans, root, objectid);
4291 	BUG_ON(err);
4292 
4293 	key.objectid = objectid;
4294 	key.type = BTRFS_INODE_ITEM_KEY;
4295 	key.offset = 0;
4296 	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4297 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4298 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4299 
4300 	err = btrfs_orphan_add(trans, inode);
4301 out:
4302 	btrfs_end_transaction(trans, root);
4303 	btrfs_btree_balance_dirty(root);
4304 	if (err) {
4305 		if (inode)
4306 			iput(inode);
4307 		inode = ERR_PTR(err);
4308 	}
4309 	return inode;
4310 }
4311 
4312 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4313 {
4314 	struct reloc_control *rc;
4315 
4316 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4317 	if (!rc)
4318 		return NULL;
4319 
4320 	INIT_LIST_HEAD(&rc->reloc_roots);
4321 	backref_cache_init(&rc->backref_cache);
4322 	mapping_tree_init(&rc->reloc_root_tree);
4323 	extent_io_tree_init(&rc->processed_blocks,
4324 			    fs_info->btree_inode->i_mapping);
4325 	return rc;
4326 }
4327 
4328 /*
4329  * function to relocate all extents in a block group.
4330  */
4331 int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4332 {
4333 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
4334 	struct reloc_control *rc;
4335 	struct inode *inode;
4336 	struct btrfs_path *path;
4337 	int ret;
4338 	int rw = 0;
4339 	int err = 0;
4340 
4341 	rc = alloc_reloc_control(fs_info);
4342 	if (!rc)
4343 		return -ENOMEM;
4344 
4345 	rc->extent_root = extent_root;
4346 
4347 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4348 	BUG_ON(!rc->block_group);
4349 
4350 	ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
4351 	if (ret) {
4352 		err = ret;
4353 		goto out;
4354 	}
4355 	rw = 1;
4356 
4357 	path = btrfs_alloc_path();
4358 	if (!path) {
4359 		err = -ENOMEM;
4360 		goto out;
4361 	}
4362 
4363 	inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4364 					path);
4365 	btrfs_free_path(path);
4366 
4367 	if (!IS_ERR(inode))
4368 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4369 	else
4370 		ret = PTR_ERR(inode);
4371 
4372 	if (ret && ret != -ENOENT) {
4373 		err = ret;
4374 		goto out;
4375 	}
4376 
4377 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4378 	if (IS_ERR(rc->data_inode)) {
4379 		err = PTR_ERR(rc->data_inode);
4380 		rc->data_inode = NULL;
4381 		goto out;
4382 	}
4383 
4384 	btrfs_info(extent_root->fs_info,
4385 		   "relocating block group %llu flags %llu",
4386 		   rc->block_group->key.objectid, rc->block_group->flags);
4387 
4388 	btrfs_wait_block_group_reservations(rc->block_group);
4389 	btrfs_wait_nocow_writers(rc->block_group);
4390 	btrfs_wait_ordered_roots(fs_info, -1,
4391 				 rc->block_group->key.objectid,
4392 				 rc->block_group->key.offset);
4393 
4394 	while (1) {
4395 		mutex_lock(&fs_info->cleaner_mutex);
4396 		ret = relocate_block_group(rc);
4397 		mutex_unlock(&fs_info->cleaner_mutex);
4398 		if (ret < 0) {
4399 			err = ret;
4400 			goto out;
4401 		}
4402 
4403 		if (rc->extents_found == 0)
4404 			break;
4405 
4406 		btrfs_info(extent_root->fs_info, "found %llu extents",
4407 			rc->extents_found);
4408 
4409 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4410 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4411 						       (u64)-1);
4412 			if (ret) {
4413 				err = ret;
4414 				goto out;
4415 			}
4416 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4417 						 0, -1);
4418 			rc->stage = UPDATE_DATA_PTRS;
4419 		}
4420 	}
4421 
4422 	WARN_ON(rc->block_group->pinned > 0);
4423 	WARN_ON(rc->block_group->reserved > 0);
4424 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4425 out:
4426 	if (err && rw)
4427 		btrfs_dec_block_group_ro(extent_root, rc->block_group);
4428 	iput(rc->data_inode);
4429 	btrfs_put_block_group(rc->block_group);
4430 	kfree(rc);
4431 	return err;
4432 }
4433 
4434 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4435 {
4436 	struct btrfs_trans_handle *trans;
4437 	int ret, err;
4438 
4439 	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4440 	if (IS_ERR(trans))
4441 		return PTR_ERR(trans);
4442 
4443 	memset(&root->root_item.drop_progress, 0,
4444 		sizeof(root->root_item.drop_progress));
4445 	root->root_item.drop_level = 0;
4446 	btrfs_set_root_refs(&root->root_item, 0);
4447 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
4448 				&root->root_key, &root->root_item);
4449 
4450 	err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4451 	if (err)
4452 		return err;
4453 	return ret;
4454 }
4455 
4456 /*
4457  * recover relocation interrupted by system crash.
4458  *
4459  * this function resumes merging reloc trees with corresponding fs trees.
4460  * this is important for keeping the sharing of tree blocks
4461  */
4462 int btrfs_recover_relocation(struct btrfs_root *root)
4463 {
4464 	LIST_HEAD(reloc_roots);
4465 	struct btrfs_key key;
4466 	struct btrfs_root *fs_root;
4467 	struct btrfs_root *reloc_root;
4468 	struct btrfs_path *path;
4469 	struct extent_buffer *leaf;
4470 	struct reloc_control *rc = NULL;
4471 	struct btrfs_trans_handle *trans;
4472 	int ret;
4473 	int err = 0;
4474 
4475 	path = btrfs_alloc_path();
4476 	if (!path)
4477 		return -ENOMEM;
4478 	path->reada = READA_BACK;
4479 
4480 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4481 	key.type = BTRFS_ROOT_ITEM_KEY;
4482 	key.offset = (u64)-1;
4483 
4484 	while (1) {
4485 		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4486 					path, 0, 0);
4487 		if (ret < 0) {
4488 			err = ret;
4489 			goto out;
4490 		}
4491 		if (ret > 0) {
4492 			if (path->slots[0] == 0)
4493 				break;
4494 			path->slots[0]--;
4495 		}
4496 		leaf = path->nodes[0];
4497 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4498 		btrfs_release_path(path);
4499 
4500 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4501 		    key.type != BTRFS_ROOT_ITEM_KEY)
4502 			break;
4503 
4504 		reloc_root = btrfs_read_fs_root(root, &key);
4505 		if (IS_ERR(reloc_root)) {
4506 			err = PTR_ERR(reloc_root);
4507 			goto out;
4508 		}
4509 
4510 		list_add(&reloc_root->root_list, &reloc_roots);
4511 
4512 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4513 			fs_root = read_fs_root(root->fs_info,
4514 					       reloc_root->root_key.offset);
4515 			if (IS_ERR(fs_root)) {
4516 				ret = PTR_ERR(fs_root);
4517 				if (ret != -ENOENT) {
4518 					err = ret;
4519 					goto out;
4520 				}
4521 				ret = mark_garbage_root(reloc_root);
4522 				if (ret < 0) {
4523 					err = ret;
4524 					goto out;
4525 				}
4526 			}
4527 		}
4528 
4529 		if (key.offset == 0)
4530 			break;
4531 
4532 		key.offset--;
4533 	}
4534 	btrfs_release_path(path);
4535 
4536 	if (list_empty(&reloc_roots))
4537 		goto out;
4538 
4539 	rc = alloc_reloc_control(root->fs_info);
4540 	if (!rc) {
4541 		err = -ENOMEM;
4542 		goto out;
4543 	}
4544 
4545 	rc->extent_root = root->fs_info->extent_root;
4546 
4547 	set_reloc_control(rc);
4548 
4549 	trans = btrfs_join_transaction(rc->extent_root);
4550 	if (IS_ERR(trans)) {
4551 		unset_reloc_control(rc);
4552 		err = PTR_ERR(trans);
4553 		goto out_free;
4554 	}
4555 
4556 	rc->merge_reloc_tree = 1;
4557 
4558 	while (!list_empty(&reloc_roots)) {
4559 		reloc_root = list_entry(reloc_roots.next,
4560 					struct btrfs_root, root_list);
4561 		list_del(&reloc_root->root_list);
4562 
4563 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4564 			list_add_tail(&reloc_root->root_list,
4565 				      &rc->reloc_roots);
4566 			continue;
4567 		}
4568 
4569 		fs_root = read_fs_root(root->fs_info,
4570 				       reloc_root->root_key.offset);
4571 		if (IS_ERR(fs_root)) {
4572 			err = PTR_ERR(fs_root);
4573 			goto out_free;
4574 		}
4575 
4576 		err = __add_reloc_root(reloc_root);
4577 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4578 		fs_root->reloc_root = reloc_root;
4579 	}
4580 
4581 	err = btrfs_commit_transaction(trans, rc->extent_root);
4582 	if (err)
4583 		goto out_free;
4584 
4585 	merge_reloc_roots(rc);
4586 
4587 	unset_reloc_control(rc);
4588 
4589 	trans = btrfs_join_transaction(rc->extent_root);
4590 	if (IS_ERR(trans)) {
4591 		err = PTR_ERR(trans);
4592 		goto out_free;
4593 	}
4594 	err = qgroup_fix_relocated_data_extents(trans, rc);
4595 	if (err < 0) {
4596 		btrfs_abort_transaction(trans, err);
4597 		goto out_free;
4598 	}
4599 	err = btrfs_commit_transaction(trans, rc->extent_root);
4600 out_free:
4601 	kfree(rc);
4602 out:
4603 	if (!list_empty(&reloc_roots))
4604 		free_reloc_roots(&reloc_roots);
4605 
4606 	btrfs_free_path(path);
4607 
4608 	if (err == 0) {
4609 		/* cleanup orphan inode in data relocation tree */
4610 		fs_root = read_fs_root(root->fs_info,
4611 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
4612 		if (IS_ERR(fs_root))
4613 			err = PTR_ERR(fs_root);
4614 		else
4615 			err = btrfs_orphan_cleanup(fs_root);
4616 	}
4617 	return err;
4618 }
4619 
4620 /*
4621  * helper to add ordered checksum for data relocation.
4622  *
4623  * cloning checksum properly handles the nodatasum extents.
4624  * it also saves CPU time to re-calculate the checksum.
4625  */
4626 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4627 {
4628 	struct btrfs_ordered_sum *sums;
4629 	struct btrfs_ordered_extent *ordered;
4630 	struct btrfs_root *root = BTRFS_I(inode)->root;
4631 	int ret;
4632 	u64 disk_bytenr;
4633 	u64 new_bytenr;
4634 	LIST_HEAD(list);
4635 
4636 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4637 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4638 
4639 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4640 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4641 				       disk_bytenr + len - 1, &list, 0);
4642 	if (ret)
4643 		goto out;
4644 
4645 	while (!list_empty(&list)) {
4646 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4647 		list_del_init(&sums->list);
4648 
4649 		/*
4650 		 * We need to offset the new_bytenr based on where the csum is.
4651 		 * We need to do this because we will read in entire prealloc
4652 		 * extents but we may have written to say the middle of the
4653 		 * prealloc extent, so we need to make sure the csum goes with
4654 		 * the right disk offset.
4655 		 *
4656 		 * We can do this because the data reloc inode refers strictly
4657 		 * to the on disk bytes, so we don't have to worry about
4658 		 * disk_len vs real len like with real inodes since it's all
4659 		 * disk length.
4660 		 */
4661 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4662 		sums->bytenr = new_bytenr;
4663 
4664 		btrfs_add_ordered_sum(inode, ordered, sums);
4665 	}
4666 out:
4667 	btrfs_put_ordered_extent(ordered);
4668 	return ret;
4669 }
4670 
4671 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4672 			  struct btrfs_root *root, struct extent_buffer *buf,
4673 			  struct extent_buffer *cow)
4674 {
4675 	struct reloc_control *rc;
4676 	struct backref_node *node;
4677 	int first_cow = 0;
4678 	int level;
4679 	int ret = 0;
4680 
4681 	rc = root->fs_info->reloc_ctl;
4682 	if (!rc)
4683 		return 0;
4684 
4685 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4686 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4687 
4688 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4689 		if (buf == root->node)
4690 			__update_reloc_root(root, cow->start);
4691 	}
4692 
4693 	level = btrfs_header_level(buf);
4694 	if (btrfs_header_generation(buf) <=
4695 	    btrfs_root_last_snapshot(&root->root_item))
4696 		first_cow = 1;
4697 
4698 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4699 	    rc->create_reloc_tree) {
4700 		WARN_ON(!first_cow && level == 0);
4701 
4702 		node = rc->backref_cache.path[level];
4703 		BUG_ON(node->bytenr != buf->start &&
4704 		       node->new_bytenr != buf->start);
4705 
4706 		drop_node_buffer(node);
4707 		extent_buffer_get(cow);
4708 		node->eb = cow;
4709 		node->new_bytenr = cow->start;
4710 
4711 		if (!node->pending) {
4712 			list_move_tail(&node->list,
4713 				       &rc->backref_cache.pending[level]);
4714 			node->pending = 1;
4715 		}
4716 
4717 		if (first_cow)
4718 			__mark_block_processed(rc, node);
4719 
4720 		if (first_cow && level > 0)
4721 			rc->nodes_relocated += buf->len;
4722 	}
4723 
4724 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4725 		ret = replace_file_extents(trans, rc, root, cow);
4726 	return ret;
4727 }
4728 
4729 /*
4730  * called before creating snapshot. it calculates metadata reservation
4731  * required for relocating tree blocks in the snapshot
4732  */
4733 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4734 			      u64 *bytes_to_reserve)
4735 {
4736 	struct btrfs_root *root;
4737 	struct reloc_control *rc;
4738 
4739 	root = pending->root;
4740 	if (!root->reloc_root)
4741 		return;
4742 
4743 	rc = root->fs_info->reloc_ctl;
4744 	if (!rc->merge_reloc_tree)
4745 		return;
4746 
4747 	root = root->reloc_root;
4748 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4749 	/*
4750 	 * relocation is in the stage of merging trees. the space
4751 	 * used by merging a reloc tree is twice the size of
4752 	 * relocated tree nodes in the worst case. half for cowing
4753 	 * the reloc tree, half for cowing the fs tree. the space
4754 	 * used by cowing the reloc tree will be freed after the
4755 	 * tree is dropped. if we create snapshot, cowing the fs
4756 	 * tree may use more space than it frees. so we need
4757 	 * reserve extra space.
4758 	 */
4759 	*bytes_to_reserve += rc->nodes_relocated;
4760 }
4761 
4762 /*
4763  * called after snapshot is created. migrate block reservation
4764  * and create reloc root for the newly created snapshot
4765  */
4766 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4767 			       struct btrfs_pending_snapshot *pending)
4768 {
4769 	struct btrfs_root *root = pending->root;
4770 	struct btrfs_root *reloc_root;
4771 	struct btrfs_root *new_root;
4772 	struct reloc_control *rc;
4773 	int ret;
4774 
4775 	if (!root->reloc_root)
4776 		return 0;
4777 
4778 	rc = root->fs_info->reloc_ctl;
4779 	rc->merging_rsv_size += rc->nodes_relocated;
4780 
4781 	if (rc->merge_reloc_tree) {
4782 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4783 					      rc->block_rsv,
4784 					      rc->nodes_relocated, 1);
4785 		if (ret)
4786 			return ret;
4787 	}
4788 
4789 	new_root = pending->snap;
4790 	reloc_root = create_reloc_root(trans, root->reloc_root,
4791 				       new_root->root_key.objectid);
4792 	if (IS_ERR(reloc_root))
4793 		return PTR_ERR(reloc_root);
4794 
4795 	ret = __add_reloc_root(reloc_root);
4796 	BUG_ON(ret < 0);
4797 	new_root->reloc_root = reloc_root;
4798 
4799 	if (rc->create_reloc_tree)
4800 		ret = clone_backref_node(trans, rc, root, reloc_root);
4801 	return ret;
4802 }
4803