xref: /linux/fs/btrfs/relocation.c (revision d257f9bf06129613de539ea71ecea60848b662cd)
1 /*
2  * Copyright (C) 2009 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/rbtree.h>
24 #include <linux/slab.h>
25 #include "ctree.h"
26 #include "disk-io.h"
27 #include "transaction.h"
28 #include "volumes.h"
29 #include "locking.h"
30 #include "btrfs_inode.h"
31 #include "async-thread.h"
32 #include "free-space-cache.h"
33 #include "inode-map.h"
34 #include "qgroup.h"
35 
36 /*
37  * backref_node, mapping_node and tree_block start with this
38  */
39 struct tree_entry {
40 	struct rb_node rb_node;
41 	u64 bytenr;
42 };
43 
44 /*
45  * represent a tree block in the backref cache
46  */
47 struct backref_node {
48 	struct rb_node rb_node;
49 	u64 bytenr;
50 
51 	u64 new_bytenr;
52 	/* objectid of the tree block owner; may not be up to date */
53 	u64 owner;
54 	/* link to pending, changed or detached list */
55 	struct list_head list;
56 	/* list of upper level blocks that reference this block */
57 	struct list_head upper;
58 	/* list of child blocks in the cache */
59 	struct list_head lower;
60 	/* NULL if this node is not a tree root */
61 	struct btrfs_root *root;
62 	/* extent buffer obtained by COWing the block */
63 	struct extent_buffer *eb;
64 	/* level of tree block */
65 	unsigned int level:8;
66 	/* is the block in a non-reference-counted tree */
67 	unsigned int cowonly:1;
68 	/* 1 if no child node is in the cache */
69 	unsigned int lowest:1;
70 	/* is the extent buffer locked */
71 	unsigned int locked:1;
72 	/* has the block been processed */
73 	unsigned int processed:1;
74 	/* have backrefs of this block been checked */
75 	unsigned int checked:1;
76 	/*
77 	 * 1 if corresponding block has been cowed but some upper
78 	 * level block pointers may not point to the new location
79 	 */
80 	unsigned int pending:1;
81 	/*
82 	 * 1 if the backref node isn't connected to any other
83 	 * backref node.
84 	 */
85 	unsigned int detached:1;
86 };
87 
88 /*
89  * represent a block pointer in the backref cache
90  */
91 struct backref_edge {
92 	struct list_head list[2];
93 	struct backref_node *node[2];
94 };
95 
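/*
 * indices into backref_edge->node[] and ->list[]: an edge links a child
 * block (LOWER) to one of its parents (UPPER).  edge->list[LOWER] is
 * linked into the child's ->upper list, edge->list[UPPER] into the
 * parent's ->lower list.
 */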
96 #define LOWER	0
97 #define UPPER	1
98 #define RELOCATION_RESERVED_NODES	256
99 
100 struct backref_cache {
101 	/* red black tree of all backref nodes in the cache */
102 	struct rb_root rb_root;
103 	/* for passing backref nodes to btrfs_reloc_cow_block */
104 	struct backref_node *path[BTRFS_MAX_LEVEL];
105 	/*
106 	 * list of blocks that have been cowed but some block
107 	 * pointers in upper level blocks may not reflect the
108 	 * new location
109 	 */
110 	struct list_head pending[BTRFS_MAX_LEVEL];
111 	/* list of backref nodes with no child node */
112 	struct list_head leaves;
113 	/* list of blocks that have been cowed in current transaction */
114 	struct list_head changed;
115 	/* list of detached backref nodes */
116 	struct list_head detached;
117 
118 	u64 last_trans;
119 
120 	int nr_nodes;
121 	int nr_edges;
122 };
123 
124 /*
125  * map the address of a tree root to the corresponding tree
126  */
127 struct mapping_node {
128 	struct rb_node rb_node;
129 	u64 bytenr;
130 	void *data;
131 };
132 
133 struct mapping_tree {
134 	struct rb_root rb_root;
135 	spinlock_t lock;
136 };
137 
138 /*
139  * represent a tree block to process
140  */
141 struct tree_block {
142 	struct rb_node rb_node;
143 	u64 bytenr;
144 	struct btrfs_key key;
145 	unsigned int level:8;
146 	unsigned int key_ready:1;
147 };
148 
149 #define MAX_EXTENTS 128
150 
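/*
 * a batch of contiguous data extents that are relocated together;
 * boundary[] records the start offset of each extent in the batch.
 */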
151 struct file_extent_cluster {
152 	u64 start;
153 	u64 end;
154 	u64 boundary[MAX_EXTENTS];
155 	unsigned int nr;
156 };
157 
158 struct reloc_control {
159 	/* block group to relocate */
160 	struct btrfs_block_group_cache *block_group;
161 	/* extent tree */
162 	struct btrfs_root *extent_root;
163 	/* inode for moving data */
164 	struct inode *data_inode;
165 
166 	struct btrfs_block_rsv *block_rsv;
167 
168 	struct backref_cache backref_cache;
169 
170 	struct file_extent_cluster cluster;
171 	/* tree blocks that have been processed */
172 	struct extent_io_tree processed_blocks;
173 	/* map start of tree root to corresponding reloc tree */
174 	struct mapping_tree reloc_root_tree;
175 	/* list of reloc trees */
176 	struct list_head reloc_roots;
177 	/* size of metadata reservation for merging reloc trees */
178 	u64 merging_rsv_size;
179 	/* size of relocated tree nodes */
180 	u64 nodes_relocated;
181 	/* reserved size for block group relocation */
182 	u64 reserved_bytes;
183 
184 	u64 search_start;
185 	u64 extents_found;
186 
187 	unsigned int stage:8;
188 	unsigned int create_reloc_tree:1;
189 	unsigned int merge_reloc_tree:1;
190 	unsigned int found_file_extent:1;
191 };
192 
193 /* stages of data relocation */
194 #define MOVE_DATA_EXTENTS	0
195 #define UPDATE_DATA_PTRS	1
196 
197 static void remove_backref_node(struct backref_cache *cache,
198 				struct backref_node *node);
199 static void __mark_block_processed(struct reloc_control *rc,
200 				   struct backref_node *node);
201 
202 static void mapping_tree_init(struct mapping_tree *tree)
203 {
204 	tree->rb_root = RB_ROOT;
205 	spin_lock_init(&tree->lock);
206 }
207 
208 static void backref_cache_init(struct backref_cache *cache)
209 {
210 	int i;
211 	cache->rb_root = RB_ROOT;
212 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
213 		INIT_LIST_HEAD(&cache->pending[i]);
214 	INIT_LIST_HEAD(&cache->changed);
215 	INIT_LIST_HEAD(&cache->detached);
216 	INIT_LIST_HEAD(&cache->leaves);
217 }
218 
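/*
 * drop all nodes and edges still tracked by the cache, then verify
 * that every list is empty and the node/edge counters are zero.
 */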
219 static void backref_cache_cleanup(struct backref_cache *cache)
220 {
221 	struct backref_node *node;
222 	int i;
223 
224 	while (!list_empty(&cache->detached)) {
225 		node = list_entry(cache->detached.next,
226 				  struct backref_node, list);
227 		remove_backref_node(cache, node);
228 	}
229 
230 	while (!list_empty(&cache->leaves)) {
231 		node = list_entry(cache->leaves.next,
232 				  struct backref_node, lower);
233 		remove_backref_node(cache, node);
234 	}
235 
236 	cache->last_trans = 0;
237 
238 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
239 		ASSERT(list_empty(&cache->pending[i]));
240 	ASSERT(list_empty(&cache->changed));
241 	ASSERT(list_empty(&cache->detached));
242 	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
243 	ASSERT(!cache->nr_nodes);
244 	ASSERT(!cache->nr_edges);
245 }
246 
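/* allocate a zeroed backref node and account it in the cache */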
247 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
248 {
249 	struct backref_node *node;
250 
251 	node = kzalloc(sizeof(*node), GFP_NOFS);
252 	if (node) {
253 		INIT_LIST_HEAD(&node->list);
254 		INIT_LIST_HEAD(&node->upper);
255 		INIT_LIST_HEAD(&node->lower);
256 		RB_CLEAR_NODE(&node->rb_node);
257 		cache->nr_nodes++;
258 	}
259 	return node;
260 }
261 
262 static void free_backref_node(struct backref_cache *cache,
263 			      struct backref_node *node)
264 {
265 	if (node) {
266 		cache->nr_nodes--;
267 		kfree(node);
268 	}
269 }
270 
271 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
272 {
273 	struct backref_edge *edge;
274 
275 	edge = kzalloc(sizeof(*edge), GFP_NOFS);
276 	if (edge)
277 		cache->nr_edges++;
278 	return edge;
279 }
280 
281 static void free_backref_edge(struct backref_cache *cache,
282 			      struct backref_edge *edge)
283 {
284 	if (edge) {
285 		cache->nr_edges--;
286 		kfree(edge);
287 	}
288 }
289 
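/*
 * insert @node, keyed by @bytenr, into the rb tree of tree_entry
 * structures at @root.  returns the conflicting rb node if an entry
 * with the same bytenr already exists, NULL on success.
 */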
290 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
291 				   struct rb_node *node)
292 {
293 	struct rb_node **p = &root->rb_node;
294 	struct rb_node *parent = NULL;
295 	struct tree_entry *entry;
296 
297 	while (*p) {
298 		parent = *p;
299 		entry = rb_entry(parent, struct tree_entry, rb_node);
300 
301 		if (bytenr < entry->bytenr)
302 			p = &(*p)->rb_left;
303 		else if (bytenr > entry->bytenr)
304 			p = &(*p)->rb_right;
305 		else
306 			return parent;
307 	}
308 
309 	rb_link_node(node, parent, p);
310 	rb_insert_color(node, root);
311 	return NULL;
312 }
313 
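/* find the rb node whose tree_entry matches @bytenr, or NULL */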
314 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
315 {
316 	struct rb_node *n = root->rb_node;
317 	struct tree_entry *entry;
318 
319 	while (n) {
320 		entry = rb_entry(n, struct tree_entry, rb_node);
321 
322 		if (bytenr < entry->bytenr)
323 			n = n->rb_left;
324 		else if (bytenr > entry->bytenr)
325 			n = n->rb_right;
326 		else
327 			return n;
328 	}
329 	return NULL;
330 }
331 
332 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
333 {
334 
335 	struct btrfs_fs_info *fs_info = NULL;
336 	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
337 					      rb_node);
338 	if (bnode->root)
339 		fs_info = bnode->root->fs_info;
340 	btrfs_panic(fs_info, errno,
341 		    "Inconsistency in backref cache found at offset %llu",
342 		    bytenr);
343 }
344 
345 /*
346  * walk up backref nodes until reaching the node that represents the tree root
347  */
348 static struct backref_node *walk_up_backref(struct backref_node *node,
349 					    struct backref_edge *edges[],
350 					    int *index)
351 {
352 	struct backref_edge *edge;
353 	int idx = *index;
354 
355 	while (!list_empty(&node->upper)) {
356 		edge = list_entry(node->upper.next,
357 				  struct backref_edge, list[LOWER]);
358 		edges[idx++] = edge;
359 		node = edge->node[UPPER];
360 	}
361 	BUG_ON(node->detached);
362 	*index = idx;
363 	return node;
364 }
365 
366 /*
367  * walk down the backref nodes to find the start of the next reference path
368  */
369 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
370 					      int *index)
371 {
372 	struct backref_edge *edge;
373 	struct backref_node *lower;
374 	int idx = *index;
375 
376 	while (idx > 0) {
377 		edge = edges[idx - 1];
378 		lower = edge->node[LOWER];
379 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
380 			idx--;
381 			continue;
382 		}
383 		edge = list_entry(edge->list[LOWER].next,
384 				  struct backref_edge, list[LOWER]);
385 		edges[idx - 1] = edge;
386 		*index = idx;
387 		return edge->node[UPPER];
388 	}
389 	*index = 0;
390 	return NULL;
391 }
392 
393 static void unlock_node_buffer(struct backref_node *node)
394 {
395 	if (node->locked) {
396 		btrfs_tree_unlock(node->eb);
397 		node->locked = 0;
398 	}
399 }
400 
401 static void drop_node_buffer(struct backref_node *node)
402 {
403 	if (node->eb) {
404 		unlock_node_buffer(node);
405 		free_extent_buffer(node->eb);
406 		node->eb = NULL;
407 	}
408 }
409 
410 static void drop_backref_node(struct backref_cache *tree,
411 			      struct backref_node *node)
412 {
413 	BUG_ON(!list_empty(&node->upper));
414 
415 	drop_node_buffer(node);
416 	list_del(&node->list);
417 	list_del(&node->lower);
418 	if (!RB_EMPTY_NODE(&node->rb_node))
419 		rb_erase(&node->rb_node, &tree->rb_root);
420 	free_backref_node(tree, node);
421 }
422 
423 /*
424  * remove a backref node from the backref cache
425  */
426 static void remove_backref_node(struct backref_cache *cache,
427 				struct backref_node *node)
428 {
429 	struct backref_node *upper;
430 	struct backref_edge *edge;
431 
432 	if (!node)
433 		return;
434 
435 	BUG_ON(!node->lowest && !node->detached);
436 	while (!list_empty(&node->upper)) {
437 		edge = list_entry(node->upper.next, struct backref_edge,
438 				  list[LOWER]);
439 		upper = edge->node[UPPER];
440 		list_del(&edge->list[LOWER]);
441 		list_del(&edge->list[UPPER]);
442 		free_backref_edge(cache, edge);
443 
444 		if (RB_EMPTY_NODE(&upper->rb_node)) {
445 			BUG_ON(!list_empty(&node->upper));
446 			drop_backref_node(cache, node);
447 			node = upper;
448 			node->lowest = 1;
449 			continue;
450 		}
451 		/*
452 		 * add the node to the leaf node list if no other
453 		 * child block is cached.
454 		 */
455 		if (list_empty(&upper->lower)) {
456 			list_add_tail(&upper->lower, &cache->leaves);
457 			upper->lowest = 1;
458 		}
459 	}
460 
461 	drop_backref_node(cache, node);
462 }
463 
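/*
 * re-key a cached backref node after its block has moved: remove it
 * from the rb tree, update node->bytenr and re-insert it.
 */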
464 static void update_backref_node(struct backref_cache *cache,
465 				struct backref_node *node, u64 bytenr)
466 {
467 	struct rb_node *rb_node;
468 	rb_erase(&node->rb_node, &cache->rb_root);
469 	node->bytenr = bytenr;
470 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
471 	if (rb_node)
472 		backref_tree_panic(rb_node, -EEXIST, bytenr);
473 }
474 
475 /*
476  * update backref cache after a transaction commit
477  */
478 static int update_backref_cache(struct btrfs_trans_handle *trans,
479 				struct backref_cache *cache)
480 {
481 	struct backref_node *node;
482 	int level = 0;
483 
484 	if (cache->last_trans == 0) {
485 		cache->last_trans = trans->transid;
486 		return 0;
487 	}
488 
489 	if (cache->last_trans == trans->transid)
490 		return 0;
491 
492 	/*
493 	 * detached nodes are used to avoid unnecessary backref
494 	 * lookups. a transaction commit changes the extent tree,
495 	 * so the detached nodes are no longer useful.
496 	 */
497 	while (!list_empty(&cache->detached)) {
498 		node = list_entry(cache->detached.next,
499 				  struct backref_node, list);
500 		remove_backref_node(cache, node);
501 	}
502 
503 	while (!list_empty(&cache->changed)) {
504 		node = list_entry(cache->changed.next,
505 				  struct backref_node, list);
506 		list_del_init(&node->list);
507 		BUG_ON(node->pending);
508 		update_backref_node(cache, node, node->new_bytenr);
509 	}
510 
511 	/*
512 	 * some nodes can be left in the pending list if there were
513 	 * errors while processing the pending nodes.
514 	 */
515 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
516 		list_for_each_entry(node, &cache->pending[level], list) {
517 			BUG_ON(!node->pending);
518 			if (node->bytenr == node->new_bytenr)
519 				continue;
520 			update_backref_node(cache, node, node->new_bytenr);
521 		}
522 	}
523 
524 	cache->last_trans = 0;
525 	return 1;
526 }
527 
528 
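/*
 * return 1 if the fs root already has a reloc tree that was created in
 * an earlier transaction: backref lookup can then find the reloc tree
 * itself, so a backref node for the fs tree root is useless.
 */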
529 static int should_ignore_root(struct btrfs_root *root)
530 {
531 	struct btrfs_root *reloc_root;
532 
533 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
534 		return 0;
535 
536 	reloc_root = root->reloc_root;
537 	if (!reloc_root)
538 		return 0;
539 
540 	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
541 	    root->fs_info->running_transaction->transid - 1)
542 		return 0;
543 	/*
544 	 * if there is a reloc tree and it was created in a previous
545 	 * transaction, backref lookup can find the reloc tree, so the
546 	 * backref node for the fs tree root is useless for
547 	 * relocation.
548 	 */
549 	return 1;
550 }
551 /*
552  * find the reloc tree by the address of its tree root
553  */
554 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
555 					  u64 bytenr)
556 {
557 	struct rb_node *rb_node;
558 	struct mapping_node *node;
559 	struct btrfs_root *root = NULL;
560 
561 	spin_lock(&rc->reloc_root_tree.lock);
562 	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
563 	if (rb_node) {
564 		node = rb_entry(rb_node, struct mapping_node, rb_node);
565 		root = (struct btrfs_root *)node->data;
566 	}
567 	spin_unlock(&rc->reloc_root_tree.lock);
568 	return root;
569 }
570 
571 static int is_cowonly_root(u64 root_objectid)
572 {
573 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
574 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
575 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
576 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
577 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
578 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
579 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
580 	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
581 	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
582 		return 1;
583 	return 0;
584 }
585 
586 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
587 					u64 root_objectid)
588 {
589 	struct btrfs_key key;
590 
591 	key.objectid = root_objectid;
592 	key.type = BTRFS_ROOT_ITEM_KEY;
593 	if (is_cowonly_root(root_objectid))
594 		key.offset = 0;
595 	else
596 		key.offset = (u64)-1;
597 
598 	return btrfs_get_fs_root(fs_info, &key, false);
599 }
600 
601 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
602 static noinline_for_stack
603 struct btrfs_root *find_tree_root(struct reloc_control *rc,
604 				  struct extent_buffer *leaf,
605 				  struct btrfs_extent_ref_v0 *ref0)
606 {
607 	struct btrfs_root *root;
608 	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
609 	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
610 
611 	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
612 
613 	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
614 	BUG_ON(IS_ERR(root));
615 
616 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
617 	    generation != btrfs_root_generation(&root->root_item))
618 		return NULL;
619 
620 	return root;
621 }
622 #endif
623 
624 static noinline_for_stack
625 int find_inline_backref(struct extent_buffer *leaf, int slot,
626 			unsigned long *ptr, unsigned long *end)
627 {
628 	struct btrfs_key key;
629 	struct btrfs_extent_item *ei;
630 	struct btrfs_tree_block_info *bi;
631 	u32 item_size;
632 
633 	btrfs_item_key_to_cpu(leaf, &key, slot);
634 
635 	item_size = btrfs_item_size_nr(leaf, slot);
636 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
637 	if (item_size < sizeof(*ei)) {
638 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
639 		return 1;
640 	}
641 #endif
642 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
643 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
644 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
645 
646 	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
647 	    item_size <= sizeof(*ei) + sizeof(*bi)) {
648 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
649 		return 1;
650 	}
651 	if (key.type == BTRFS_METADATA_ITEM_KEY &&
652 	    item_size <= sizeof(*ei)) {
653 		WARN_ON(item_size < sizeof(*ei));
654 		return 1;
655 	}
656 
657 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
658 		bi = (struct btrfs_tree_block_info *)(ei + 1);
659 		*ptr = (unsigned long)(bi + 1);
660 	} else {
661 		*ptr = (unsigned long)(ei + 1);
662 	}
663 	*end = (unsigned long)ei + item_size;
664 	return 0;
665 }
666 
667 /*
668  * build a backref tree for a given tree block. the root of the backref
669  * tree corresponds to the tree block, the leaves of the backref tree
670  * correspond to roots of b-trees that reference the tree block.
671  *
672  * the basic idea of this function is to check backrefs of a given block
673  * to find upper level blocks that reference the block, and then check
674  * backrefs of these upper level blocks recursively. the recursion stops
675  * when a tree root is reached or backrefs for the block are cached.
676  *
677  * NOTE: if we find backrefs for a block are cached, we know backrefs
678  * for all upper level blocks that directly/indirectly reference the
679  * block are also cached.
680  */
681 static noinline_for_stack
682 struct backref_node *build_backref_tree(struct reloc_control *rc,
683 					struct btrfs_key *node_key,
684 					int level, u64 bytenr)
685 {
686 	struct backref_cache *cache = &rc->backref_cache;
687 	struct btrfs_path *path1;
688 	struct btrfs_path *path2;
689 	struct extent_buffer *eb;
690 	struct btrfs_root *root;
691 	struct backref_node *cur;
692 	struct backref_node *upper;
693 	struct backref_node *lower;
694 	struct backref_node *node = NULL;
695 	struct backref_node *exist = NULL;
696 	struct backref_edge *edge;
697 	struct rb_node *rb_node;
698 	struct btrfs_key key;
699 	unsigned long end;
700 	unsigned long ptr;
701 	LIST_HEAD(list);
702 	LIST_HEAD(useless);
703 	int cowonly;
704 	int ret;
705 	int err = 0;
706 	bool need_check = true;
707 
708 	path1 = btrfs_alloc_path();
709 	path2 = btrfs_alloc_path();
710 	if (!path1 || !path2) {
711 		err = -ENOMEM;
712 		goto out;
713 	}
714 	path1->reada = READA_FORWARD;
715 	path2->reada = READA_FORWARD;
716 
717 	node = alloc_backref_node(cache);
718 	if (!node) {
719 		err = -ENOMEM;
720 		goto out;
721 	}
722 
723 	node->bytenr = bytenr;
724 	node->level = level;
725 	node->lowest = 1;
726 	cur = node;
727 again:
728 	end = 0;
729 	ptr = 0;
730 	key.objectid = cur->bytenr;
731 	key.type = BTRFS_METADATA_ITEM_KEY;
732 	key.offset = (u64)-1;
733 
734 	path1->search_commit_root = 1;
735 	path1->skip_locking = 1;
736 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
737 				0, 0);
738 	if (ret < 0) {
739 		err = ret;
740 		goto out;
741 	}
742 	ASSERT(ret);
743 	ASSERT(path1->slots[0]);
744 
745 	path1->slots[0]--;
746 
747 	WARN_ON(cur->checked);
748 	if (!list_empty(&cur->upper)) {
749 		/*
750 		 * the backref was added previously when processing
751 		 * a backref of type BTRFS_TREE_BLOCK_REF_KEY
752 		 */
753 		ASSERT(list_is_singular(&cur->upper));
754 		edge = list_entry(cur->upper.next, struct backref_edge,
755 				  list[LOWER]);
756 		ASSERT(list_empty(&edge->list[UPPER]));
757 		exist = edge->node[UPPER];
758 		/*
759 		 * add the upper level block to the pending list if we
760 		 * need to check its backrefs
761 		 */
762 		if (!exist->checked)
763 			list_add_tail(&edge->list[UPPER], &list);
764 	} else {
765 		exist = NULL;
766 	}
767 
768 	while (1) {
769 		cond_resched();
770 		eb = path1->nodes[0];
771 
772 		if (ptr >= end) {
773 			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
774 				ret = btrfs_next_leaf(rc->extent_root, path1);
775 				if (ret < 0) {
776 					err = ret;
777 					goto out;
778 				}
779 				if (ret > 0)
780 					break;
781 				eb = path1->nodes[0];
782 			}
783 
784 			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
785 			if (key.objectid != cur->bytenr) {
786 				WARN_ON(exist);
787 				break;
788 			}
789 
790 			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
791 			    key.type == BTRFS_METADATA_ITEM_KEY) {
792 				ret = find_inline_backref(eb, path1->slots[0],
793 							  &ptr, &end);
794 				if (ret)
795 					goto next;
796 			}
797 		}
798 
799 		if (ptr < end) {
800 			/* update key for inline back ref */
801 			struct btrfs_extent_inline_ref *iref;
802 			iref = (struct btrfs_extent_inline_ref *)ptr;
803 			key.type = btrfs_extent_inline_ref_type(eb, iref);
804 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
805 			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
806 				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
807 		}
808 
809 		if (exist &&
810 		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
811 		      exist->owner == key.offset) ||
812 		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
813 		      exist->bytenr == key.offset))) {
814 			exist = NULL;
815 			goto next;
816 		}
817 
818 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
819 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
820 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
821 			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
822 				struct btrfs_extent_ref_v0 *ref0;
823 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
824 						struct btrfs_extent_ref_v0);
825 				if (key.objectid == key.offset) {
826 					root = find_tree_root(rc, eb, ref0);
827 					if (root && !should_ignore_root(root))
828 						cur->root = root;
829 					else
830 						list_add(&cur->list, &useless);
831 					break;
832 				}
833 				if (is_cowonly_root(btrfs_ref_root_v0(eb,
834 								      ref0)))
835 					cur->cowonly = 1;
836 			}
837 #else
838 		ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
839 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
840 #endif
841 			if (key.objectid == key.offset) {
842 				/*
843 				 * only root blocks of reloc trees use
844 				 * backrefs of this type.
845 				 */
846 				root = find_reloc_root(rc, cur->bytenr);
847 				ASSERT(root);
848 				cur->root = root;
849 				break;
850 			}
851 
852 			edge = alloc_backref_edge(cache);
853 			if (!edge) {
854 				err = -ENOMEM;
855 				goto out;
856 			}
857 			rb_node = tree_search(&cache->rb_root, key.offset);
858 			if (!rb_node) {
859 				upper = alloc_backref_node(cache);
860 				if (!upper) {
861 					free_backref_edge(cache, edge);
862 					err = -ENOMEM;
863 					goto out;
864 				}
865 				upper->bytenr = key.offset;
866 				upper->level = cur->level + 1;
867 				/*
868 				 * backrefs for the upper level block aren't
869 				 * cached; add the block to the pending list
870 				 */
871 				list_add_tail(&edge->list[UPPER], &list);
872 			} else {
873 				upper = rb_entry(rb_node, struct backref_node,
874 						 rb_node);
875 				ASSERT(upper->checked);
876 				INIT_LIST_HEAD(&edge->list[UPPER]);
877 			}
878 			list_add_tail(&edge->list[LOWER], &cur->upper);
879 			edge->node[LOWER] = cur;
880 			edge->node[UPPER] = upper;
881 
882 			goto next;
883 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
884 			goto next;
885 		}
886 
887 		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
888 		root = read_fs_root(rc->extent_root->fs_info, key.offset);
889 		if (IS_ERR(root)) {
890 			err = PTR_ERR(root);
891 			goto out;
892 		}
893 
894 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
895 			cur->cowonly = 1;
896 
897 		if (btrfs_root_level(&root->root_item) == cur->level) {
898 			/* tree root */
899 			ASSERT(btrfs_root_bytenr(&root->root_item) ==
900 			       cur->bytenr);
901 			if (should_ignore_root(root))
902 				list_add(&cur->list, &useless);
903 			else
904 				cur->root = root;
905 			break;
906 		}
907 
908 		level = cur->level + 1;
909 
910 		/*
911 		 * search the tree to find upper level blocks that
912 		 * reference the block.
913 		 */
914 		path2->search_commit_root = 1;
915 		path2->skip_locking = 1;
916 		path2->lowest_level = level;
917 		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
918 		path2->lowest_level = 0;
919 		if (ret < 0) {
920 			err = ret;
921 			goto out;
922 		}
923 		if (ret > 0 && path2->slots[level] > 0)
924 			path2->slots[level]--;
925 
926 		eb = path2->nodes[level];
927 		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
928 		    cur->bytenr) {
929 			btrfs_err(root->fs_info,
930 	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
931 				  cur->bytenr, level - 1, root->objectid,
932 				  node_key->objectid, node_key->type,
933 				  node_key->offset);
934 			err = -ENOENT;
935 			goto out;
936 		}
937 		lower = cur;
938 		need_check = true;
939 		for (; level < BTRFS_MAX_LEVEL; level++) {
940 			if (!path2->nodes[level]) {
941 				ASSERT(btrfs_root_bytenr(&root->root_item) ==
942 				       lower->bytenr);
943 				if (should_ignore_root(root))
944 					list_add(&lower->list, &useless);
945 				else
946 					lower->root = root;
947 				break;
948 			}
949 
950 			edge = alloc_backref_edge(cache);
951 			if (!edge) {
952 				err = -ENOMEM;
953 				goto out;
954 			}
955 
956 			eb = path2->nodes[level];
957 			rb_node = tree_search(&cache->rb_root, eb->start);
958 			if (!rb_node) {
959 				upper = alloc_backref_node(cache);
960 				if (!upper) {
961 					free_backref_edge(cache, edge);
962 					err = -ENOMEM;
963 					goto out;
964 				}
965 				upper->bytenr = eb->start;
966 				upper->owner = btrfs_header_owner(eb);
967 				upper->level = lower->level + 1;
968 				if (!test_bit(BTRFS_ROOT_REF_COWS,
969 					      &root->state))
970 					upper->cowonly = 1;
971 
972 				/*
973 				 * if we know the block isn't shared
974 				 * we can avoid checking its backrefs.
975 				 */
976 				if (btrfs_block_can_be_shared(root, eb))
977 					upper->checked = 0;
978 				else
979 					upper->checked = 1;
980 
981 				/*
982 				 * add the block to the pending list if we
983 				 * need to check its backrefs. we only do this
984 				 * once while walking up a tree as we will catch
985 				 * anything else later on.
986 				 */
987 				if (!upper->checked && need_check) {
988 					need_check = false;
989 					list_add_tail(&edge->list[UPPER],
990 						      &list);
991 				} else {
992 					if (upper->checked)
993 						need_check = true;
994 					INIT_LIST_HEAD(&edge->list[UPPER]);
995 				}
996 			} else {
997 				upper = rb_entry(rb_node, struct backref_node,
998 						 rb_node);
999 				ASSERT(upper->checked);
1000 				INIT_LIST_HEAD(&edge->list[UPPER]);
1001 				if (!upper->owner)
1002 					upper->owner = btrfs_header_owner(eb);
1003 			}
1004 			list_add_tail(&edge->list[LOWER], &lower->upper);
1005 			edge->node[LOWER] = lower;
1006 			edge->node[UPPER] = upper;
1007 
1008 			if (rb_node)
1009 				break;
1010 			lower = upper;
1011 			upper = NULL;
1012 		}
1013 		btrfs_release_path(path2);
1014 next:
1015 		if (ptr < end) {
1016 			ptr += btrfs_extent_inline_ref_size(key.type);
1017 			if (ptr >= end) {
1018 				WARN_ON(ptr > end);
1019 				ptr = 0;
1020 				end = 0;
1021 			}
1022 		}
1023 		if (ptr >= end)
1024 			path1->slots[0]++;
1025 	}
1026 	btrfs_release_path(path1);
1027 
1028 	cur->checked = 1;
1029 	WARN_ON(exist);
1030 
1031 	/* the pending list isn't empty, take the first block to process */
1032 	if (!list_empty(&list)) {
1033 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034 		list_del_init(&edge->list[UPPER]);
1035 		cur = edge->node[UPPER];
1036 		goto again;
1037 	}
1038 
1039 	/*
1040 	 * everything went well; connect the backref nodes and insert them
1041 	 * into the cache.
1042 	 */
1043 	ASSERT(node->checked);
1044 	cowonly = node->cowonly;
1045 	if (!cowonly) {
1046 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
1047 				      &node->rb_node);
1048 		if (rb_node)
1049 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1050 		list_add_tail(&node->lower, &cache->leaves);
1051 	}
1052 
1053 	list_for_each_entry(edge, &node->upper, list[LOWER])
1054 		list_add_tail(&edge->list[UPPER], &list);
1055 
1056 	while (!list_empty(&list)) {
1057 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1058 		list_del_init(&edge->list[UPPER]);
1059 		upper = edge->node[UPPER];
1060 		if (upper->detached) {
1061 			list_del(&edge->list[LOWER]);
1062 			lower = edge->node[LOWER];
1063 			free_backref_edge(cache, edge);
1064 			if (list_empty(&lower->upper))
1065 				list_add(&lower->list, &useless);
1066 			continue;
1067 		}
1068 
1069 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
1070 			if (upper->lowest) {
1071 				list_del_init(&upper->lower);
1072 				upper->lowest = 0;
1073 			}
1074 
1075 			list_add_tail(&edge->list[UPPER], &upper->lower);
1076 			continue;
1077 		}
1078 
1079 		if (!upper->checked) {
1080 			/*
1081 			 * Still want to blow up for developers since this is a
1082 			 * logic bug.
1083 			 */
1084 			ASSERT(0);
1085 			err = -EINVAL;
1086 			goto out;
1087 		}
1088 		if (cowonly != upper->cowonly) {
1089 			ASSERT(0);
1090 			err = -EINVAL;
1091 			goto out;
1092 		}
1093 
1094 		if (!cowonly) {
1095 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1096 					      &upper->rb_node);
1097 			if (rb_node)
1098 				backref_tree_panic(rb_node, -EEXIST,
1099 						   upper->bytenr);
1100 		}
1101 
1102 		list_add_tail(&edge->list[UPPER], &upper->lower);
1103 
1104 		list_for_each_entry(edge, &upper->upper, list[LOWER])
1105 			list_add_tail(&edge->list[UPPER], &list);
1106 	}
1107 	/*
1108 	 * process useless backref nodes. backref nodes for tree leaves
1109 	 * are deleted from the cache. backref nodes for upper level
1110 	 * tree blocks are left in the cache to avoid unnecessary backref
1111 	 * lookups.
1112 	 */
1113 	while (!list_empty(&useless)) {
1114 		upper = list_entry(useless.next, struct backref_node, list);
1115 		list_del_init(&upper->list);
1116 		ASSERT(list_empty(&upper->upper));
1117 		if (upper == node)
1118 			node = NULL;
1119 		if (upper->lowest) {
1120 			list_del_init(&upper->lower);
1121 			upper->lowest = 0;
1122 		}
1123 		while (!list_empty(&upper->lower)) {
1124 			edge = list_entry(upper->lower.next,
1125 					  struct backref_edge, list[UPPER]);
1126 			list_del(&edge->list[UPPER]);
1127 			list_del(&edge->list[LOWER]);
1128 			lower = edge->node[LOWER];
1129 			free_backref_edge(cache, edge);
1130 
1131 			if (list_empty(&lower->upper))
1132 				list_add(&lower->list, &useless);
1133 		}
1134 		__mark_block_processed(rc, upper);
1135 		if (upper->level > 0) {
1136 			list_add(&upper->list, &cache->detached);
1137 			upper->detached = 1;
1138 		} else {
1139 			rb_erase(&upper->rb_node, &cache->rb_root);
1140 			free_backref_node(cache, upper);
1141 		}
1142 	}
1143 out:
1144 	btrfs_free_path(path1);
1145 	btrfs_free_path(path2);
1146 	if (err) {
1147 		while (!list_empty(&useless)) {
1148 			lower = list_entry(useless.next,
1149 					   struct backref_node, list);
1150 			list_del_init(&lower->list);
1151 		}
1152 		while (!list_empty(&list)) {
1153 			edge = list_first_entry(&list, struct backref_edge,
1154 						list[UPPER]);
1155 			list_del(&edge->list[UPPER]);
1156 			list_del(&edge->list[LOWER]);
1157 			lower = edge->node[LOWER];
1158 			upper = edge->node[UPPER];
1159 			free_backref_edge(cache, edge);
1160 
1161 			/*
1162 			 * Lower is no longer linked to any upper backref nodes
1163 			 * and isn't in the cache; we can free it ourselves.
1164 			 */
1165 			if (list_empty(&lower->upper) &&
1166 			    RB_EMPTY_NODE(&lower->rb_node))
1167 				list_add(&lower->list, &useless);
1168 
1169 			if (!RB_EMPTY_NODE(&upper->rb_node))
1170 				continue;
1171 
1172 			/* Add this guy's upper edges to the list to process */
1173 			list_for_each_entry(edge, &upper->upper, list[LOWER])
1174 				list_add_tail(&edge->list[UPPER], &list);
1175 			if (list_empty(&upper->upper))
1176 				list_add(&upper->list, &useless);
1177 		}
1178 
1179 		while (!list_empty(&useless)) {
1180 			lower = list_entry(useless.next,
1181 					   struct backref_node, list);
1182 			list_del_init(&lower->list);
1183 			if (lower == node)
1184 				node = NULL;
1185 			free_backref_node(cache, lower);
1186 		}
1187 
1188 		free_backref_node(cache, node);
1189 		return ERR_PTR(err);
1190 	}
1191 	ASSERT(!node || !node->detached);
1192 	return node;
1193 }
1194 
1195 /*
1196  * helper to add a backref node for the newly created snapshot.
1197  * the backref node is created by cloning the backref node that
1198  * corresponds to the root of the source tree
1199  */
1200 static int clone_backref_node(struct btrfs_trans_handle *trans,
1201 			      struct reloc_control *rc,
1202 			      struct btrfs_root *src,
1203 			      struct btrfs_root *dest)
1204 {
1205 	struct btrfs_root *reloc_root = src->reloc_root;
1206 	struct backref_cache *cache = &rc->backref_cache;
1207 	struct backref_node *node = NULL;
1208 	struct backref_node *new_node;
1209 	struct backref_edge *edge;
1210 	struct backref_edge *new_edge;
1211 	struct rb_node *rb_node;
1212 
1213 	if (cache->last_trans > 0)
1214 		update_backref_cache(trans, cache);
1215 
1216 	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1217 	if (rb_node) {
1218 		node = rb_entry(rb_node, struct backref_node, rb_node);
1219 		if (node->detached)
1220 			node = NULL;
1221 		else
1222 			BUG_ON(node->new_bytenr != reloc_root->node->start);
1223 	}
1224 
1225 	if (!node) {
1226 		rb_node = tree_search(&cache->rb_root,
1227 				      reloc_root->commit_root->start);
1228 		if (rb_node) {
1229 			node = rb_entry(rb_node, struct backref_node,
1230 					rb_node);
1231 			BUG_ON(node->detached);
1232 		}
1233 	}
1234 
1235 	if (!node)
1236 		return 0;
1237 
1238 	new_node = alloc_backref_node(cache);
1239 	if (!new_node)
1240 		return -ENOMEM;
1241 
1242 	new_node->bytenr = dest->node->start;
1243 	new_node->level = node->level;
1244 	new_node->lowest = node->lowest;
1245 	new_node->checked = 1;
1246 	new_node->root = dest;
1247 
1248 	if (!node->lowest) {
1249 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
1250 			new_edge = alloc_backref_edge(cache);
1251 			if (!new_edge)
1252 				goto fail;
1253 
1254 			new_edge->node[UPPER] = new_node;
1255 			new_edge->node[LOWER] = edge->node[LOWER];
1256 			list_add_tail(&new_edge->list[UPPER],
1257 				      &new_node->lower);
1258 		}
1259 	} else {
1260 		list_add_tail(&new_node->lower, &cache->leaves);
1261 	}
1262 
1263 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1264 			      &new_node->rb_node);
1265 	if (rb_node)
1266 		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1267 
1268 	if (!new_node->lowest) {
1269 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1270 			list_add_tail(&new_edge->list[LOWER],
1271 				      &new_edge->node[LOWER]->upper);
1272 		}
1273 	}
1274 	return 0;
1275 fail:
1276 	while (!list_empty(&new_node->lower)) {
1277 		new_edge = list_entry(new_node->lower.next,
1278 				      struct backref_edge, list[UPPER]);
1279 		list_del(&new_edge->list[UPPER]);
1280 		free_backref_edge(cache, new_edge);
1281 	}
1282 	free_backref_node(cache, new_node);
1283 	return -ENOMEM;
1284 }
1285 
1286 /*
1287  * helper to add 'address of tree root -> reloc tree' mapping
1288  */
1289 static int __must_check __add_reloc_root(struct btrfs_root *root)
1290 {
1291 	struct btrfs_fs_info *fs_info = root->fs_info;
1292 	struct rb_node *rb_node;
1293 	struct mapping_node *node;
1294 	struct reloc_control *rc = fs_info->reloc_ctl;
1295 
1296 	node = kmalloc(sizeof(*node), GFP_NOFS);
1297 	if (!node)
1298 		return -ENOMEM;
1299 
1300 	node->bytenr = root->node->start;
1301 	node->data = root;
1302 
1303 	spin_lock(&rc->reloc_root_tree.lock);
1304 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1305 			      node->bytenr, &node->rb_node);
1306 	spin_unlock(&rc->reloc_root_tree.lock);
1307 	if (rb_node) {
1308 		btrfs_panic(fs_info, -EEXIST,
1309 			    "Duplicate root found for start=%llu while inserting into relocation tree",
1310 			    node->bytenr);
1311 		kfree(node);
1312 		return -EEXIST;
1313 	}
1314 
1315 	list_add_tail(&root->root_list, &rc->reloc_roots);
1316 	return 0;
1317 }
1318 
1319 /*
1320  * helper to delete the 'address of tree root -> reloc tree'
1321  * mapping
1322  */
1323 static void __del_reloc_root(struct btrfs_root *root)
1324 {
1325 	struct btrfs_fs_info *fs_info = root->fs_info;
1326 	struct rb_node *rb_node;
1327 	struct mapping_node *node = NULL;
1328 	struct reloc_control *rc = fs_info->reloc_ctl;
1329 
1330 	spin_lock(&rc->reloc_root_tree.lock);
1331 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1332 			      root->node->start);
1333 	if (rb_node) {
1334 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1335 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1336 	}
1337 	spin_unlock(&rc->reloc_root_tree.lock);
1338 
1339 	if (!node)
1340 		return;
1341 	BUG_ON((struct btrfs_root *)node->data != root);
1342 
1343 	spin_lock(&fs_info->trans_lock);
1344 	list_del_init(&root->root_list);
1345 	spin_unlock(&fs_info->trans_lock);
1346 	kfree(node);
1347 }
1348 
1349 /*
1350  * helper to update the 'address of tree root -> reloc tree'
1351  * mapping
1352  */
1353 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1354 {
1355 	struct btrfs_fs_info *fs_info = root->fs_info;
1356 	struct rb_node *rb_node;
1357 	struct mapping_node *node = NULL;
1358 	struct reloc_control *rc = fs_info->reloc_ctl;
1359 
1360 	spin_lock(&rc->reloc_root_tree.lock);
1361 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1362 			      root->node->start);
1363 	if (rb_node) {
1364 		node = rb_entry(rb_node, struct mapping_node, rb_node);
1365 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1366 	}
1367 	spin_unlock(&rc->reloc_root_tree.lock);
1368 
1369 	if (!node)
1370 		return 0;
1371 	BUG_ON((struct btrfs_root *)node->data != root);
1372 
1373 	spin_lock(&rc->reloc_root_tree.lock);
1374 	node->bytenr = new_bytenr;
1375 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1376 			      node->bytenr, &node->rb_node);
1377 	spin_unlock(&rc->reloc_root_tree.lock);
1378 	if (rb_node)
1379 		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1380 	return 0;
1381 }
1382 
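/*
 * create the root item for a reloc tree by COWing either the commit
 * root (when called for the fs root itself via btrfs_init_reloc_root)
 * or the current root node (when snapshotting a reloc tree), insert it
 * into the tree of tree roots and read the new root back.
 */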
1383 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1384 					struct btrfs_root *root, u64 objectid)
1385 {
1386 	struct btrfs_fs_info *fs_info = root->fs_info;
1387 	struct btrfs_root *reloc_root;
1388 	struct extent_buffer *eb;
1389 	struct btrfs_root_item *root_item;
1390 	struct btrfs_key root_key;
1391 	int ret;
1392 
1393 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1394 	BUG_ON(!root_item);
1395 
1396 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1397 	root_key.type = BTRFS_ROOT_ITEM_KEY;
1398 	root_key.offset = objectid;
1399 
1400 	if (root->root_key.objectid == objectid) {
1401 		u64 commit_root_gen;
1402 
1403 		/* called by btrfs_init_reloc_root */
1404 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1405 				      BTRFS_TREE_RELOC_OBJECTID);
1406 		BUG_ON(ret);
1407 		/*
1408 		 * Set the last_snapshot field to the generation of the commit
1409 		 * root, so that ctree.c:btrfs_block_can_be_shared() behaves
1410 		 * correctly (returns true) when the relocation root is created
1411 		 * either inside the critical section of a transaction commit
1412 		 * (through transaction.c:qgroup_account_snapshot()) or when
1413 		 * it's created before the transaction commit is started.
1414 		 */
1415 		commit_root_gen = btrfs_header_generation(root->commit_root);
1416 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1417 	} else {
1418 		/*
1419 		 * called by btrfs_reloc_post_snapshot_hook.
1420 		 * the source tree is a reloc tree; all tree blocks
1421 		 * modified after it was created have the RELOC flag
1422 		 * set in their headers, so it's OK not to update
1423 		 * the 'last_snapshot'.
1424 		 */
1425 		ret = btrfs_copy_root(trans, root, root->node, &eb,
1426 				      BTRFS_TREE_RELOC_OBJECTID);
1427 		BUG_ON(ret);
1428 	}
1429 
1430 	memcpy(root_item, &root->root_item, sizeof(*root_item));
1431 	btrfs_set_root_bytenr(root_item, eb->start);
1432 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
1433 	btrfs_set_root_generation(root_item, trans->transid);
1434 
1435 	if (root->root_key.objectid == objectid) {
1436 		btrfs_set_root_refs(root_item, 0);
1437 		memset(&root_item->drop_progress, 0,
1438 		       sizeof(struct btrfs_disk_key));
1439 		root_item->drop_level = 0;
1440 	}
1441 
1442 	btrfs_tree_unlock(eb);
1443 	free_extent_buffer(eb);
1444 
1445 	ret = btrfs_insert_root(trans, fs_info->tree_root,
1446 				&root_key, root_item);
1447 	BUG_ON(ret);
1448 	kfree(root_item);
1449 
1450 	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1451 	BUG_ON(IS_ERR(reloc_root));
1452 	reloc_root->last_trans = trans->transid;
1453 	return reloc_root;
1454 }
1455 
1456 /*
1457  * create a reloc tree for a given fs tree. the reloc tree is just a
1458  * snapshot of the fs tree with a special root objectid.
1459  */
1460 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1461 			  struct btrfs_root *root)
1462 {
1463 	struct btrfs_fs_info *fs_info = root->fs_info;
1464 	struct btrfs_root *reloc_root;
1465 	struct reloc_control *rc = fs_info->reloc_ctl;
1466 	struct btrfs_block_rsv *rsv;
1467 	int clear_rsv = 0;
1468 	int ret;
1469 
1470 	if (root->reloc_root) {
1471 		reloc_root = root->reloc_root;
1472 		reloc_root->last_trans = trans->transid;
1473 		return 0;
1474 	}
1475 
1476 	if (!rc || !rc->create_reloc_tree ||
1477 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1478 		return 0;
1479 
1480 	if (!trans->reloc_reserved) {
1481 		rsv = trans->block_rsv;
1482 		trans->block_rsv = rc->block_rsv;
1483 		clear_rsv = 1;
1484 	}
1485 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1486 	if (clear_rsv)
1487 		trans->block_rsv = rsv;
1488 
1489 	ret = __add_reloc_root(reloc_root);
1490 	BUG_ON(ret < 0);
1491 	root->reloc_root = reloc_root;
1492 	return 0;
1493 }
1494 
1495 /*
1496  * update root item of reloc tree
1497  */
1498 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1499 			    struct btrfs_root *root)
1500 {
1501 	struct btrfs_fs_info *fs_info = root->fs_info;
1502 	struct btrfs_root *reloc_root;
1503 	struct btrfs_root_item *root_item;
1504 	int ret;
1505 
1506 	if (!root->reloc_root)
1507 		goto out;
1508 
1509 	reloc_root = root->reloc_root;
1510 	root_item = &reloc_root->root_item;
1511 
1512 	if (fs_info->reloc_ctl->merge_reloc_tree &&
1513 	    btrfs_root_refs(root_item) == 0) {
1514 		root->reloc_root = NULL;
1515 		__del_reloc_root(reloc_root);
1516 	}
1517 
1518 	if (reloc_root->commit_root != reloc_root->node) {
1519 		btrfs_set_root_node(root_item, reloc_root->node);
1520 		free_extent_buffer(reloc_root->commit_root);
1521 		reloc_root->commit_root = btrfs_root_node(reloc_root);
1522 	}
1523 
1524 	ret = btrfs_update_root(trans, fs_info->tree_root,
1525 				&reloc_root->root_key, root_item);
1526 	BUG_ON(ret);
1527 
1528 out:
1529 	return 0;
1530 }
1531 
1532 /*
1533  * helper to find first cached inode with inode number >= objectid
1534  * in a subvolume
1535  */
1536 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1537 {
1538 	struct rb_node *node;
1539 	struct rb_node *prev;
1540 	struct btrfs_inode *entry;
1541 	struct inode *inode;
1542 
1543 	spin_lock(&root->inode_lock);
1544 again:
1545 	node = root->inode_tree.rb_node;
1546 	prev = NULL;
1547 	while (node) {
1548 		prev = node;
1549 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1550 
1551 		if (objectid < btrfs_ino(entry))
1552 			node = node->rb_left;
1553 		else if (objectid > btrfs_ino(entry))
1554 			node = node->rb_right;
1555 		else
1556 			break;
1557 	}
1558 	if (!node) {
1559 		while (prev) {
1560 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
1561 			if (objectid <= btrfs_ino(entry)) {
1562 				node = prev;
1563 				break;
1564 			}
1565 			prev = rb_next(prev);
1566 		}
1567 	}
1568 	while (node) {
1569 		entry = rb_entry(node, struct btrfs_inode, rb_node);
1570 		inode = igrab(&entry->vfs_inode);
1571 		if (inode) {
1572 			spin_unlock(&root->inode_lock);
1573 			return inode;
1574 		}
1575 
1576 		objectid = btrfs_ino(entry) + 1;
1577 		if (cond_resched_lock(&root->inode_lock))
1578 			goto again;
1579 
1580 		node = rb_next(node);
1581 	}
1582 	spin_unlock(&root->inode_lock);
1583 	return NULL;
1584 }
1585 
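/* return 1 if @bytenr lies inside the given block group */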
1586 static int in_block_group(u64 bytenr,
1587 			  struct btrfs_block_group_cache *block_group)
1588 {
1589 	if (bytenr >= block_group->key.objectid &&
1590 	    bytenr < block_group->key.objectid + block_group->key.offset)
1591 		return 1;
1592 	return 0;
1593 }
1594 
1595 /*
1596  * get new location of data
1597  */
1598 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1599 			    u64 bytenr, u64 num_bytes)
1600 {
1601 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1602 	struct btrfs_path *path;
1603 	struct btrfs_file_extent_item *fi;
1604 	struct extent_buffer *leaf;
1605 	int ret;
1606 
1607 	path = btrfs_alloc_path();
1608 	if (!path)
1609 		return -ENOMEM;
1610 
1611 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1612 	ret = btrfs_lookup_file_extent(NULL, root, path,
1613 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1614 	if (ret < 0)
1615 		goto out;
1616 	if (ret > 0) {
1617 		ret = -ENOENT;
1618 		goto out;
1619 	}
1620 
1621 	leaf = path->nodes[0];
1622 	fi = btrfs_item_ptr(leaf, path->slots[0],
1623 			    struct btrfs_file_extent_item);
1624 
1625 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1626 	       btrfs_file_extent_compression(leaf, fi) ||
1627 	       btrfs_file_extent_encryption(leaf, fi) ||
1628 	       btrfs_file_extent_other_encoding(leaf, fi));
1629 
1630 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1631 		ret = -EINVAL;
1632 		goto out;
1633 	}
1634 
1635 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1636 	ret = 0;
1637 out:
1638 	btrfs_free_path(path);
1639 	return ret;
1640 }
1641 
1642 /*
1643  * update file extent items in the tree leaf to point to
1644  * the new locations.
1645  */
1646 static noinline_for_stack
1647 int replace_file_extents(struct btrfs_trans_handle *trans,
1648 			 struct reloc_control *rc,
1649 			 struct btrfs_root *root,
1650 			 struct extent_buffer *leaf)
1651 {
1652 	struct btrfs_fs_info *fs_info = root->fs_info;
1653 	struct btrfs_key key;
1654 	struct btrfs_file_extent_item *fi;
1655 	struct inode *inode = NULL;
1656 	u64 parent;
1657 	u64 bytenr;
1658 	u64 new_bytenr = 0;
1659 	u64 num_bytes;
1660 	u64 end;
1661 	u32 nritems;
1662 	u32 i;
1663 	int ret = 0;
1664 	int first = 1;
1665 	int dirty = 0;
1666 
1667 	if (rc->stage != UPDATE_DATA_PTRS)
1668 		return 0;
1669 
1670 	/* reloc trees always use full backref */
1671 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1672 		parent = leaf->start;
1673 	else
1674 		parent = 0;
1675 
1676 	nritems = btrfs_header_nritems(leaf);
1677 	for (i = 0; i < nritems; i++) {
1678 		cond_resched();
1679 		btrfs_item_key_to_cpu(leaf, &key, i);
1680 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1681 			continue;
1682 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1683 		if (btrfs_file_extent_type(leaf, fi) ==
1684 		    BTRFS_FILE_EXTENT_INLINE)
1685 			continue;
1686 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1687 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1688 		if (bytenr == 0)
1689 			continue;
1690 		if (!in_block_group(bytenr, rc->block_group))
1691 			continue;
1692 
1693 		/*
1694 		 * if we are modifying a block in the fs tree, wait for readpage
1695 		 * to complete and drop the extent cache
1696 		 */
1697 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1698 			if (first) {
1699 				inode = find_next_inode(root, key.objectid);
1700 				first = 0;
1701 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1702 				btrfs_add_delayed_iput(inode);
1703 				inode = find_next_inode(root, key.objectid);
1704 			}
1705 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1706 				end = key.offset +
1707 				      btrfs_file_extent_num_bytes(leaf, fi);
1708 				WARN_ON(!IS_ALIGNED(key.offset,
1709 						    fs_info->sectorsize));
1710 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1711 				end--;
1712 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1713 						      key.offset, end);
1714 				if (!ret)
1715 					continue;
1716 
1717 				btrfs_drop_extent_cache(BTRFS_I(inode),
1718 						key.offset,	end, 1);
1719 				unlock_extent(&BTRFS_I(inode)->io_tree,
1720 					      key.offset, end);
1721 			}
1722 		}
1723 
1724 		ret = get_new_location(rc->data_inode, &new_bytenr,
1725 				       bytenr, num_bytes);
1726 		if (ret) {
1727 			/*
1728 			 * Don't have to abort since we've not changed anything
1729 			 * in the file extent yet.
1730 			 */
1731 			break;
1732 		}
1733 
1734 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1735 		dirty = 1;
1736 
1737 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1738 		ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1739 					   num_bytes, parent,
1740 					   btrfs_header_owner(leaf),
1741 					   key.objectid, key.offset);
1742 		if (ret) {
1743 			btrfs_abort_transaction(trans, ret);
1744 			break;
1745 		}
1746 
1747 		ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
1748 					parent, btrfs_header_owner(leaf),
1749 					key.objectid, key.offset);
1750 		if (ret) {
1751 			btrfs_abort_transaction(trans, ret);
1752 			break;
1753 		}
1754 	}
1755 	if (dirty)
1756 		btrfs_mark_buffer_dirty(leaf);
1757 	if (inode)
1758 		btrfs_add_delayed_iput(inode);
1759 	return ret;
1760 }
1761 
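/*
 * compare the node key at @slot of @eb with the key at the current
 * slot of @path at @level; returns 0 when they match.
 */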
1762 static noinline_for_stack
1763 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1764 		     struct btrfs_path *path, int level)
1765 {
1766 	struct btrfs_disk_key key1;
1767 	struct btrfs_disk_key key2;
1768 	btrfs_node_key(eb, &key1, slot);
1769 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1770 	return memcmp(&key1, &key2, sizeof(key1));
1771 }
1772 
1773 /*
1774  * try to replace tree blocks in the fs tree with the new blocks
1775  * in the reloc tree. tree blocks that haven't been modified since
1776  * the reloc tree was created can be replaced.
1777  *
1778  * if a block was replaced, the level of the block + 1 is returned.
1779  * if no block was replaced, 0 is returned. if there are other
1780  * errors, a negative error number is returned.
1781  */
1782 static noinline_for_stack
1783 int replace_path(struct btrfs_trans_handle *trans,
1784 		 struct btrfs_root *dest, struct btrfs_root *src,
1785 		 struct btrfs_path *path, struct btrfs_key *next_key,
1786 		 int lowest_level, int max_level)
1787 {
1788 	struct btrfs_fs_info *fs_info = dest->fs_info;
1789 	struct extent_buffer *eb;
1790 	struct extent_buffer *parent;
1791 	struct btrfs_key key;
1792 	u64 old_bytenr;
1793 	u64 new_bytenr;
1794 	u64 old_ptr_gen;
1795 	u64 new_ptr_gen;
1796 	u64 last_snapshot;
1797 	u32 blocksize;
1798 	int cow = 0;
1799 	int level;
1800 	int ret;
1801 	int slot;
1802 
1803 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1804 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1805 
1806 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1807 again:
1808 	slot = path->slots[lowest_level];
1809 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1810 
1811 	eb = btrfs_lock_root_node(dest);
1812 	btrfs_set_lock_blocking(eb);
1813 	level = btrfs_header_level(eb);
1814 
1815 	if (level < lowest_level) {
1816 		btrfs_tree_unlock(eb);
1817 		free_extent_buffer(eb);
1818 		return 0;
1819 	}
1820 
1821 	if (cow) {
1822 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1823 		BUG_ON(ret);
1824 	}
1825 	btrfs_set_lock_blocking(eb);
1826 
1827 	if (next_key) {
1828 		next_key->objectid = (u64)-1;
1829 		next_key->type = (u8)-1;
1830 		next_key->offset = (u64)-1;
1831 	}
1832 
1833 	parent = eb;
1834 	while (1) {
1835 		level = btrfs_header_level(parent);
1836 		BUG_ON(level < lowest_level);
1837 
1838 		ret = btrfs_bin_search(parent, &key, level, &slot);
1839 		if (ret && slot > 0)
1840 			slot--;
1841 
1842 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1843 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1844 
1845 		old_bytenr = btrfs_node_blockptr(parent, slot);
1846 		blocksize = fs_info->nodesize;
1847 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1848 
1849 		if (level <= max_level) {
1850 			eb = path->nodes[level];
1851 			new_bytenr = btrfs_node_blockptr(eb,
1852 							path->slots[level]);
1853 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1854 							path->slots[level]);
1855 		} else {
1856 			new_bytenr = 0;
1857 			new_ptr_gen = 0;
1858 		}
1859 
1860 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1861 			ret = level;
1862 			break;
1863 		}
1864 
1865 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1866 		    memcmp_node_keys(parent, slot, path, level)) {
1867 			if (level <= lowest_level) {
1868 				ret = 0;
1869 				break;
1870 			}
1871 
1872 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
1873 			if (IS_ERR(eb)) {
1874 				ret = PTR_ERR(eb);
1875 				break;
1876 			} else if (!extent_buffer_uptodate(eb)) {
1877 				ret = -EIO;
1878 				free_extent_buffer(eb);
1879 				break;
1880 			}
1881 			btrfs_tree_lock(eb);
1882 			if (cow) {
1883 				ret = btrfs_cow_block(trans, dest, eb, parent,
1884 						      slot, &eb);
1885 				BUG_ON(ret);
1886 			}
1887 			btrfs_set_lock_blocking(eb);
1888 
1889 			btrfs_tree_unlock(parent);
1890 			free_extent_buffer(parent);
1891 
1892 			parent = eb;
1893 			continue;
1894 		}
1895 
1896 		if (!cow) {
1897 			btrfs_tree_unlock(parent);
1898 			free_extent_buffer(parent);
1899 			cow = 1;
1900 			goto again;
1901 		}
1902 
1903 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1904 				      path->slots[level]);
1905 		btrfs_release_path(path);
1906 
1907 		path->lowest_level = level;
1908 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1909 		path->lowest_level = 0;
1910 		BUG_ON(ret);
1911 
1912 		/*
1913 		 * Inform qgroup to trace both subtrees.
1914 		 *
1915 		 * We must trace both trees.
1916 		 * 1) Tree reloc subtree
1917 		 *    If not traced, we will leak data numbers
1918 		 * 2) Fs subtree
1919 		 *    If not traced, we will double count old data
1920 		 *    and tree block numbers if the current trans doesn't free
1921 		 *    the data reloc tree inode.
1922 		 */
1923 		ret = btrfs_qgroup_trace_subtree(trans, src, parent,
1924 				btrfs_header_generation(parent),
1925 				btrfs_header_level(parent));
1926 		if (ret < 0)
1927 			break;
1928 		ret = btrfs_qgroup_trace_subtree(trans, dest,
1929 				path->nodes[level],
1930 				btrfs_header_generation(path->nodes[level]),
1931 				btrfs_header_level(path->nodes[level]));
1932 		if (ret < 0)
1933 			break;
1934 
1935 		/*
1936 		 * swap blocks in fs tree and reloc tree.
1937 		 */
1938 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1939 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1940 		btrfs_mark_buffer_dirty(parent);
1941 
1942 		btrfs_set_node_blockptr(path->nodes[level],
1943 					path->slots[level], old_bytenr);
1944 		btrfs_set_node_ptr_generation(path->nodes[level],
1945 					      path->slots[level], old_ptr_gen);
1946 		btrfs_mark_buffer_dirty(path->nodes[level]);
1947 
1948 		ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr,
1949 					blocksize, path->nodes[level]->start,
1950 					src->root_key.objectid, level - 1, 0);
1951 		BUG_ON(ret);
1952 		ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1953 					blocksize, 0, dest->root_key.objectid,
1954 					level - 1, 0);
1955 		BUG_ON(ret);
1956 
1957 		ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize,
1958 					path->nodes[level]->start,
1959 					src->root_key.objectid, level - 1, 0);
1960 		BUG_ON(ret);
1961 
1962 		ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize,
1963 					0, dest->root_key.objectid, level - 1,
1964 					0);
1965 		BUG_ON(ret);
1966 
1967 		btrfs_unlock_up_safe(path, 0);
1968 
1969 		ret = level;
1970 		break;
1971 	}
1972 	btrfs_tree_unlock(parent);
1973 	free_extent_buffer(parent);
1974 	return ret;
1975 }
1976 
1977 /*
1978  * helper to find next relocated block in reloc tree
1979  */
1980 static noinline_for_stack
1981 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1982 		       int *level)
1983 {
1984 	struct extent_buffer *eb;
1985 	int i;
1986 	u64 last_snapshot;
1987 	u32 nritems;
1988 
1989 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1990 
1991 	for (i = 0; i < *level; i++) {
1992 		free_extent_buffer(path->nodes[i]);
1993 		path->nodes[i] = NULL;
1994 	}
1995 
1996 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1997 		eb = path->nodes[i];
1998 		nritems = btrfs_header_nritems(eb);
1999 		while (path->slots[i] + 1 < nritems) {
2000 			path->slots[i]++;
2001 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2002 			    last_snapshot)
2003 				continue;
2004 
2005 			*level = i;
2006 			return 0;
2007 		}
2008 		free_extent_buffer(path->nodes[i]);
2009 		path->nodes[i] = NULL;
2010 	}
2011 	return 1;
2012 }
2013 
2014 /*
2015  * walk down reloc tree to find relocated block of lowest level
2016  */
2017 static noinline_for_stack
2018 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2019 			 int *level)
2020 {
2021 	struct btrfs_fs_info *fs_info = root->fs_info;
2022 	struct extent_buffer *eb = NULL;
2023 	int i;
2024 	u64 bytenr;
2025 	u64 ptr_gen = 0;
2026 	u64 last_snapshot;
2027 	u32 nritems;
2028 
2029 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2030 
2031 	for (i = *level; i > 0; i--) {
2032 		eb = path->nodes[i];
2033 		nritems = btrfs_header_nritems(eb);
2034 		while (path->slots[i] < nritems) {
2035 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2036 			if (ptr_gen > last_snapshot)
2037 				break;
2038 			path->slots[i]++;
2039 		}
2040 		if (path->slots[i] >= nritems) {
2041 			if (i == *level)
2042 				break;
2043 			*level = i + 1;
2044 			return 0;
2045 		}
2046 		if (i == 1) {
2047 			*level = i;
2048 			return 0;
2049 		}
2050 
2051 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2052 		eb = read_tree_block(fs_info, bytenr, ptr_gen);
2053 		if (IS_ERR(eb)) {
2054 			return PTR_ERR(eb);
2055 		} else if (!extent_buffer_uptodate(eb)) {
2056 			free_extent_buffer(eb);
2057 			return -EIO;
2058 		}
2059 		BUG_ON(btrfs_header_level(eb) != i - 1);
2060 		path->nodes[i - 1] = eb;
2061 		path->slots[i - 1] = 0;
2062 	}
2063 	return 1;
2064 }
2065 
2066 /*
2067  * invalidate the extent cache for file extents whose key is in the
2068  * range [min_key, max_key)
2069  */
2070 static int invalidate_extent_cache(struct btrfs_root *root,
2071 				   struct btrfs_key *min_key,
2072 				   struct btrfs_key *max_key)
2073 {
2074 	struct btrfs_fs_info *fs_info = root->fs_info;
2075 	struct inode *inode = NULL;
2076 	u64 objectid;
2077 	u64 start, end;
2078 	u64 ino;
2079 
2080 	objectid = min_key->objectid;
2081 	while (1) {
2082 		cond_resched();
2083 		iput(inode);
2084 
2085 		if (objectid > max_key->objectid)
2086 			break;
2087 
2088 		inode = find_next_inode(root, objectid);
2089 		if (!inode)
2090 			break;
2091 		ino = btrfs_ino(BTRFS_I(inode));
2092 
2093 		if (ino > max_key->objectid) {
2094 			iput(inode);
2095 			break;
2096 		}
2097 
2098 		objectid = ino + 1;
2099 		if (!S_ISREG(inode->i_mode))
2100 			continue;
2101 
2102 		if (unlikely(min_key->objectid == ino)) {
2103 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2104 				continue;
2105 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2106 				start = 0;
2107 			else {
2108 				start = min_key->offset;
2109 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2110 			}
2111 		} else {
2112 			start = 0;
2113 		}
2114 
2115 		if (unlikely(max_key->objectid == ino)) {
2116 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2117 				continue;
2118 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2119 				end = (u64)-1;
2120 			} else {
2121 				if (max_key->offset == 0)
2122 					continue;
2123 				end = max_key->offset;
2124 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2125 				end--;
2126 			}
2127 		} else {
2128 			end = (u64)-1;
2129 		}
2130 
2131 		/* lock_extent() waits for readpage to complete */
2132 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2133 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2134 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2135 	}
2136 	return 0;
2137 }
2138 
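/*
 * Find the key of the block to the right of the current path position,
 * walking up from @level.  Returns 0 with the key stored in @key, or 1
 * if the path already points at the rightmost block of the tree.
 */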
2139 static int find_next_key(struct btrfs_path *path, int level,
2140 			 struct btrfs_key *key)
2141 
2142 {
2143 	while (level < BTRFS_MAX_LEVEL) {
2144 		if (!path->nodes[level])
2145 			break;
2146 		if (path->slots[level] + 1 <
2147 		    btrfs_header_nritems(path->nodes[level])) {
2148 			btrfs_node_key_to_cpu(path->nodes[level], key,
2149 					      path->slots[level] + 1);
2150 			return 0;
2151 		}
2152 		level++;
2153 	}
2154 	return 1;
2155 }
2156 
2157 /*
2158  * merge the relocated tree blocks in the reloc tree with the
2159  * corresponding fs tree.
2160  */
2161 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2162 					       struct btrfs_root *root)
2163 {
2164 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2165 	LIST_HEAD(inode_list);
2166 	struct btrfs_key key;
2167 	struct btrfs_key next_key;
2168 	struct btrfs_trans_handle *trans = NULL;
2169 	struct btrfs_root *reloc_root;
2170 	struct btrfs_root_item *root_item;
2171 	struct btrfs_path *path;
2172 	struct extent_buffer *leaf;
2173 	int level;
2174 	int max_level;
2175 	int replaced = 0;
2176 	int ret;
2177 	int err = 0;
2178 	u32 min_reserved;
2179 
2180 	path = btrfs_alloc_path();
2181 	if (!path)
2182 		return -ENOMEM;
2183 	path->reada = READA_FORWARD;
2184 
2185 	reloc_root = root->reloc_root;
2186 	root_item = &reloc_root->root_item;
2187 
2188 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2189 		level = btrfs_root_level(root_item);
2190 		extent_buffer_get(reloc_root->node);
2191 		path->nodes[level] = reloc_root->node;
2192 		path->slots[level] = 0;
2193 	} else {
2194 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2195 
2196 		level = root_item->drop_level;
2197 		BUG_ON(level == 0);
2198 		path->lowest_level = level;
2199 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2200 		path->lowest_level = 0;
2201 		if (ret < 0) {
2202 			btrfs_free_path(path);
2203 			return ret;
2204 		}
2205 
2206 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2207 				      path->slots[level]);
2208 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2209 
2210 		btrfs_unlock_up_safe(path, 0);
2211 	}
2212 
2213 	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2214 	memset(&next_key, 0, sizeof(next_key));
2215 
2216 	while (1) {
2217 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2218 					     BTRFS_RESERVE_FLUSH_ALL);
2219 		if (ret) {
2220 			err = ret;
2221 			goto out;
2222 		}
2223 		trans = btrfs_start_transaction(root, 0);
2224 		if (IS_ERR(trans)) {
2225 			err = PTR_ERR(trans);
2226 			trans = NULL;
2227 			goto out;
2228 		}
2229 		trans->block_rsv = rc->block_rsv;
2230 
2231 		replaced = 0;
2232 		max_level = level;
2233 
2234 		ret = walk_down_reloc_tree(reloc_root, path, &level);
2235 		if (ret < 0) {
2236 			err = ret;
2237 			goto out;
2238 		}
2239 		if (ret > 0)
2240 			break;
2241 
2242 		if (!find_next_key(path, level, &key) &&
2243 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2244 			ret = 0;
2245 		} else {
2246 			ret = replace_path(trans, root, reloc_root, path,
2247 					   &next_key, level, max_level);
2248 		}
2249 		if (ret < 0) {
2250 			err = ret;
2251 			goto out;
2252 		}
2253 
2254 		if (ret > 0) {
2255 			level = ret;
2256 			btrfs_node_key_to_cpu(path->nodes[level], &key,
2257 					      path->slots[level]);
2258 			replaced = 1;
2259 		}
2260 
2261 		ret = walk_up_reloc_tree(reloc_root, path, &level);
2262 		if (ret > 0)
2263 			break;
2264 
2265 		BUG_ON(level == 0);
2266 		/*
2267 		 * save the merging progress in the drop_progress.
2268 		 * this is OK since root refs == 1 in this case.
2269 		 */
2270 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2271 			       path->slots[level]);
2272 		root_item->drop_level = level;
2273 
2274 		btrfs_end_transaction_throttle(trans);
2275 		trans = NULL;
2276 
2277 		btrfs_btree_balance_dirty(fs_info);
2278 
2279 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
2280 			invalidate_extent_cache(root, &key, &next_key);
2281 	}
2282 
2283 	/*
2284 	 * handle the case where only one block in the fs tree needs to
2285 	 * be relocated and that block is the tree root.
2286 	 */
2287 	leaf = btrfs_lock_root_node(root);
2288 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2289 	btrfs_tree_unlock(leaf);
2290 	free_extent_buffer(leaf);
2291 	if (ret < 0)
2292 		err = ret;
2293 out:
2294 	btrfs_free_path(path);
2295 
2296 	if (err == 0) {
2297 		memset(&root_item->drop_progress, 0,
2298 		       sizeof(root_item->drop_progress));
2299 		root_item->drop_level = 0;
2300 		btrfs_set_root_refs(root_item, 0);
2301 		btrfs_update_reloc_root(trans, root);
2302 	}
2303 
2304 	if (trans)
2305 		btrfs_end_transaction_throttle(trans);
2306 
2307 	btrfs_btree_balance_dirty(fs_info);
2308 
2309 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
2310 		invalidate_extent_cache(root, &key, &next_key);
2311 
2312 	return err;
2313 }
2314 
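/*
 * Prepare for merging the reloc trees back into their fs trees: reserve
 * space for the merge, set root_item refs of each reloc root to 1 so
 * that btrfs_recover_relocation() can resume an interrupted merge, and
 * commit the transaction to write out the updated reloc root items.
 */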
2315 static noinline_for_stack
2316 int prepare_to_merge(struct reloc_control *rc, int err)
2317 {
2318 	struct btrfs_root *root = rc->extent_root;
2319 	struct btrfs_fs_info *fs_info = root->fs_info;
2320 	struct btrfs_root *reloc_root;
2321 	struct btrfs_trans_handle *trans;
2322 	LIST_HEAD(reloc_roots);
2323 	u64 num_bytes = 0;
2324 	int ret;
2325 
2326 	mutex_lock(&fs_info->reloc_mutex);
2327 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2328 	rc->merging_rsv_size += rc->nodes_relocated * 2;
2329 	mutex_unlock(&fs_info->reloc_mutex);
2330 
2331 again:
2332 	if (!err) {
2333 		num_bytes = rc->merging_rsv_size;
2334 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2335 					  BTRFS_RESERVE_FLUSH_ALL);
2336 		if (ret)
2337 			err = ret;
2338 	}
2339 
2340 	trans = btrfs_join_transaction(rc->extent_root);
2341 	if (IS_ERR(trans)) {
2342 		if (!err)
2343 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2344 						num_bytes);
2345 		return PTR_ERR(trans);
2346 	}
2347 
2348 	if (!err) {
2349 		if (num_bytes != rc->merging_rsv_size) {
2350 			btrfs_end_transaction(trans);
2351 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
2352 						num_bytes);
2353 			goto again;
2354 		}
2355 	}
2356 
2357 	rc->merge_reloc_tree = 1;
2358 
2359 	while (!list_empty(&rc->reloc_roots)) {
2360 		reloc_root = list_entry(rc->reloc_roots.next,
2361 					struct btrfs_root, root_list);
2362 		list_del_init(&reloc_root->root_list);
2363 
2364 		root = read_fs_root(fs_info, reloc_root->root_key.offset);
2365 		BUG_ON(IS_ERR(root));
2366 		BUG_ON(root->reloc_root != reloc_root);
2367 
2368 		/*
2369 		 * set reference count to 1, so btrfs_recover_relocation
2370 		 * knows it should resume merging
2371 		 */
2372 		if (!err)
2373 			btrfs_set_root_refs(&reloc_root->root_item, 1);
2374 		btrfs_update_reloc_root(trans, root);
2375 
2376 		list_add(&reloc_root->root_list, &reloc_roots);
2377 	}
2378 
2379 	list_splice(&reloc_roots, &rc->reloc_roots);
2380 
2381 	if (!err)
2382 		btrfs_commit_transaction(trans);
2383 	else
2384 		btrfs_end_transaction(trans);
2385 	return err;
2386 }
2387 
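/*
 * Release the cached extent buffers of each reloc root on @list and
 * remove the roots from the reloc root rb-tree.  Only used on error
 * paths where the reloc trees won't be merged.
 */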
2388 static noinline_for_stack
2389 void free_reloc_roots(struct list_head *list)
2390 {
2391 	struct btrfs_root *reloc_root;
2392 
2393 	while (!list_empty(list)) {
2394 		reloc_root = list_entry(list->next, struct btrfs_root,
2395 					root_list);
2396 		free_extent_buffer(reloc_root->node);
2397 		free_extent_buffer(reloc_root->commit_root);
2398 		reloc_root->node = NULL;
2399 		reloc_root->commit_root = NULL;
2400 		__del_reloc_root(reloc_root);
2401 	}
2402 }
2403 
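/*
 * Merge all reloc trees into their fs trees and drop them.  Roots whose
 * root_item refs were set to 1 (see prepare_to_merge) are merged via
 * merge_reloc_root(); every reloc root is then deleted through
 * btrfs_drop_snapshot().  The splice/merge loop repeats until no new
 * reloc roots show up while we are merging.
 */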
2404 static noinline_for_stack
2405 void merge_reloc_roots(struct reloc_control *rc)
2406 {
2407 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2408 	struct btrfs_root *root;
2409 	struct btrfs_root *reloc_root;
2410 	LIST_HEAD(reloc_roots);
2411 	int found = 0;
2412 	int ret = 0;
2413 again:
2414 	root = rc->extent_root;
2415 
2416 	/*
2417 	 * this serializes us with btrfs_record_root_in_transaction,
2418 	 * we have to make sure nobody is in the middle of
2419 	 * adding their roots to the list while we are
2420 	 * doing this splice
2421 	 */
2422 	mutex_lock(&fs_info->reloc_mutex);
2423 	list_splice_init(&rc->reloc_roots, &reloc_roots);
2424 	mutex_unlock(&fs_info->reloc_mutex);
2425 
2426 	while (!list_empty(&reloc_roots)) {
2427 		found = 1;
2428 		reloc_root = list_entry(reloc_roots.next,
2429 					struct btrfs_root, root_list);
2430 
2431 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2432 			root = read_fs_root(fs_info,
2433 					    reloc_root->root_key.offset);
2434 			BUG_ON(IS_ERR(root));
2435 			BUG_ON(root->reloc_root != reloc_root);
2436 
2437 			ret = merge_reloc_root(rc, root);
2438 			if (ret) {
2439 				if (list_empty(&reloc_root->root_list))
2440 					list_add_tail(&reloc_root->root_list,
2441 						      &reloc_roots);
2442 				goto out;
2443 			}
2444 		} else {
2445 			list_del_init(&reloc_root->root_list);
2446 		}
2447 
2448 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2449 		if (ret < 0) {
2450 			if (list_empty(&reloc_root->root_list))
2451 				list_add_tail(&reloc_root->root_list,
2452 					      &reloc_roots);
2453 			goto out;
2454 		}
2455 	}
2456 
2457 	if (found) {
2458 		found = 0;
2459 		goto again;
2460 	}
2461 out:
2462 	if (ret) {
2463 		btrfs_handle_fs_error(fs_info, ret, NULL);
2464 		if (!list_empty(&reloc_roots))
2465 			free_reloc_roots(&reloc_roots);
2466 
2467 		/* new reloc root may be added */
2468 		mutex_lock(&fs_info->reloc_mutex);
2469 		list_splice_init(&rc->reloc_roots, &reloc_roots);
2470 		mutex_unlock(&fs_info->reloc_mutex);
2471 		if (!list_empty(&reloc_roots))
2472 			free_reloc_roots(&reloc_roots);
2473 	}
2474 
2475 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2476 }
2477 
2478 static void free_block_list(struct rb_root *blocks)
2479 {
2480 	struct tree_block *block;
2481 	struct rb_node *rb_node;
2482 	while ((rb_node = rb_first(blocks))) {
2483 		block = rb_entry(rb_node, struct tree_block, rb_node);
2484 		rb_erase(rb_node, blocks);
2485 		kfree(block);
2486 	}
2487 }
2488 
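/*
 * Record the fs root that owns @reloc_root in the current transaction
 * (reloc_root->root_key.offset is the objectid of the owning fs root).
 * No-op if the reloc root was already recorded in this transaction.
 */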
2489 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2490 				      struct btrfs_root *reloc_root)
2491 {
2492 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2493 	struct btrfs_root *root;
2494 
2495 	if (reloc_root->last_trans == trans->transid)
2496 		return 0;
2497 
2498 	root = read_fs_root(fs_info, reloc_root->root_key.offset);
2499 	BUG_ON(IS_ERR(root));
2500 	BUG_ON(root->reloc_root != reloc_root);
2501 
2502 	return btrfs_record_root_in_trans(trans, root);
2503 }
2504 
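/*
 * Walk up the backref edges from @node until a tree root is reached and
 * return the reloc root that should be used to COW the block.  As a
 * side effect, the visited nodes are recorded in
 * rc->backref_cache.path[] for btrfs_reloc_cow_block.
 */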
2505 static noinline_for_stack
2506 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2507 				     struct reloc_control *rc,
2508 				     struct backref_node *node,
2509 				     struct backref_edge *edges[])
2510 {
2511 	struct backref_node *next;
2512 	struct btrfs_root *root;
2513 	int index = 0;
2514 
2515 	next = node;
2516 	while (1) {
2517 		cond_resched();
2518 		next = walk_up_backref(next, edges, &index);
2519 		root = next->root;
2520 		BUG_ON(!root);
2521 		BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2522 
2523 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2524 			record_reloc_root_in_trans(trans, root);
2525 			break;
2526 		}
2527 
2528 		btrfs_record_root_in_trans(trans, root);
2529 		root = root->reloc_root;
2530 
2531 		if (next->new_bytenr != root->node->start) {
2532 			BUG_ON(next->new_bytenr);
2533 			BUG_ON(!list_empty(&next->list));
2534 			next->new_bytenr = root->node->start;
2535 			next->root = root;
2536 			list_add_tail(&next->list,
2537 				      &rc->backref_cache.changed);
2538 			__mark_block_processed(rc, next);
2539 			break;
2540 		}
2541 
2542 		WARN_ON(1);
2543 		root = NULL;
2544 		next = walk_down_backref(edges, &index);
2545 		if (!next || next->level <= node->level)
2546 			break;
2547 	}
2548 	if (!root)
2549 		return NULL;
2550 
2551 	next = node;
2552 	/* setup backref node path for btrfs_reloc_cow_block */
2553 	while (1) {
2554 		rc->backref_cache.path[next->level] = next;
2555 		if (--index < 0)
2556 			break;
2557 		next = edges[index]->node[UPPER];
2558 	}
2559 	return root;
2560 }
2561 
2562 /*
2563  * select a tree root for relocation. return NULL if the block is
2564  * reference counted; we should use do_relocation() in that case.
2565  * return a tree root pointer if the block isn't reference counted.
2566  * return -ENOENT if the block is the root of a reloc tree.
2567  */
2568 static noinline_for_stack
2569 struct btrfs_root *select_one_root(struct backref_node *node)
2570 {
2571 	struct backref_node *next;
2572 	struct btrfs_root *root;
2573 	struct btrfs_root *fs_root = NULL;
2574 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2575 	int index = 0;
2576 
2577 	next = node;
2578 	while (1) {
2579 		cond_resched();
2580 		next = walk_up_backref(next, edges, &index);
2581 		root = next->root;
2582 		BUG_ON(!root);
2583 
2584 		/* no other choice for a non-reference counted tree */
2585 		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2586 			return root;
2587 
2588 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2589 			fs_root = root;
2590 
2591 		if (next != node)
2592 			return NULL;
2593 
2594 		next = walk_down_backref(edges, &index);
2595 		if (!next || next->level <= node->level)
2596 			break;
2597 	}
2598 
2599 	if (!fs_root)
2600 		return ERR_PTR(-ENOENT);
2601 	return fs_root;
2602 }
2603 
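/*
 * Estimate the metadata size needed to relocate @node: one nodesize for
 * each not-yet-processed block on the paths from @node up through the
 * backref cache.
 */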
2604 static noinline_for_stack
2605 u64 calcu_metadata_size(struct reloc_control *rc,
2606 			struct backref_node *node, int reserve)
2607 {
2608 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2609 	struct backref_node *next = node;
2610 	struct backref_edge *edge;
2611 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2612 	u64 num_bytes = 0;
2613 	int index = 0;
2614 
2615 	BUG_ON(reserve && node->processed);
2616 
2617 	while (next) {
2618 		cond_resched();
2619 		while (1) {
2620 			if (next->processed && (reserve || next != node))
2621 				break;
2622 
2623 			num_bytes += fs_info->nodesize;
2624 
2625 			if (list_empty(&next->upper))
2626 				break;
2627 
2628 			edge = list_entry(next->upper.next,
2629 					  struct backref_edge, list[LOWER]);
2630 			edges[index++] = edge;
2631 			next = edge->node[UPPER];
2632 		}
2633 		next = walk_down_backref(edges, &index);
2634 	}
2635 	return num_bytes;
2636 }
2637 
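/*
 * Reserve metadata space for relocating @node.  On failure the rsv size
 * is grown and -EAGAIN is returned, so the caller can end the
 * transaction and retry once the reservation can be refilled with full
 * flushing.
 */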
2638 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2639 				  struct reloc_control *rc,
2640 				  struct backref_node *node)
2641 {
2642 	struct btrfs_root *root = rc->extent_root;
2643 	struct btrfs_fs_info *fs_info = root->fs_info;
2644 	u64 num_bytes;
2645 	int ret;
2646 	u64 tmp;
2647 
2648 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2649 
2650 	trans->block_rsv = rc->block_rsv;
2651 	rc->reserved_bytes += num_bytes;
2652 
2653 	/*
2654 	 * We are under a transaction here so we can only do limited flushing.
2655 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2656 	 * transaction and try to refill when we can flush all the things.
2657 	 */
2658 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2659 				BTRFS_RESERVE_FLUSH_LIMIT);
2660 	if (ret) {
2661 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2662 		while (tmp <= rc->reserved_bytes)
2663 			tmp <<= 1;
2664 		/*
2665 		 * only one thread can access block_rsv at this point,
2666 		 * so we don't need to hold a lock to protect block_rsv.
2667 		 * we expand the reservation size here to allow enough
2668 		 * space for relocation, and we will return earlier in
2669 		 * the enospc case.
2670 		 */
2671 		rc->block_rsv->size = tmp + fs_info->nodesize *
2672 				      RELOCATION_RESERVED_NODES;
2673 		return -EAGAIN;
2674 	}
2675 
2676 	return 0;
2677 }
2678 
2679 /*
2680  * relocate a tree block, and then update pointers in upper level
2681  * blocks that reference the block so they point to the new location.
2682  *
2683  * if called by link_to_upper, the block has already been relocated;
2684  * in that case this function just updates pointers.
2685  */
2686 static int do_relocation(struct btrfs_trans_handle *trans,
2687 			 struct reloc_control *rc,
2688 			 struct backref_node *node,
2689 			 struct btrfs_key *key,
2690 			 struct btrfs_path *path, int lowest)
2691 {
2692 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2693 	struct backref_node *upper;
2694 	struct backref_edge *edge;
2695 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2696 	struct btrfs_root *root;
2697 	struct extent_buffer *eb;
2698 	u32 blocksize;
2699 	u64 bytenr;
2700 	u64 generation;
2701 	int slot;
2702 	int ret;
2703 	int err = 0;
2704 
2705 	BUG_ON(lowest && node->eb);
2706 
2707 	path->lowest_level = node->level + 1;
2708 	rc->backref_cache.path[node->level] = node;
2709 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2710 		cond_resched();
2711 
2712 		upper = edge->node[UPPER];
2713 		root = select_reloc_root(trans, rc, upper, edges);
2714 		BUG_ON(!root);
2715 
2716 		if (upper->eb && !upper->locked) {
2717 			if (!lowest) {
2718 				ret = btrfs_bin_search(upper->eb, key,
2719 						       upper->level, &slot);
2720 				BUG_ON(ret);
2721 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2722 				if (node->eb->start == bytenr)
2723 					goto next;
2724 			}
2725 			drop_node_buffer(upper);
2726 		}
2727 
2728 		if (!upper->eb) {
2729 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730 			if (ret) {
2731 				if (ret < 0)
2732 					err = ret;
2733 				else
2734 					err = -ENOENT;
2735 
2736 				btrfs_release_path(path);
2737 				break;
2738 			}
2739 
2740 			if (!upper->eb) {
2741 				upper->eb = path->nodes[upper->level];
2742 				path->nodes[upper->level] = NULL;
2743 			} else {
2744 				BUG_ON(upper->eb != path->nodes[upper->level]);
2745 			}
2746 
2747 			upper->locked = 1;
2748 			path->locks[upper->level] = 0;
2749 
2750 			slot = path->slots[upper->level];
2751 			btrfs_release_path(path);
2752 		} else {
2753 			ret = btrfs_bin_search(upper->eb, key, upper->level,
2754 					       &slot);
2755 			BUG_ON(ret);
2756 		}
2757 
2758 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2759 		if (lowest) {
2760 			if (bytenr != node->bytenr) {
2761 				btrfs_err(root->fs_info,
2762 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2763 					  bytenr, node->bytenr, slot,
2764 					  upper->eb->start);
2765 				err = -EIO;
2766 				goto next;
2767 			}
2768 		} else {
2769 			if (node->eb->start == bytenr)
2770 				goto next;
2771 		}
2772 
2773 		blocksize = root->fs_info->nodesize;
2774 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2775 		eb = read_tree_block(fs_info, bytenr, generation);
2776 		if (IS_ERR(eb)) {
2777 			err = PTR_ERR(eb);
2778 			goto next;
2779 		} else if (!extent_buffer_uptodate(eb)) {
2780 			free_extent_buffer(eb);
2781 			err = -EIO;
2782 			goto next;
2783 		}
2784 		btrfs_tree_lock(eb);
2785 		btrfs_set_lock_blocking(eb);
2786 
2787 		if (!node->eb) {
2788 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2789 					      slot, &eb);
2790 			btrfs_tree_unlock(eb);
2791 			free_extent_buffer(eb);
2792 			if (ret < 0) {
2793 				err = ret;
2794 				goto next;
2795 			}
2796 			BUG_ON(node->eb != eb);
2797 		} else {
2798 			btrfs_set_node_blockptr(upper->eb, slot,
2799 						node->eb->start);
2800 			btrfs_set_node_ptr_generation(upper->eb, slot,
2801 						      trans->transid);
2802 			btrfs_mark_buffer_dirty(upper->eb);
2803 
2804 			ret = btrfs_inc_extent_ref(trans, root->fs_info,
2805 						node->eb->start, blocksize,
2806 						upper->eb->start,
2807 						btrfs_header_owner(upper->eb),
2808 						node->level, 0);
2809 			BUG_ON(ret);
2810 
2811 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2812 			BUG_ON(ret);
2813 		}
2814 next:
2815 		if (!upper->pending)
2816 			drop_node_buffer(upper);
2817 		else
2818 			unlock_node_buffer(upper);
2819 		if (err)
2820 			break;
2821 	}
2822 
2823 	if (!err && node->pending) {
2824 		drop_node_buffer(node);
2825 		list_move_tail(&node->list, &rc->backref_cache.changed);
2826 		node->pending = 0;
2827 	}
2828 
2829 	path->lowest_level = 0;
2830 	BUG_ON(err == -ENOSPC);
2831 	return err;
2832 }
2833 
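/*
 * Update the pointers in upper level blocks that reference @node; the
 * block itself has already been relocated, so do_relocation() is called
 * with lowest == 0 and only fixes up the parents.
 */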
2834 static int link_to_upper(struct btrfs_trans_handle *trans,
2835 			 struct reloc_control *rc,
2836 			 struct backref_node *node,
2837 			 struct btrfs_path *path)
2838 {
2839 	struct btrfs_key key;
2840 
2841 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2842 	return do_relocation(trans, rc, node, &key, path, 0);
2843 }
2844 
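/*
 * Drain the pending lists of the backref cache, linking each pending
 * block to its upper level blocks.  Once an error is hit, the remaining
 * nodes are only moved back onto the pending lists, not linked.
 */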
2845 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2846 				struct reloc_control *rc,
2847 				struct btrfs_path *path, int err)
2848 {
2849 	LIST_HEAD(list);
2850 	struct backref_cache *cache = &rc->backref_cache;
2851 	struct backref_node *node;
2852 	int level;
2853 	int ret;
2854 
2855 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2856 		while (!list_empty(&cache->pending[level])) {
2857 			node = list_entry(cache->pending[level].next,
2858 					  struct backref_node, list);
2859 			list_move_tail(&node->list, &list);
2860 			BUG_ON(!node->pending);
2861 
2862 			if (!err) {
2863 				ret = link_to_upper(trans, rc, node, path);
2864 				if (ret < 0)
2865 					err = ret;
2866 			}
2867 		}
2868 		list_splice_init(&list, &cache->pending[level]);
2869 	}
2870 	return err;
2871 }
2872 
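/*
 * Processed blocks are tracked as EXTENT_DIRTY ranges in
 * rc->processed_blocks; tree_block_processed() below does the lookup.
 */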
2873 static void mark_block_processed(struct reloc_control *rc,
2874 				 u64 bytenr, u32 blocksize)
2875 {
2876 	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2877 			EXTENT_DIRTY);
2878 }
2879 
2880 static void __mark_block_processed(struct reloc_control *rc,
2881 				   struct backref_node *node)
2882 {
2883 	u32 blocksize;
2884 	if (node->level == 0 ||
2885 	    in_block_group(node->bytenr, rc->block_group)) {
2886 		blocksize = rc->extent_root->fs_info->nodesize;
2887 		mark_block_processed(rc, node->bytenr, blocksize);
2888 	}
2889 	node->processed = 1;
2890 }
2891 
2892 /*
2893  * mark a block, and all blocks that directly/indirectly reference the
2894  * block, as processed.
2895  */
2896 static void update_processed_blocks(struct reloc_control *rc,
2897 				    struct backref_node *node)
2898 {
2899 	struct backref_node *next = node;
2900 	struct backref_edge *edge;
2901 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2902 	int index = 0;
2903 
2904 	while (next) {
2905 		cond_resched();
2906 		while (1) {
2907 			if (next->processed)
2908 				break;
2909 
2910 			__mark_block_processed(rc, next);
2911 
2912 			if (list_empty(&next->upper))
2913 				break;
2914 
2915 			edge = list_entry(next->upper.next,
2916 					  struct backref_edge, list[LOWER]);
2917 			edges[index++] = edge;
2918 			next = edge->node[UPPER];
2919 		}
2920 		next = walk_down_backref(edges, &index);
2921 	}
2922 }
2923 
2924 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2925 {
2926 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2927 
2928 	if (test_range_bit(&rc->processed_blocks, bytenr,
2929 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2930 		return 1;
2931 	return 0;
2932 }
2933 
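/*
 * Read the first key of a tree block into block->key.  Until now
 * block->key.offset held the expected generation (stashed there by
 * add_tree_block), which read_tree_block() verifies.
 */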
2934 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2935 			      struct tree_block *block)
2936 {
2937 	struct extent_buffer *eb;
2938 
2939 	BUG_ON(block->key_ready);
2940 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
2941 	if (IS_ERR(eb)) {
2942 		return PTR_ERR(eb);
2943 	} else if (!extent_buffer_uptodate(eb)) {
2944 		free_extent_buffer(eb);
2945 		return -EIO;
2946 	}
2947 	WARN_ON(btrfs_header_level(eb) != block->level);
2948 	if (block->level == 0)
2949 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2950 	else
2951 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2952 	free_extent_buffer(eb);
2953 	block->key_ready = 1;
2954 	return 0;
2955 }
2956 
2957 /*
2958  * helper function to relocate a tree block
2959  */
2960 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2961 				struct reloc_control *rc,
2962 				struct backref_node *node,
2963 				struct btrfs_key *key,
2964 				struct btrfs_path *path)
2965 {
2966 	struct btrfs_root *root;
2967 	int ret = 0;
2968 
2969 	if (!node)
2970 		return 0;
2971 
2972 	BUG_ON(node->processed);
2973 	root = select_one_root(node);
2974 	if (root == ERR_PTR(-ENOENT)) {
2975 		update_processed_blocks(rc, node);
2976 		goto out;
2977 	}
2978 
2979 	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2980 		ret = reserve_metadata_space(trans, rc, node);
2981 		if (ret)
2982 			goto out;
2983 	}
2984 
2985 	if (root) {
2986 		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2987 			BUG_ON(node->new_bytenr);
2988 			BUG_ON(!list_empty(&node->list));
2989 			btrfs_record_root_in_trans(trans, root);
2990 			root = root->reloc_root;
2991 			node->new_bytenr = root->node->start;
2992 			node->root = root;
2993 			list_add_tail(&node->list, &rc->backref_cache.changed);
2994 		} else {
2995 			path->lowest_level = node->level;
2996 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2997 			btrfs_release_path(path);
2998 			if (ret > 0)
2999 				ret = 0;
3000 		}
3001 		if (!ret)
3002 			update_processed_blocks(rc, node);
3003 	} else {
3004 		ret = do_relocation(trans, rc, node, key, path, 1);
3005 	}
3006 out:
3007 	if (ret || node->level == 0 || node->cowonly)
3008 		remove_backref_node(&rc->backref_cache, node);
3009 	return ret;
3010 }
3011 
3012 /*
3013  * relocate a list of blocks
3014  */
3015 static noinline_for_stack
3016 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3017 			 struct reloc_control *rc, struct rb_root *blocks)
3018 {
3019 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3020 	struct backref_node *node;
3021 	struct btrfs_path *path;
3022 	struct tree_block *block;
3023 	struct rb_node *rb_node;
3024 	int ret;
3025 	int err = 0;
3026 
3027 	path = btrfs_alloc_path();
3028 	if (!path) {
3029 		err = -ENOMEM;
3030 		goto out_free_blocks;
3031 	}
3032 
3033 	rb_node = rb_first(blocks);
3034 	while (rb_node) {
3035 		block = rb_entry(rb_node, struct tree_block, rb_node);
3036 		if (!block->key_ready)
3037 			readahead_tree_block(fs_info, block->bytenr);
3038 		rb_node = rb_next(rb_node);
3039 	}
3040 
3041 	rb_node = rb_first(blocks);
3042 	while (rb_node) {
3043 		block = rb_entry(rb_node, struct tree_block, rb_node);
3044 		if (!block->key_ready) {
3045 			err = get_tree_block_key(fs_info, block);
3046 			if (err)
3047 				goto out_free_path;
3048 		}
3049 		rb_node = rb_next(rb_node);
3050 	}
3051 
3052 	rb_node = rb_first(blocks);
3053 	while (rb_node) {
3054 		block = rb_entry(rb_node, struct tree_block, rb_node);
3055 
3056 		node = build_backref_tree(rc, &block->key,
3057 					  block->level, block->bytenr);
3058 		if (IS_ERR(node)) {
3059 			err = PTR_ERR(node);
3060 			goto out;
3061 		}
3062 
3063 		ret = relocate_tree_block(trans, rc, node, &block->key,
3064 					  path);
3065 		if (ret < 0) {
3066 			if (ret != -EAGAIN || rb_node == rb_first(blocks))
3067 				err = ret;
3068 			goto out;
3069 		}
3070 		rb_node = rb_next(rb_node);
3071 	}
3072 out:
3073 	err = finish_pending_nodes(trans, rc, path, err);
3074 
3075 out_free_path:
3076 	btrfs_free_path(path);
3077 out_free_blocks:
3078 	free_block_list(blocks);
3079 	return err;
3080 }
3081 
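/*
 * Preallocate file ranges in the data reloc inode for every extent in
 * @cluster.  BTRFS_I(inode)->index_cnt holds the start of the block
 * group being relocated, so a disk bytenr maps to a file offset by
 * subtracting it.
 */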
3082 static noinline_for_stack
3083 int prealloc_file_extent_cluster(struct inode *inode,
3084 				 struct file_extent_cluster *cluster)
3085 {
3086 	u64 alloc_hint = 0;
3087 	u64 start;
3088 	u64 end;
3089 	u64 offset = BTRFS_I(inode)->index_cnt;
3090 	u64 num_bytes;
3091 	int nr = 0;
3092 	int ret = 0;
3093 	u64 prealloc_start = cluster->start - offset;
3094 	u64 prealloc_end = cluster->end - offset;
3095 	u64 cur_offset;
3096 	struct extent_changeset *data_reserved = NULL;
3097 
3098 	BUG_ON(cluster->start != cluster->boundary[0]);
3099 	inode_lock(inode);
3100 
3101 	ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3102 					  prealloc_end + 1 - prealloc_start);
3103 	if (ret)
3104 		goto out;
3105 
3106 	cur_offset = prealloc_start;
3107 	while (nr < cluster->nr) {
3108 		start = cluster->boundary[nr] - offset;
3109 		if (nr + 1 < cluster->nr)
3110 			end = cluster->boundary[nr + 1] - 1 - offset;
3111 		else
3112 			end = cluster->end - offset;
3113 
3114 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3115 		num_bytes = end + 1 - start;
3116 		if (cur_offset < start)
3117 			btrfs_free_reserved_data_space(inode, data_reserved,
3118 					cur_offset, start - cur_offset);
3119 		ret = btrfs_prealloc_file_range(inode, 0, start,
3120 						num_bytes, num_bytes,
3121 						end + 1, &alloc_hint);
3122 		cur_offset = end + 1;
3123 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3124 		if (ret)
3125 			break;
3126 		nr++;
3127 	}
3128 	if (cur_offset < prealloc_end)
3129 		btrfs_free_reserved_data_space(inode, data_reserved,
3130 				cur_offset, prealloc_end + 1 - cur_offset);
3131 out:
3132 	inode_unlock(inode);
3133 	extent_changeset_free(data_reserved);
3134 	return ret;
3135 }
3136 
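/*
 * Insert a single pinned extent map covering [start, end] that maps the
 * file range straight to @block_start, dropping any cached extent
 * mappings that collide with it.
 */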
3137 static noinline_for_stack
3138 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3139 			 u64 block_start)
3140 {
3141 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3142 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3143 	struct extent_map *em;
3144 	int ret = 0;
3145 
3146 	em = alloc_extent_map();
3147 	if (!em)
3148 		return -ENOMEM;
3149 
3150 	em->start = start;
3151 	em->len = end + 1 - start;
3152 	em->block_len = em->len;
3153 	em->block_start = block_start;
3154 	em->bdev = fs_info->fs_devices->latest_bdev;
3155 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
3156 
3157 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3158 	while (1) {
3159 		write_lock(&em_tree->lock);
3160 		ret = add_extent_mapping(em_tree, em, 0);
3161 		write_unlock(&em_tree->lock);
3162 		if (ret != -EEXIST) {
3163 			free_extent_map(em);
3164 			break;
3165 		}
3166 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3167 	}
3168 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3169 	return ret;
3170 }
3171 
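/*
 * Copy the data of a cluster by reading each page of the cluster into
 * the data reloc inode and dirtying it as delalloc, so that writeback
 * fills the preallocated extents at the new location.  Cluster
 * boundaries are marked EXTENT_BOUNDARY so the new extents are not
 * merged and the original extent layout is preserved.
 */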
3172 static int relocate_file_extent_cluster(struct inode *inode,
3173 					struct file_extent_cluster *cluster)
3174 {
3175 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3176 	u64 page_start;
3177 	u64 page_end;
3178 	u64 offset = BTRFS_I(inode)->index_cnt;
3179 	unsigned long index;
3180 	unsigned long last_index;
3181 	struct page *page;
3182 	struct file_ra_state *ra;
3183 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3184 	int nr = 0;
3185 	int ret = 0;
3186 
3187 	if (!cluster->nr)
3188 		return 0;
3189 
3190 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
3191 	if (!ra)
3192 		return -ENOMEM;
3193 
3194 	ret = prealloc_file_extent_cluster(inode, cluster);
3195 	if (ret)
3196 		goto out;
3197 
3198 	file_ra_state_init(ra, inode->i_mapping);
3199 
3200 	ret = setup_extent_mapping(inode, cluster->start - offset,
3201 				   cluster->end - offset, cluster->start);
3202 	if (ret)
3203 		goto out;
3204 
3205 	index = (cluster->start - offset) >> PAGE_SHIFT;
3206 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
3207 	while (index <= last_index) {
3208 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3209 				PAGE_SIZE);
3210 		if (ret)
3211 			goto out;
3212 
3213 		page = find_lock_page(inode->i_mapping, index);
3214 		if (!page) {
3215 			page_cache_sync_readahead(inode->i_mapping,
3216 						  ra, NULL, index,
3217 						  last_index + 1 - index);
3218 			page = find_or_create_page(inode->i_mapping, index,
3219 						   mask);
3220 			if (!page) {
3221 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3222 							PAGE_SIZE);
3223 				ret = -ENOMEM;
3224 				goto out;
3225 			}
3226 		}
3227 
3228 		if (PageReadahead(page)) {
3229 			page_cache_async_readahead(inode->i_mapping,
3230 						   ra, NULL, page, index,
3231 						   last_index + 1 - index);
3232 		}
3233 
3234 		if (!PageUptodate(page)) {
3235 			btrfs_readpage(NULL, page);
3236 			lock_page(page);
3237 			if (!PageUptodate(page)) {
3238 				unlock_page(page);
3239 				put_page(page);
3240 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
3241 							PAGE_SIZE);
3242 				ret = -EIO;
3243 				goto out;
3244 			}
3245 		}
3246 
3247 		page_start = page_offset(page);
3248 		page_end = page_start + PAGE_SIZE - 1;
3249 
3250 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3251 
3252 		set_page_extent_mapped(page);
3253 
3254 		if (nr < cluster->nr &&
3255 		    page_start + offset == cluster->boundary[nr]) {
3256 			set_extent_bits(&BTRFS_I(inode)->io_tree,
3257 					page_start, page_end,
3258 					EXTENT_BOUNDARY);
3259 			nr++;
3260 		}
3261 
3262 		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
3263 		set_page_dirty(page);
3264 
3265 		unlock_extent(&BTRFS_I(inode)->io_tree,
3266 			      page_start, page_end);
3267 		unlock_page(page);
3268 		put_page(page);
3269 
3270 		index++;
3271 		balance_dirty_pages_ratelimited(inode->i_mapping);
3272 		btrfs_throttle(fs_info);
3273 	}
3274 	WARN_ON(nr != cluster->nr);
3275 out:
3276 	kfree(ra);
3277 	return ret;
3278 }
3279 
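/*
 * Accumulate contiguous data extents into @cluster; flush the cluster
 * via relocate_file_extent_cluster() when the new extent isn't adjacent
 * to the cluster end or when MAX_EXTENTS is reached.
 */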
3280 static noinline_for_stack
3281 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3282 			 struct file_extent_cluster *cluster)
3283 {
3284 	int ret;
3285 
3286 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3287 		ret = relocate_file_extent_cluster(inode, cluster);
3288 		if (ret)
3289 			return ret;
3290 		cluster->nr = 0;
3291 	}
3292 
3293 	if (!cluster->nr)
3294 		cluster->start = extent_key->objectid;
3295 	else
3296 		BUG_ON(cluster->nr >= MAX_EXTENTS);
3297 	cluster->end = extent_key->objectid + extent_key->offset - 1;
3298 	cluster->boundary[cluster->nr] = extent_key->objectid;
3299 	cluster->nr++;
3300 
3301 	if (cluster->nr >= MAX_EXTENTS) {
3302 		ret = relocate_file_extent_cluster(inode, cluster);
3303 		if (ret)
3304 			return ret;
3305 		cluster->nr = 0;
3306 	}
3307 	return 0;
3308 }
3309 
3310 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3311 static int get_ref_objectid_v0(struct reloc_control *rc,
3312 			       struct btrfs_path *path,
3313 			       struct btrfs_key *extent_key,
3314 			       u64 *ref_objectid, int *path_change)
3315 {
3316 	struct btrfs_key key;
3317 	struct extent_buffer *leaf;
3318 	struct btrfs_extent_ref_v0 *ref0;
3319 	int ret;
3320 	int slot;
3321 
3322 	leaf = path->nodes[0];
3323 	slot = path->slots[0];
3324 	while (1) {
3325 		if (slot >= btrfs_header_nritems(leaf)) {
3326 			ret = btrfs_next_leaf(rc->extent_root, path);
3327 			if (ret < 0)
3328 				return ret;
3329 			BUG_ON(ret > 0);
3330 			leaf = path->nodes[0];
3331 			slot = path->slots[0];
3332 			if (path_change)
3333 				*path_change = 1;
3334 		}
3335 		btrfs_item_key_to_cpu(leaf, &key, slot);
3336 		if (key.objectid != extent_key->objectid)
3337 			return -ENOENT;
3338 
3339 		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3340 			slot++;
3341 			continue;
3342 		}
3343 		ref0 = btrfs_item_ptr(leaf, slot,
3344 				struct btrfs_extent_ref_v0);
3345 		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3346 		break;
3347 	}
3348 	return 0;
3349 }
3350 #endif
3351 
3352 /*
3353  * helper to add a tree block to the list.
3354  * the major work is getting the generation and level of the block
3355  */
3356 static int add_tree_block(struct reloc_control *rc,
3357 			  struct btrfs_key *extent_key,
3358 			  struct btrfs_path *path,
3359 			  struct rb_root *blocks)
3360 {
3361 	struct extent_buffer *eb;
3362 	struct btrfs_extent_item *ei;
3363 	struct btrfs_tree_block_info *bi;
3364 	struct tree_block *block;
3365 	struct rb_node *rb_node;
3366 	u32 item_size;
3367 	int level = -1;
3368 	u64 generation;
3369 
3370 	eb =  path->nodes[0];
3371 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
3372 
3373 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3374 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
3375 		ei = btrfs_item_ptr(eb, path->slots[0],
3376 				struct btrfs_extent_item);
3377 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3378 			bi = (struct btrfs_tree_block_info *)(ei + 1);
3379 			level = btrfs_tree_block_level(eb, bi);
3380 		} else {
3381 			level = (int)extent_key->offset;
3382 		}
3383 		generation = btrfs_extent_generation(eb, ei);
3384 	} else {
3385 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3386 		u64 ref_owner;
3387 		int ret;
3388 
3389 		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3390 		ret = get_ref_objectid_v0(rc, path, extent_key,
3391 					  &ref_owner, NULL);
3392 		if (ret < 0)
3393 			return ret;
3394 		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3395 		level = (int)ref_owner;
3396 		/* FIXME: get real generation */
3397 		generation = 0;
3398 #else
3399 		BUG();
3400 #endif
3401 	}
3402 
3403 	btrfs_release_path(path);
3404 
3405 	BUG_ON(level == -1);
3406 
3407 	block = kmalloc(sizeof(*block), GFP_NOFS);
3408 	if (!block)
3409 		return -ENOMEM;
3410 
3411 	block->bytenr = extent_key->objectid;
3412 	block->key.objectid = rc->extent_root->fs_info->nodesize;
3413 	block->key.offset = generation;
3414 	block->level = level;
3415 	block->key_ready = 0;
3416 
3417 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3418 	if (rb_node)
3419 		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3420 
3421 	return 0;
3422 }
3423 
3424 /*
3425  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3426  */
3427 static int __add_tree_block(struct reloc_control *rc,
3428 			    u64 bytenr, u32 blocksize,
3429 			    struct rb_root *blocks)
3430 {
3431 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3432 	struct btrfs_path *path;
3433 	struct btrfs_key key;
3434 	int ret;
3435 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3436 
3437 	if (tree_block_processed(bytenr, rc))
3438 		return 0;
3439 
3440 	if (tree_search(blocks, bytenr))
3441 		return 0;
3442 
3443 	path = btrfs_alloc_path();
3444 	if (!path)
3445 		return -ENOMEM;
3446 again:
3447 	key.objectid = bytenr;
3448 	if (skinny) {
3449 		key.type = BTRFS_METADATA_ITEM_KEY;
3450 		key.offset = (u64)-1;
3451 	} else {
3452 		key.type = BTRFS_EXTENT_ITEM_KEY;
3453 		key.offset = blocksize;
3454 	}
3455 
3456 	path->search_commit_root = 1;
3457 	path->skip_locking = 1;
3458 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3459 	if (ret < 0)
3460 		goto out;
3461 
3462 	if (ret > 0 && skinny) {
3463 		if (path->slots[0]) {
3464 			path->slots[0]--;
3465 			btrfs_item_key_to_cpu(path->nodes[0], &key,
3466 					      path->slots[0]);
3467 			if (key.objectid == bytenr &&
3468 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
3469 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
3470 			      key.offset == blocksize)))
3471 				ret = 0;
3472 		}
3473 
3474 		if (ret) {
3475 			skinny = false;
3476 			btrfs_release_path(path);
3477 			goto again;
3478 		}
3479 	}
3480 	BUG_ON(ret);
3481 
3482 	ret = add_tree_block(rc, &key, path, blocks);
3483 out:
3484 	btrfs_free_path(path);
3485 	return ret;
3486 }
3487 
3488 /*
3489  * helper to check if the block uses full backrefs for the pointers in it
3490  */
3491 static int block_use_full_backref(struct reloc_control *rc,
3492 				  struct extent_buffer *eb)
3493 {
3494 	u64 flags;
3495 	int ret;
3496 
3497 	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3498 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3499 		return 1;
3500 
3501 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3502 				       eb->start, btrfs_header_level(eb), 1,
3503 				       NULL, &flags);
3504 	BUG_ON(ret);
3505 
3506 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3507 		ret = 1;
3508 	else
3509 		ret = 0;
3510 	return ret;
3511 }
3512 
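/*
 * Truncate a free space cache inode (looked up by @ino when @inode is
 * NULL).  Dropping the cache releases its data extents, so relocation
 * doesn't have to move them; the cache is rebuilt later on.
 */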
3513 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3514 				    struct btrfs_block_group_cache *block_group,
3515 				    struct inode *inode,
3516 				    u64 ino)
3517 {
3518 	struct btrfs_key key;
3519 	struct btrfs_root *root = fs_info->tree_root;
3520 	struct btrfs_trans_handle *trans;
3521 	int ret = 0;
3522 
3523 	if (inode)
3524 		goto truncate;
3525 
3526 	key.objectid = ino;
3527 	key.type = BTRFS_INODE_ITEM_KEY;
3528 	key.offset = 0;
3529 
3530 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3531 	if (IS_ERR(inode) || is_bad_inode(inode)) {
3532 		if (!IS_ERR(inode))
3533 			iput(inode);
3534 		return -ENOENT;
3535 	}
3536 
3537 truncate:
3538 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3539 						 &fs_info->global_block_rsv);
3540 	if (ret)
3541 		goto out;
3542 
3543 	trans = btrfs_join_transaction(root);
3544 	if (IS_ERR(trans)) {
3545 		ret = PTR_ERR(trans);
3546 		goto out;
3547 	}
3548 
3549 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3550 
3551 	btrfs_end_transaction(trans);
3552 	btrfs_btree_balance_dirty(fs_info);
3553 out:
3554 	iput(inode);
3555 	return ret;
3556 }
3557 
3558 /*
3559  * helper to add tree blocks for a backref of type BTRFS_EXTENT_DATA_REF_KEY.
3560  * this function scans the fs tree to find blocks that reference the data extent
3561  */
3562 static int find_data_references(struct reloc_control *rc,
3563 				struct btrfs_key *extent_key,
3564 				struct extent_buffer *leaf,
3565 				struct btrfs_extent_data_ref *ref,
3566 				struct rb_root *blocks)
3567 {
3568 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3569 	struct btrfs_path *path;
3570 	struct tree_block *block;
3571 	struct btrfs_root *root;
3572 	struct btrfs_file_extent_item *fi;
3573 	struct rb_node *rb_node;
3574 	struct btrfs_key key;
3575 	u64 ref_root;
3576 	u64 ref_objectid;
3577 	u64 ref_offset;
3578 	u32 ref_count;
3579 	u32 nritems;
3580 	int err = 0;
3581 	int added = 0;
3582 	int counted;
3583 	int ret;
3584 
3585 	ref_root = btrfs_extent_data_ref_root(leaf, ref);
3586 	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3587 	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3588 	ref_count = btrfs_extent_data_ref_count(leaf, ref);
3589 
3590 	/*
3591 	 * This is an extent belonging to the free space cache; let's just delete
3592 	 * it and redo the search.
3593 	 */
3594 	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3595 		ret = delete_block_group_cache(fs_info, rc->block_group,
3596 					       NULL, ref_objectid);
3597 		if (ret != -ENOENT)
3598 			return ret;
3599 		ret = 0;
3600 	}
3601 
3602 	path = btrfs_alloc_path();
3603 	if (!path)
3604 		return -ENOMEM;
3605 	path->reada = READA_FORWARD;
3606 
3607 	root = read_fs_root(fs_info, ref_root);
3608 	if (IS_ERR(root)) {
3609 		err = PTR_ERR(root);
3610 		goto out;
3611 	}
3612 
3613 	key.objectid = ref_objectid;
3614 	key.type = BTRFS_EXTENT_DATA_KEY;
3615 	if (ref_offset > ((u64)-1 << 32))
3616 		key.offset = 0;
3617 	else
3618 		key.offset = ref_offset;
3619 
3620 	path->search_commit_root = 1;
3621 	path->skip_locking = 1;
3622 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3623 	if (ret < 0) {
3624 		err = ret;
3625 		goto out;
3626 	}
3627 
3628 	leaf = path->nodes[0];
3629 	nritems = btrfs_header_nritems(leaf);
3630 	/*
3631 	 * references from tree blocks that use full backrefs
3632 	 * are not counted here
3633 	 */
3634 	if (block_use_full_backref(rc, leaf))
3635 		counted = 0;
3636 	else
3637 		counted = 1;
3638 	rb_node = tree_search(blocks, leaf->start);
3639 	if (rb_node) {
3640 		if (counted)
3641 			added = 1;
3642 		else
3643 			path->slots[0] = nritems;
3644 	}
3645 
3646 	while (ref_count > 0) {
3647 		while (path->slots[0] >= nritems) {
3648 			ret = btrfs_next_leaf(root, path);
3649 			if (ret < 0) {
3650 				err = ret;
3651 				goto out;
3652 			}
3653 			if (WARN_ON(ret > 0))
3654 				goto out;
3655 
3656 			leaf = path->nodes[0];
3657 			nritems = btrfs_header_nritems(leaf);
3658 			added = 0;
3659 
3660 			if (block_use_full_backref(rc, leaf))
3661 				counted = 0;
3662 			else
3663 				counted = 1;
3664 			rb_node = tree_search(blocks, leaf->start);
3665 			if (rb_node) {
3666 				if (counted)
3667 					added = 1;
3668 				else
3669 					path->slots[0] = nritems;
3670 			}
3671 		}
3672 
3673 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3674 		if (WARN_ON(key.objectid != ref_objectid ||
3675 		    key.type != BTRFS_EXTENT_DATA_KEY))
3676 			break;
3677 
3678 		fi = btrfs_item_ptr(leaf, path->slots[0],
3679 				    struct btrfs_file_extent_item);
3680 
3681 		if (btrfs_file_extent_type(leaf, fi) ==
3682 		    BTRFS_FILE_EXTENT_INLINE)
3683 			goto next;
3684 
3685 		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3686 		    extent_key->objectid)
3687 			goto next;
3688 
3689 		key.offset -= btrfs_file_extent_offset(leaf, fi);
3690 		if (key.offset != ref_offset)
3691 			goto next;
3692 
3693 		if (counted)
3694 			ref_count--;
3695 		if (added)
3696 			goto next;
3697 
3698 		if (!tree_block_processed(leaf->start, rc)) {
3699 			block = kmalloc(sizeof(*block), GFP_NOFS);
3700 			if (!block) {
3701 				err = -ENOMEM;
3702 				break;
3703 			}
3704 			block->bytenr = leaf->start;
3705 			btrfs_item_key_to_cpu(leaf, &block->key, 0);
3706 			block->level = 0;
3707 			block->key_ready = 1;
3708 			rb_node = tree_insert(blocks, block->bytenr,
3709 					      &block->rb_node);
3710 			if (rb_node)
3711 				backref_tree_panic(rb_node, -EEXIST,
3712 						   block->bytenr);
3713 		}
3714 		if (counted)
3715 			added = 1;
3716 		else
3717 			path->slots[0] = nritems;
3718 next:
3719 		path->slots[0]++;
3720 
3721 	}
3722 out:
3723 	btrfs_free_path(path);
3724 	return err;
3725 }
3726 
3727 /*
3728  * helper to find all tree blocks that reference a given data extent
3729  */
3730 static noinline_for_stack
3731 int add_data_references(struct reloc_control *rc,
3732 			struct btrfs_key *extent_key,
3733 			struct btrfs_path *path,
3734 			struct rb_root *blocks)
3735 {
3736 	struct btrfs_key key;
3737 	struct extent_buffer *eb;
3738 	struct btrfs_extent_data_ref *dref;
3739 	struct btrfs_extent_inline_ref *iref;
3740 	unsigned long ptr;
3741 	unsigned long end;
3742 	u32 blocksize = rc->extent_root->fs_info->nodesize;
3743 	int ret = 0;
3744 	int err = 0;
3745 
3746 	eb = path->nodes[0];
3747 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3748 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3749 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3750 	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3751 		ptr = end;
3752 	else
3753 #endif
3754 		ptr += sizeof(struct btrfs_extent_item);
3755 
3756 	while (ptr < end) {
3757 		iref = (struct btrfs_extent_inline_ref *)ptr;
3758 		key.type = btrfs_extent_inline_ref_type(eb, iref);
3759 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3760 			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3761 			ret = __add_tree_block(rc, key.offset, blocksize,
3762 					       blocks);
3763 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3764 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3765 			ret = find_data_references(rc, extent_key,
3766 						   eb, dref, blocks);
3767 		} else {
3768 			BUG();
3769 		}
3770 		if (ret) {
3771 			err = ret;
3772 			goto out;
3773 		}
3774 		ptr += btrfs_extent_inline_ref_size(key.type);
3775 	}
3776 	WARN_ON(ptr > end);
3777 
3778 	while (1) {
3779 		cond_resched();
3780 		eb = path->nodes[0];
3781 		if (path->slots[0] >= btrfs_header_nritems(eb)) {
3782 			ret = btrfs_next_leaf(rc->extent_root, path);
3783 			if (ret < 0) {
3784 				err = ret;
3785 				break;
3786 			}
3787 			if (ret > 0)
3788 				break;
3789 			eb = path->nodes[0];
3790 		}
3791 
3792 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3793 		if (key.objectid != extent_key->objectid)
3794 			break;
3795 
3796 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3797 		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3798 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3799 #else
3800 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3801 		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3802 #endif
3803 			ret = __add_tree_block(rc, key.offset, blocksize,
3804 					       blocks);
3805 		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3806 			dref = btrfs_item_ptr(eb, path->slots[0],
3807 					      struct btrfs_extent_data_ref);
3808 			ret = find_data_references(rc, extent_key,
3809 						   eb, dref, blocks);
3810 		} else {
3811 			ret = 0;
3812 		}
3813 		if (ret) {
3814 			err = ret;
3815 			break;
3816 		}
3817 		path->slots[0]++;
3818 	}
3819 out:
3820 	btrfs_release_path(path);
3821 	if (err)
3822 		free_block_list(blocks);
3823 	return err;
3824 }
3825 
3826 /*
3827  * helper to find next unprocessed extent
3828  */
3829 static noinline_for_stack
3830 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3831 		     struct btrfs_key *extent_key)
3832 {
3833 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3834 	struct btrfs_key key;
3835 	struct extent_buffer *leaf;
3836 	u64 start, end, last;
3837 	int ret;
3838 
3839 	last = rc->block_group->key.objectid + rc->block_group->key.offset;
3840 	while (1) {
3841 		cond_resched();
3842 		if (rc->search_start >= last) {
3843 			ret = 1;
3844 			break;
3845 		}
3846 
3847 		key.objectid = rc->search_start;
3848 		key.type = BTRFS_EXTENT_ITEM_KEY;
3849 		key.offset = 0;
3850 
3851 		path->search_commit_root = 1;
3852 		path->skip_locking = 1;
3853 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3854 					0, 0);
3855 		if (ret < 0)
3856 			break;
3857 next:
3858 		leaf = path->nodes[0];
3859 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3860 			ret = btrfs_next_leaf(rc->extent_root, path);
3861 			if (ret != 0)
3862 				break;
3863 			leaf = path->nodes[0];
3864 		}
3865 
3866 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3867 		if (key.objectid >= last) {
3868 			ret = 1;
3869 			break;
3870 		}
3871 
3872 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3873 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3874 			path->slots[0]++;
3875 			goto next;
3876 		}
3877 
3878 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3879 		    key.objectid + key.offset <= rc->search_start) {
3880 			path->slots[0]++;
3881 			goto next;
3882 		}
3883 
3884 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3885 		    key.objectid + fs_info->nodesize <=
3886 		    rc->search_start) {
3887 			path->slots[0]++;
3888 			goto next;
3889 		}
3890 
3891 		ret = find_first_extent_bit(&rc->processed_blocks,
3892 					    key.objectid, &start, &end,
3893 					    EXTENT_DIRTY, NULL);
3894 
3895 		if (ret == 0 && start <= key.objectid) {
3896 			btrfs_release_path(path);
3897 			rc->search_start = end + 1;
3898 		} else {
3899 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3900 				rc->search_start = key.objectid + key.offset;
3901 			else
3902 				rc->search_start = key.objectid +
3903 					fs_info->nodesize;
3904 			memcpy(extent_key, &key, sizeof(key));
3905 			return 0;
3906 		}
3907 	}
3908 	btrfs_release_path(path);
3909 	return ret;
3910 }
3911 
3912 static void set_reloc_control(struct reloc_control *rc)
3913 {
3914 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3915 
3916 	mutex_lock(&fs_info->reloc_mutex);
3917 	fs_info->reloc_ctl = rc;
3918 	mutex_unlock(&fs_info->reloc_mutex);
3919 }
3920 
3921 static void unset_reloc_control(struct reloc_control *rc)
3922 {
3923 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3924 
3925 	mutex_lock(&fs_info->reloc_mutex);
3926 	fs_info->reloc_ctl = NULL;
3927 	mutex_unlock(&fs_info->reloc_mutex);
3928 }
3929 
3930 static int check_extent_flags(u64 flags)
3931 {
3932 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3933 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3934 		return 1;
3935 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3936 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3937 		return 1;
3938 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3939 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3940 		return 1;
3941 	return 0;
3942 }
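
/*
 * In other words, check_extent_flags() returns 1 for flag combinations
 * that are invalid on a well-formed extent item:
 *
 *   DATA and TREE_BLOCK both set   -> an extent is one or the other
 *   neither DATA nor TREE_BLOCK    -> exactly one of them must be set
 *   DATA with FULL_BACKREF         -> FULL_BACKREF applies to tree
 *                                     blocks only
 *
 * and 0 for everything else.
 */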
3943 
3944 static noinline_for_stack
3945 int prepare_to_relocate(struct reloc_control *rc)
3946 {
3947 	struct btrfs_trans_handle *trans;
3948 	int ret;
3949 
3950 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3951 					      BTRFS_BLOCK_RSV_TEMP);
3952 	if (!rc->block_rsv)
3953 		return -ENOMEM;
3954 
3955 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3956 	rc->search_start = rc->block_group->key.objectid;
3957 	rc->extents_found = 0;
3958 	rc->nodes_relocated = 0;
3959 	rc->merging_rsv_size = 0;
3960 	rc->reserved_bytes = 0;
3961 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3962 			      RELOCATION_RESERVED_NODES;
3963 	ret = btrfs_block_rsv_refill(rc->extent_root,
3964 				     rc->block_rsv, rc->block_rsv->size,
3965 				     BTRFS_RESERVE_FLUSH_ALL);
3966 	if (ret)
3967 		return ret;
3968 
3969 	rc->create_reloc_tree = 1;
3970 	set_reloc_control(rc);
3971 
3972 	trans = btrfs_join_transaction(rc->extent_root);
3973 	if (IS_ERR(trans)) {
3974 		unset_reloc_control(rc);
3975 		/*
3976 		 * The extent tree is not a ref_cow tree and has no reloc_root
3977 		 * to clean up.  And callers are responsible for freeing the
3978 		 * block rsv allocated above.
3979 		 */
3980 		return PTR_ERR(trans);
3981 	}
3982 	btrfs_commit_transaction(trans);
3983 	return 0;
3984 }
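
/*
 * The join + commit at the end of prepare_to_relocate() is presumably
 * what publishes the reloc control: any transaction that was already
 * running when set_reloc_control() was called may have COWed blocks
 * without recording them in reloc trees, so committing it guarantees
 * that every transaction from here on sees rc->create_reloc_tree.
 */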
3985 
3986 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3987 {
3988 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3989 	struct rb_root blocks = RB_ROOT;
3990 	struct btrfs_key key;
3991 	struct btrfs_trans_handle *trans = NULL;
3992 	struct btrfs_path *path;
3993 	struct btrfs_extent_item *ei;
3994 	u64 flags;
3995 	u32 item_size;
3996 	int ret;
3997 	int err = 0;
3998 	int progress = 0;
3999 
4000 	path = btrfs_alloc_path();
4001 	if (!path)
4002 		return -ENOMEM;
4003 	path->reada = READA_FORWARD;
4004 
4005 	ret = prepare_to_relocate(rc);
4006 	if (ret) {
4007 		err = ret;
4008 		goto out_free;
4009 	}
4010 
4011 	while (1) {
4012 		rc->reserved_bytes = 0;
4013 		ret = btrfs_block_rsv_refill(rc->extent_root,
4014 					rc->block_rsv, rc->block_rsv->size,
4015 					BTRFS_RESERVE_FLUSH_ALL);
4016 		if (ret) {
4017 			err = ret;
4018 			break;
4019 		}
4020 		progress++;
4021 		trans = btrfs_start_transaction(rc->extent_root, 0);
4022 		if (IS_ERR(trans)) {
4023 			err = PTR_ERR(trans);
4024 			trans = NULL;
4025 			break;
4026 		}
4027 restart:
4028 		if (update_backref_cache(trans, &rc->backref_cache)) {
4029 			btrfs_end_transaction(trans);
4030 			continue;
4031 		}
4032 
4033 		ret = find_next_extent(rc, path, &key);
4034 		if (ret < 0)
4035 			err = ret;
4036 		if (ret != 0)
4037 			break;
4038 
4039 		rc->extents_found++;
4040 
4041 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4042 				    struct btrfs_extent_item);
4043 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4044 		if (item_size >= sizeof(*ei)) {
4045 			flags = btrfs_extent_flags(path->nodes[0], ei);
4046 			ret = check_extent_flags(flags);
4047 			BUG_ON(ret);
4048 
4049 		} else {
4050 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4051 			u64 ref_owner;
4052 			int path_change = 0;
4053 
4054 			BUG_ON(item_size !=
4055 			       sizeof(struct btrfs_extent_item_v0));
4056 			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
4057 						  &path_change);
4058 			if (ret < 0) {
4059 				err = ret;
4060 				break;
4061 			}
4062 			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
4063 				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
4064 			else
4065 				flags = BTRFS_EXTENT_FLAG_DATA;
4066 
4067 			if (path_change) {
4068 				btrfs_release_path(path);
4069 
4070 				path->search_commit_root = 1;
4071 				path->skip_locking = 1;
4072 				ret = btrfs_search_slot(NULL, rc->extent_root,
4073 							&key, path, 0, 0);
4074 				if (ret < 0) {
4075 					err = ret;
4076 					break;
4077 				}
4078 				BUG_ON(ret > 0);
4079 			}
4080 #else
4081 			BUG();
4082 #endif
4083 		}
4084 
4085 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4086 			ret = add_tree_block(rc, &key, path, &blocks);
4087 		} else if (rc->stage == UPDATE_DATA_PTRS &&
4088 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
4089 			ret = add_data_references(rc, &key, path, &blocks);
4090 		} else {
4091 			btrfs_release_path(path);
4092 			ret = 0;
4093 		}
4094 		if (ret < 0) {
4095 			err = ret;
4096 			break;
4097 		}
4098 
4099 		if (!RB_EMPTY_ROOT(&blocks)) {
4100 			ret = relocate_tree_blocks(trans, rc, &blocks);
4101 			if (ret < 0) {
4102 				/*
4103 				 * if we fail to relocate tree blocks, force the backref
4104 				 * cache to be updated when the transaction is committed.
4105 				 */
4106 				rc->backref_cache.last_trans = trans->transid - 1;
4107 
4108 				if (ret != -EAGAIN) {
4109 					err = ret;
4110 					break;
4111 				}
4112 				rc->extents_found--;
4113 				rc->search_start = key.objectid;
4114 			}
4115 		}
4116 
4117 		btrfs_end_transaction_throttle(trans);
4118 		btrfs_btree_balance_dirty(fs_info);
4119 		trans = NULL;
4120 
4121 		if (rc->stage == MOVE_DATA_EXTENTS &&
4122 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4123 			rc->found_file_extent = 1;
4124 			ret = relocate_data_extent(rc->data_inode,
4125 						   &key, &rc->cluster);
4126 			if (ret < 0) {
4127 				err = ret;
4128 				break;
4129 			}
4130 		}
4131 	}
4132 	if (trans && progress && err == -ENOSPC) {
4133 		ret = btrfs_force_chunk_alloc(trans, fs_info,
4134 					      rc->block_group->flags);
4135 		if (ret == 1) {
4136 			err = 0;
4137 			progress = 0;
4138 			goto restart;
4139 		}
4140 	}
4141 
4142 	btrfs_release_path(path);
4143 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4144 
4145 	if (trans) {
4146 		btrfs_end_transaction_throttle(trans);
4147 		btrfs_btree_balance_dirty(fs_info);
4148 	}
4149 
4150 	if (!err) {
4151 		ret = relocate_file_extent_cluster(rc->data_inode,
4152 						   &rc->cluster);
4153 		if (ret < 0)
4154 			err = ret;
4155 	}
4156 
4157 	rc->create_reloc_tree = 0;
4158 	set_reloc_control(rc);
4159 
4160 	backref_cache_cleanup(&rc->backref_cache);
4161 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4162 
4163 	err = prepare_to_merge(rc, err);
4164 
4165 	merge_reloc_roots(rc);
4166 
4167 	rc->merge_reloc_tree = 0;
4168 	unset_reloc_control(rc);
4169 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4170 
4171 	/* get rid of pinned extents */
4172 	trans = btrfs_join_transaction(rc->extent_root);
4173 	if (IS_ERR(trans)) {
4174 		err = PTR_ERR(trans);
4175 		goto out_free;
4176 	}
4177 	btrfs_commit_transaction(trans);
4178 out_free:
4179 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
4180 	btrfs_free_path(path);
4181 	return err;
4182 }
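
/*
 * Note the -ENOSPC retry above: if at least one extent was processed
 * in this pass (progress != 0), a reservation failure is first
 * answered by forcing allocation of a new chunk with the block
 * group's flags; only when that cannot help does the error propagate
 * to the caller.
 */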
4183 
4184 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4185 				 struct btrfs_root *root, u64 objectid)
4186 {
4187 	struct btrfs_path *path;
4188 	struct btrfs_inode_item *item;
4189 	struct extent_buffer *leaf;
4190 	int ret;
4191 
4192 	path = btrfs_alloc_path();
4193 	if (!path)
4194 		return -ENOMEM;
4195 
4196 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4197 	if (ret)
4198 		goto out;
4199 
4200 	leaf = path->nodes[0];
4201 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4202 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4203 	btrfs_set_inode_generation(leaf, item, 1);
4204 	btrfs_set_inode_size(leaf, item, 0);
4205 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4206 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4207 					  BTRFS_INODE_PREALLOC);
4208 	btrfs_mark_buffer_dirty(leaf);
4209 out:
4210 	btrfs_free_path(path);
4211 	return ret;
4212 }
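
/*
 * The relocation inode created above deliberately looks like a plain
 * preallocated file: S_IFREG | 0600 with BTRFS_INODE_NOCOMPRESS and
 * BTRFS_INODE_PREALLOC set, so the copied data is never compressed
 * and writes land in the preallocated extents that mirror the block
 * group being relocated.  The link count stays zero; the inode only
 * ever exists on the orphan list.
 */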
4213 
4214 /*
4215  * helper to create an inode for data relocation.
4216  * the inode lives in the data relocation tree and its link count is 0.
4217  */
4218 static noinline_for_stack
4219 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4220 				 struct btrfs_block_group_cache *group)
4221 {
4222 	struct inode *inode = NULL;
4223 	struct btrfs_trans_handle *trans;
4224 	struct btrfs_root *root;
4225 	struct btrfs_key key;
4226 	u64 objectid;
4227 	int err = 0;
4228 
4229 	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4230 	if (IS_ERR(root))
4231 		return ERR_CAST(root);
4232 
4233 	trans = btrfs_start_transaction(root, 6);
4234 	if (IS_ERR(trans))
4235 		return ERR_CAST(trans);
4236 
4237 	err = btrfs_find_free_objectid(root, &objectid);
4238 	if (err)
4239 		goto out;
4240 
4241 	err = __insert_orphan_inode(trans, root, objectid);
4242 	BUG_ON(err);
4243 
4244 	key.objectid = objectid;
4245 	key.type = BTRFS_INODE_ITEM_KEY;
4246 	key.offset = 0;
4247 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4248 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4249 	BTRFS_I(inode)->index_cnt = group->key.objectid;
4250 
4251 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4252 out:
4253 	btrfs_end_transaction(trans);
4254 	btrfs_btree_balance_dirty(fs_info);
4255 	if (err) {
4256 		if (inode)
4257 			iput(inode);
4258 		inode = ERR_PTR(err);
4259 	}
4260 	return inode;
4261 }
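
/*
 * BTRFS_I(inode)->index_cnt is reused above to remember the start
 * bytenr of the block group: file offset X in the reloc inode maps to
 * disk bytenr X + index_cnt.  btrfs_reloc_clone_csums() below depends
 * on exactly this mapping when it computes
 * disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt.
 */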
4262 
4263 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4264 {
4265 	struct reloc_control *rc;
4266 
4267 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
4268 	if (!rc)
4269 		return NULL;
4270 
4271 	INIT_LIST_HEAD(&rc->reloc_roots);
4272 	backref_cache_init(&rc->backref_cache);
4273 	mapping_tree_init(&rc->reloc_root_tree);
4274 	extent_io_tree_init(&rc->processed_blocks, NULL);
4275 	return rc;
4276 }
4277 
4278 /*
4279  * Print the block group being relocated
4280  */
4281 static void describe_relocation(struct btrfs_fs_info *fs_info,
4282 				struct btrfs_block_group_cache *block_group)
4283 {
4284 	char buf[128];		/* prefixed by a '|' that'll be dropped */
4285 	u64 flags = block_group->flags;
4286 
4287 	/* Shouldn't happen */
4288 	if (!flags) {
4289 		strcpy(buf, "|NONE");
4290 	} else {
4291 		char *bp = buf;
4292 
4293 #define DESCRIBE_FLAG(f, d) \
4294 		if (flags & BTRFS_BLOCK_GROUP_##f) { \
4295 			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
4296 			flags &= ~BTRFS_BLOCK_GROUP_##f; \
4297 		}
4298 		DESCRIBE_FLAG(DATA,     "data");
4299 		DESCRIBE_FLAG(SYSTEM,   "system");
4300 		DESCRIBE_FLAG(METADATA, "metadata");
4301 		DESCRIBE_FLAG(RAID0,    "raid0");
4302 		DESCRIBE_FLAG(RAID1,    "raid1");
4303 		DESCRIBE_FLAG(DUP,      "dup");
4304 		DESCRIBE_FLAG(RAID10,   "raid10");
4305 		DESCRIBE_FLAG(RAID5,    "raid5");
4306 		DESCRIBE_FLAG(RAID6,    "raid6");
4307 		if (flags)
4308 			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
4309 #undef DESCRIBE_FLAG
4310 	}
4311 
4312 	btrfs_info(fs_info,
4313 		   "relocating block group %llu flags %s",
4314 		   block_group->key.objectid, buf + 1);
4315 }
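
/*
 * With the flag names above, relocating a plain data block group would
 * log something like (values illustrative only):
 *
 *   BTRFS info (device sda1): relocating block group 13631488 flags data
 *
 * buf + 1 skips the leading '|' that every DESCRIBE_FLAG() emits.
 */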
4316 
4317 /*
4318  * function to relocate all extents in a block group.
4319  */
4320 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4321 {
4322 	struct btrfs_root *extent_root = fs_info->extent_root;
4323 	struct reloc_control *rc;
4324 	struct inode *inode;
4325 	struct btrfs_path *path;
4326 	int ret;
4327 	int rw = 0;
4328 	int err = 0;
4329 
4330 	rc = alloc_reloc_control(fs_info);
4331 	if (!rc)
4332 		return -ENOMEM;
4333 
4334 	rc->extent_root = extent_root;
4335 
4336 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4337 	BUG_ON(!rc->block_group);
4338 
4339 	ret = btrfs_inc_block_group_ro(fs_info, rc->block_group);
4340 	if (ret) {
4341 		err = ret;
4342 		goto out;
4343 	}
4344 	rw = 1;
4345 
4346 	path = btrfs_alloc_path();
4347 	if (!path) {
4348 		err = -ENOMEM;
4349 		goto out;
4350 	}
4351 
4352 	inode = lookup_free_space_inode(fs_info, rc->block_group, path);
4353 	btrfs_free_path(path);
4354 
4355 	if (!IS_ERR(inode))
4356 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4357 	else
4358 		ret = PTR_ERR(inode);
4359 
4360 	if (ret && ret != -ENOENT) {
4361 		err = ret;
4362 		goto out;
4363 	}
4364 
4365 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4366 	if (IS_ERR(rc->data_inode)) {
4367 		err = PTR_ERR(rc->data_inode);
4368 		rc->data_inode = NULL;
4369 		goto out;
4370 	}
4371 
4372 	describe_relocation(fs_info, rc->block_group);
4373 
4374 	btrfs_wait_block_group_reservations(rc->block_group);
4375 	btrfs_wait_nocow_writers(rc->block_group);
4376 	btrfs_wait_ordered_roots(fs_info, U64_MAX,
4377 				 rc->block_group->key.objectid,
4378 				 rc->block_group->key.offset);
4379 
4380 	while (1) {
4381 		mutex_lock(&fs_info->cleaner_mutex);
4382 		ret = relocate_block_group(rc);
4383 		mutex_unlock(&fs_info->cleaner_mutex);
4384 		if (ret < 0) {
4385 			err = ret;
4386 			goto out;
4387 		}
4388 
4389 		if (rc->extents_found == 0)
4390 			break;
4391 
4392 		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4393 
4394 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4395 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4396 						       (u64)-1);
4397 			if (ret) {
4398 				err = ret;
4399 				goto out;
4400 			}
4401 			invalidate_mapping_pages(rc->data_inode->i_mapping,
4402 						 0, -1);
4403 			rc->stage = UPDATE_DATA_PTRS;
4404 		}
4405 	}
4406 
4407 	WARN_ON(rc->block_group->pinned > 0);
4408 	WARN_ON(rc->block_group->reserved > 0);
4409 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4410 out:
4411 	if (err && rw)
4412 		btrfs_dec_block_group_ro(rc->block_group);
4413 	iput(rc->data_inode);
4414 	btrfs_put_block_group(rc->block_group);
4415 	kfree(rc);
4416 	return err;
4417 }
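
/*
 * To summarize the loop above: relocating a data block group runs in
 * two stages.  MOVE_DATA_EXTENTS copies live data into the reloc
 * inode; once that pass has found file extents, ordered extents are
 * flushed, the page cache is invalidated and the stage flips to
 * UPDATE_DATA_PTRS, which rewrites the file extent items to point at
 * the new locations.  The outer loop only ends when a pass finds no
 * extents left in the group.
 */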
4418 
4419 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4420 {
4421 	struct btrfs_fs_info *fs_info = root->fs_info;
4422 	struct btrfs_trans_handle *trans;
4423 	int ret, err;
4424 
4425 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
4426 	if (IS_ERR(trans))
4427 		return PTR_ERR(trans);
4428 
4429 	memset(&root->root_item.drop_progress, 0,
4430 		sizeof(root->root_item.drop_progress));
4431 	root->root_item.drop_level = 0;
4432 	btrfs_set_root_refs(&root->root_item, 0);
4433 	ret = btrfs_update_root(trans, fs_info->tree_root,
4434 				&root->root_key, &root->root_item);
4435 
4436 	err = btrfs_end_transaction(trans);
4437 	if (err)
4438 		return err;
4439 	return ret;
4440 }
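
/*
 * mark_garbage_root() is used below when the fs tree a reloc root
 * belongs to no longer exists: zeroing drop_progress and root_refs
 * marks the reloc root dead on disk, so recovery simply drops it
 * instead of trying to merge it back.
 */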
4441 
4442 /*
4443  * recover relocation interrupted by a system crash.
4444  *
4445  * this function resumes merging reloc trees with the corresponding fs trees.
4446  * this is important for preserving the sharing of tree blocks.
4447  */
4448 int btrfs_recover_relocation(struct btrfs_root *root)
4449 {
4450 	struct btrfs_fs_info *fs_info = root->fs_info;
4451 	LIST_HEAD(reloc_roots);
4452 	struct btrfs_key key;
4453 	struct btrfs_root *fs_root;
4454 	struct btrfs_root *reloc_root;
4455 	struct btrfs_path *path;
4456 	struct extent_buffer *leaf;
4457 	struct reloc_control *rc = NULL;
4458 	struct btrfs_trans_handle *trans;
4459 	int ret;
4460 	int err = 0;
4461 
4462 	path = btrfs_alloc_path();
4463 	if (!path)
4464 		return -ENOMEM;
4465 	path->reada = READA_BACK;
4466 
4467 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4468 	key.type = BTRFS_ROOT_ITEM_KEY;
4469 	key.offset = (u64)-1;
4470 
4471 	while (1) {
4472 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4473 					path, 0, 0);
4474 		if (ret < 0) {
4475 			err = ret;
4476 			goto out;
4477 		}
4478 		if (ret > 0) {
4479 			if (path->slots[0] == 0)
4480 				break;
4481 			path->slots[0]--;
4482 		}
4483 		leaf = path->nodes[0];
4484 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4485 		btrfs_release_path(path);
4486 
4487 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4488 		    key.type != BTRFS_ROOT_ITEM_KEY)
4489 			break;
4490 
4491 		reloc_root = btrfs_read_fs_root(root, &key);
4492 		if (IS_ERR(reloc_root)) {
4493 			err = PTR_ERR(reloc_root);
4494 			goto out;
4495 		}
4496 
4497 		list_add(&reloc_root->root_list, &reloc_roots);
4498 
4499 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4500 			fs_root = read_fs_root(fs_info,
4501 					       reloc_root->root_key.offset);
4502 			if (IS_ERR(fs_root)) {
4503 				ret = PTR_ERR(fs_root);
4504 				if (ret != -ENOENT) {
4505 					err = ret;
4506 					goto out;
4507 				}
4508 				ret = mark_garbage_root(reloc_root);
4509 				if (ret < 0) {
4510 					err = ret;
4511 					goto out;
4512 				}
4513 			}
4514 		}
4515 
4516 		if (key.offset == 0)
4517 			break;
4518 
4519 		key.offset--;
4520 	}
4521 	btrfs_release_path(path);
4522 
4523 	if (list_empty(&reloc_roots))
4524 		goto out;
4525 
4526 	rc = alloc_reloc_control(fs_info);
4527 	if (!rc) {
4528 		err = -ENOMEM;
4529 		goto out;
4530 	}
4531 
4532 	rc->extent_root = fs_info->extent_root;
4533 
4534 	set_reloc_control(rc);
4535 
4536 	trans = btrfs_join_transaction(rc->extent_root);
4537 	if (IS_ERR(trans)) {
4538 		unset_reloc_control(rc);
4539 		err = PTR_ERR(trans);
4540 		goto out_free;
4541 	}
4542 
4543 	rc->merge_reloc_tree = 1;
4544 
4545 	while (!list_empty(&reloc_roots)) {
4546 		reloc_root = list_entry(reloc_roots.next,
4547 					struct btrfs_root, root_list);
4548 		list_del(&reloc_root->root_list);
4549 
4550 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4551 			list_add_tail(&reloc_root->root_list,
4552 				      &rc->reloc_roots);
4553 			continue;
4554 		}
4555 
4556 		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4557 		if (IS_ERR(fs_root)) {
4558 			err = PTR_ERR(fs_root);
4559 			goto out_free;
4560 		}
4561 
4562 		err = __add_reloc_root(reloc_root);
4563 		BUG_ON(err < 0); /* -ENOMEM or logic error */
4564 		fs_root->reloc_root = reloc_root;
4565 	}
4566 
4567 	err = btrfs_commit_transaction(trans);
4568 	if (err)
4569 		goto out_free;
4570 
4571 	merge_reloc_roots(rc);
4572 
4573 	unset_reloc_control(rc);
4574 
4575 	trans = btrfs_join_transaction(rc->extent_root);
4576 	if (IS_ERR(trans)) {
4577 		err = PTR_ERR(trans);
4578 		goto out_free;
4579 	}
4580 	err = btrfs_commit_transaction(trans);
4581 out_free:
4582 	kfree(rc);
4583 out:
4584 	if (!list_empty(&reloc_roots))
4585 		free_reloc_roots(&reloc_roots);
4586 
4587 	btrfs_free_path(path);
4588 
4589 	if (err == 0) {
4590 		/* clean up the orphan inode in the data relocation tree */
4591 		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4592 		if (IS_ERR(fs_root))
4593 			err = PTR_ERR(fs_root);
4594 		else
4595 			err = btrfs_orphan_cleanup(fs_root);
4596 	}
4597 	return err;
4598 }
4599 
4600 /*
4601  * helper to add ordered checksums for data relocation.
4602  *
4603  * cloning the existing checksums properly handles nodatasum extents,
4604  * and it also saves the CPU time of recalculating the checksums.
4605  */
4606 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4607 {
4608 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4609 	struct btrfs_ordered_sum *sums;
4610 	struct btrfs_ordered_extent *ordered;
4611 	int ret;
4612 	u64 disk_bytenr;
4613 	u64 new_bytenr;
4614 	LIST_HEAD(list);
4615 
4616 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4617 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4618 
4619 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4620 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4621 				       disk_bytenr + len - 1, &list, 0);
4622 	if (ret)
4623 		goto out;
4624 
4625 	while (!list_empty(&list)) {
4626 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4627 		list_del_init(&sums->list);
4628 
4629 		/*
4630 		 * We need to offset the new_bytenr based on where the csum is.
4631 		 * We need to do this because we will read in entire prealloc
4632 		 * extents but we may have written to, say, the middle of a
4633 		 * prealloc extent, so we need to make sure the csum goes with
4634 		 * the right disk offset.
4635 		 *
4636 		 * We can do this because the data reloc inode refers strictly
4637 		 * to the on-disk bytes, so we don't have to worry about
4638 		 * disk_len vs real len as with regular inodes, since it's all
4639 		 * disk length.
4640 		 */
4641 		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4642 		sums->bytenr = new_bytenr;
4643 
4644 		btrfs_add_ordered_sum(inode, ordered, sums);
4645 	}
4646 out:
4647 	btrfs_put_ordered_extent(ordered);
4648 	return ret;
4649 }
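
/*
 * A worked example of the offset math above (numbers made up): if
 * disk_bytenr is 1M and the ordered extent starts at 5M, then a csum
 * entry found at sums->bytenr == 1M + 64K covers data 64K into the
 * extent, so it is rewritten to 5M + 64K, i.e.
 * ordered->start + (sums->bytenr - disk_bytenr).
 */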
4650 
4651 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4652 			  struct btrfs_root *root, struct extent_buffer *buf,
4653 			  struct extent_buffer *cow)
4654 {
4655 	struct btrfs_fs_info *fs_info = root->fs_info;
4656 	struct reloc_control *rc;
4657 	struct backref_node *node;
4658 	int first_cow = 0;
4659 	int level;
4660 	int ret = 0;
4661 
4662 	rc = fs_info->reloc_ctl;
4663 	if (!rc)
4664 		return 0;
4665 
4666 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4667 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4668 
4669 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4670 		if (buf == root->node)
4671 			__update_reloc_root(root, cow->start);
4672 	}
4673 
4674 	level = btrfs_header_level(buf);
4675 	if (btrfs_header_generation(buf) <=
4676 	    btrfs_root_last_snapshot(&root->root_item))
4677 		first_cow = 1;
4678 
4679 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4680 	    rc->create_reloc_tree) {
4681 		WARN_ON(!first_cow && level == 0);
4682 
4683 		node = rc->backref_cache.path[level];
4684 		BUG_ON(node->bytenr != buf->start &&
4685 		       node->new_bytenr != buf->start);
4686 
4687 		drop_node_buffer(node);
4688 		extent_buffer_get(cow);
4689 		node->eb = cow;
4690 		node->new_bytenr = cow->start;
4691 
4692 		if (!node->pending) {
4693 			list_move_tail(&node->list,
4694 				       &rc->backref_cache.pending[level]);
4695 			node->pending = 1;
4696 		}
4697 
4698 		if (first_cow)
4699 			__mark_block_processed(rc, node);
4700 
4701 		if (first_cow && level > 0)
4702 			rc->nodes_relocated += buf->len;
4703 	}
4704 
4705 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4706 		ret = replace_file_extents(trans, rc, root, cow);
4707 	return ret;
4708 }
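
/*
 * btrfs_reloc_cow_block() is the hook from the generic COW path: when
 * a block tracked by the backref cache is COWed, its cache node moves
 * to the per-level pending list so upper level pointers can be fixed
 * up later, and on the first COW since the last snapshot the block is
 * marked processed and (for non-leaf blocks) counted into
 * nodes_relocated, which feeds the snapshot reservation below.
 */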
4709 
4710 /*
4711  * called before creating a snapshot. it calculates the metadata
4712  * reservation required for relocating tree blocks in the snapshot.
4713  */
4714 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4715 			      u64 *bytes_to_reserve)
4716 {
4717 	struct btrfs_root *root;
4718 	struct reloc_control *rc;
4719 
4720 	root = pending->root;
4721 	if (!root->reloc_root)
4722 		return;
4723 
4724 	rc = root->fs_info->reloc_ctl;
4725 	if (!rc->merge_reloc_tree)
4726 		return;
4727 
4728 	root = root->reloc_root;
4729 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4730 	/*
4731 	 * relocation is in the stage of merging trees. in the worst
4732 	 * case the space used by merging a reloc tree is twice the
4733 	 * size of the relocated tree nodes: half for cowing the
4734 	 * reloc tree, half for cowing the fs tree. the space used
4735 	 * by cowing the reloc tree will be freed after the tree is
4736 	 * dropped. but if we create a snapshot, cowing the fs tree
4737 	 * may use more space than it frees, so we need to reserve
4738 	 * extra space.
4739 	 */
4740 	*bytes_to_reserve += rc->nodes_relocated;
4741 }
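
/*
 * Reservation example for the above (illustrative numbers): with a
 * 16K nodesize and roughly 1000 relocated tree nodes, nodes_relocated
 * comes to about 16M, so creating a snapshot while merging adds an
 * extra 16M to the pending snapshot's reservation to cover the
 * additional fs tree COWs.
 */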
4742 
4743 /*
4744  * called after the snapshot is created. migrate the block reservation
4745  * and create a reloc root for the newly created snapshot.
4746  */
4747 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4748 			       struct btrfs_pending_snapshot *pending)
4749 {
4750 	struct btrfs_root *root = pending->root;
4751 	struct btrfs_root *reloc_root;
4752 	struct btrfs_root *new_root;
4753 	struct reloc_control *rc;
4754 	int ret;
4755 
4756 	if (!root->reloc_root)
4757 		return 0;
4758 
4759 	rc = root->fs_info->reloc_ctl;
4760 	rc->merging_rsv_size += rc->nodes_relocated;
4761 
4762 	if (rc->merge_reloc_tree) {
4763 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4764 					      rc->block_rsv,
4765 					      rc->nodes_relocated, 1);
4766 		if (ret)
4767 			return ret;
4768 	}
4769 
4770 	new_root = pending->snap;
4771 	reloc_root = create_reloc_root(trans, root->reloc_root,
4772 				       new_root->root_key.objectid);
4773 	if (IS_ERR(reloc_root))
4774 		return PTR_ERR(reloc_root);
4775 
4776 	ret = __add_reloc_root(reloc_root);
4777 	BUG_ON(ret < 0);
4778 	new_root->reloc_root = reloc_root;
4779 
4780 	if (rc->create_reloc_tree)
4781 		ret = clone_backref_node(trans, rc, root, reloc_root);
4782 	return ret;
4783 }
4784