xref: /linux/fs/btrfs/ctree.c (revision 071bf69a0220253a44acb8b2a27f7a262b9a46bf)
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
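
/*
 * Example usage (editor's sketch, not part of the original source): a
 * caller that must not schedule while holding spinning locks wraps the
 * sleeping operation between the two path helpers, e.g.
 *
 *	btrfs_set_path_blocking(path);
 *	tm = kzalloc(sizeof(*tm), GFP_NOFS);	(kzalloc may sleep)
 *	btrfs_clear_path_blocking(path, NULL, 0);
 *
 * This mirrors the pattern used by the search and rewind code below.
 */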

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent
 * buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A read
 * locked buffer is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cow-only roots (everything that is not a reference counted cow
 * subvolume) just get put onto a simple dirty list.  transaction.c
 * walks this to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&root->fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &root->fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
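
/*
 * Illustration (editor's note, not in the original source): removing the
 * key at slot 3 of a node at logical address X would be logged roughly as
 *
 *	tm->logical	= X;
 *	tm->op		= MOD_LOG_KEY_REMOVE;
 *	tm->slot	= 3;
 *	tm->key, tm->blockptr, tm->generation = the removed pointer's data;
 *
 * which matches what alloc_tree_mod_elem() below fills in.
 */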

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
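
/*
 * Typical usage (editor's sketch, assuming the SEQ_LIST_INIT initializer
 * from ctree.h):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... inspect tree state as of elem.seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the element is on the blocker list, log entries it may still need
 * are protected from removal (see btrfs_put_tree_mod_seq below).
 */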

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * the lock until all tree mod log insertions are recorded in the rb tree
 * and then call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(root->fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root->fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
						eb->len);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(root, logical, 0);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(root->fs_info,
				"failed to read tree block %llu from get_old_root", logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
					root->nodesize);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
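
/*
 * Concrete example (editor's note): a block whose header generation equals
 * trans->transid and that has not been written out yet can be modified in
 * place; a block carried over from an earlier transaction must be COWed
 * first so the committed tree on disk stays intact.
 */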

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
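
/*
 * Editor's note on the search_start hint above: the mask rounds the
 * buffer's logical address down to a 1GiB boundary, e.g.
 *
 *	buf->start   = 0x48000000 (1.125 GiB)
 *	search_start = 0x48000000 & ~(SZ_1G - 1) = 0x40000000
 *
 * so the COW copy is allocated near where its source lives.
 */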

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
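
/*
 * Example (editor's note): with a 16KiB blocksize, blocks at 0x0 and
 * 0x8000 are close (0x8000 - 0x4000 = 16KiB, which is under the 32KiB
 * threshold), while blocks at 0x0 and 0x20000 are not
 * (0x20000 - 0x4000 = 112KiB).
 */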

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
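
/*
 * Example (editor's note): comparison is lexicographic over
 * (objectid, type, offset), so
 *
 *	{ .objectid = 256, .type = 1, .offset = 0 }
 *
 * sorts before { .objectid = 256, .type = 1, .offset = 4096 }, which in
 * turn sorts before { .objectid = 257, .type = 0, .offset = 0 }.
 */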

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = root->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root->fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr, gen);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the end of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
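
/*
 * Editor's illustration of the leaf layout the helper above relies on:
 *
 *	[header][item 0][item 1]...[free space]...[data 1][data 0]
 *
 * items grow from the front of the leaf, their data grows backwards from
 * the end, and the data offset of the last item marks the end of the
 * free space.
 */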


/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
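
/*
 * Editor's note on the return convention: for keys {2, 5, 9}, searching
 * for 5 returns 0 with *slot = 1 (exact match), while searching for 6
 * returns 1 with *slot = 2, the position where 6 would be inserted.
 */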
1823 
1824 /*
1825  * simple bin_search frontend that does the right thing for
1826  * leaves vs nodes
1827  */
1828 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1829 		      int level, int *slot)
1830 {
1831 	if (level == 0)
1832 		return generic_bin_search(eb,
1833 					  offsetof(struct btrfs_leaf, items),
1834 					  sizeof(struct btrfs_item),
1835 					  key, btrfs_header_nritems(eb),
1836 					  slot);
1837 	else
1838 		return generic_bin_search(eb,
1839 					  offsetof(struct btrfs_node, ptrs),
1840 					  sizeof(struct btrfs_key_ptr),
1841 					  key, btrfs_header_nritems(eb),
1842 					  slot);
1843 }
1844 
1845 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1846 		     int level, int *slot)
1847 {
1848 	return bin_search(eb, key, level, slot);
1849 }
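
/*
 * Editorial sketch (hypothetical helper): typical btrfs_bin_search() use
 * while walking down a node.  On an exact match (ret == 0) *slot is the
 * matching key; on a miss (ret == 1) *slot is the insertion point, so
 * descending callers step back one slot to follow the pointer that covers
 * the search key, exactly as btrfs_search_slot() does below.
 */
static int example_pick_child_slot(struct extent_buffer *node,
				   struct btrfs_key *key, int *slot)
{
	int ret;

	ret = btrfs_bin_search(node, key, btrfs_header_level(node), slot);
	if (ret < 0)
		return ret;
	if (ret && *slot > 0)
		(*slot)--;	/* follow the pointer covering the key */
	return 0;
}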
1850 
1851 static void root_add_used(struct btrfs_root *root, u32 size)
1852 {
1853 	spin_lock(&root->accounting_lock);
1854 	btrfs_set_root_used(&root->root_item,
1855 			    btrfs_root_used(&root->root_item) + size);
1856 	spin_unlock(&root->accounting_lock);
1857 }
1858 
1859 static void root_sub_used(struct btrfs_root *root, u32 size)
1860 {
1861 	spin_lock(&root->accounting_lock);
1862 	btrfs_set_root_used(&root->root_item,
1863 			    btrfs_root_used(&root->root_item) - size);
1864 	spin_unlock(&root->accounting_lock);
1865 }
1866 
1867 /* given a node and slot number, this reads the block it points to.  The
1868  * extent buffer is returned with a reference taken (but unlocked).
1869  */
1870 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1871 				   struct extent_buffer *parent, int slot)
1872 {
1873 	int level = btrfs_header_level(parent);
1874 	struct extent_buffer *eb;
1875 
1876 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1877 		return ERR_PTR(-ENOENT);
1878 
1879 	BUG_ON(level == 0);
1880 
1881 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1882 			     btrfs_node_ptr_generation(parent, slot));
1883 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1884 		free_extent_buffer(eb);
1885 		eb = ERR_PTR(-EIO);
1886 	}
1887 
1888 	return eb;
1889 }
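
/*
 * Editorial sketch of the read_node_slot() calling convention: it never
 * returns NULL, so callers test with IS_ERR() and must drop the extra
 * reference with free_extent_buffer() when done.  The helper name is
 * hypothetical.
 */
static int example_visit_child(struct btrfs_root *root,
			       struct extent_buffer *parent, int slot)
{
	struct extent_buffer *eb;

	eb = read_node_slot(root, parent, slot);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/* eb is referenced but unlocked here; lock it before modifying */
	free_extent_buffer(eb);
	return 0;
}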
1890 
1891 /*
1892  * node level balancing, used to make sure nodes are in proper order for
1893  * item deletion.  We balance from the top down, so we have to make sure
1894  * that a deletion won't leave a node completely empty later on.
1895  */
1896 static noinline int balance_level(struct btrfs_trans_handle *trans,
1897 			 struct btrfs_root *root,
1898 			 struct btrfs_path *path, int level)
1899 {
1900 	struct extent_buffer *right = NULL;
1901 	struct extent_buffer *mid;
1902 	struct extent_buffer *left = NULL;
1903 	struct extent_buffer *parent = NULL;
1904 	int ret = 0;
1905 	int wret;
1906 	int pslot;
1907 	int orig_slot = path->slots[level];
1908 	u64 orig_ptr;
1909 
1910 	if (level == 0)
1911 		return 0;
1912 
1913 	mid = path->nodes[level];
1914 
1915 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1916 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1917 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1918 
1919 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1920 
1921 	if (level < BTRFS_MAX_LEVEL - 1) {
1922 		parent = path->nodes[level + 1];
1923 		pslot = path->slots[level + 1];
1924 	}
1925 
1926 	/*
1927 	 * deal with the case where there is only one pointer in the root
1928 	 * by promoting the node below to a root
1929 	 */
1930 	if (!parent) {
1931 		struct extent_buffer *child;
1932 
1933 		if (btrfs_header_nritems(mid) != 1)
1934 			return 0;
1935 
1936 		/* promote the child to a root */
1937 		child = read_node_slot(root, mid, 0);
1938 		if (IS_ERR(child)) {
1939 			ret = PTR_ERR(child);
1940 			btrfs_handle_fs_error(root->fs_info, ret, NULL);
1941 			goto enospc;
1942 		}
1943 
1944 		btrfs_tree_lock(child);
1945 		btrfs_set_lock_blocking(child);
1946 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1947 		if (ret) {
1948 			btrfs_tree_unlock(child);
1949 			free_extent_buffer(child);
1950 			goto enospc;
1951 		}
1952 
1953 		tree_mod_log_set_root_pointer(root, child, 1);
1954 		rcu_assign_pointer(root->node, child);
1955 
1956 		add_root_to_dirty_list(root);
1957 		btrfs_tree_unlock(child);
1958 
1959 		path->locks[level] = 0;
1960 		path->nodes[level] = NULL;
1961 		clean_tree_block(trans, root->fs_info, mid);
1962 		btrfs_tree_unlock(mid);
1963 		/* once for the path */
1964 		free_extent_buffer(mid);
1965 
1966 		root_sub_used(root, mid->len);
1967 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1968 		/* once for the root ptr */
1969 		free_extent_buffer_stale(mid);
1970 		return 0;
1971 	}
1972 	if (btrfs_header_nritems(mid) >
1973 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1974 		return 0;
1975 
1976 	left = read_node_slot(root, parent, pslot - 1);
1977 	if (IS_ERR(left))
1978 		left = NULL;
1979 
1980 	if (left) {
1981 		btrfs_tree_lock(left);
1982 		btrfs_set_lock_blocking(left);
1983 		wret = btrfs_cow_block(trans, root, left,
1984 				       parent, pslot - 1, &left);
1985 		if (wret) {
1986 			ret = wret;
1987 			goto enospc;
1988 		}
1989 	}
1990 
1991 	right = read_node_slot(root, parent, pslot + 1);
1992 	if (IS_ERR(right))
1993 		right = NULL;
1994 
1995 	if (right) {
1996 		btrfs_tree_lock(right);
1997 		btrfs_set_lock_blocking(right);
1998 		wret = btrfs_cow_block(trans, root, right,
1999 				       parent, pslot + 1, &right);
2000 		if (wret) {
2001 			ret = wret;
2002 			goto enospc;
2003 		}
2004 	}
2005 
2006 	/* first, try to make some room in the middle buffer */
2007 	if (left) {
2008 		orig_slot += btrfs_header_nritems(left);
2009 		wret = push_node_left(trans, root, left, mid, 1);
2010 		if (wret < 0)
2011 			ret = wret;
2012 	}
2013 
2014 	/*
2015 	 * then try to empty the right most buffer into the middle
2016 	 */
2017 	if (right) {
2018 		wret = push_node_left(trans, root, mid, right, 1);
2019 		if (wret < 0 && wret != -ENOSPC)
2020 			ret = wret;
2021 		if (btrfs_header_nritems(right) == 0) {
2022 			clean_tree_block(trans, root->fs_info, right);
2023 			btrfs_tree_unlock(right);
2024 			del_ptr(root, path, level + 1, pslot + 1);
2025 			root_sub_used(root, right->len);
2026 			btrfs_free_tree_block(trans, root, right, 0, 1);
2027 			free_extent_buffer_stale(right);
2028 			right = NULL;
2029 		} else {
2030 			struct btrfs_disk_key right_key;
2031 			btrfs_node_key(right, &right_key, 0);
2032 			tree_mod_log_set_node_key(root->fs_info, parent,
2033 						  pslot + 1, 0);
2034 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2035 			btrfs_mark_buffer_dirty(parent);
2036 		}
2037 	}
2038 	if (btrfs_header_nritems(mid) == 1) {
2039 		/*
2040 		 * we're not allowed to leave a node with one item in the
2041 		 * tree during a delete.  A deletion from lower in the tree
2042 		 * could try to delete the only pointer in this node.
2043 		 * So, pull some keys from the left.
2044 		 * There has to be a left pointer at this point because
2045 		 * otherwise we would have pulled some pointers from the
2046 		 * right
2047 		 */
2048 		if (!left) {
2049 			ret = -EROFS;
2050 			btrfs_handle_fs_error(root->fs_info, ret, NULL);
2051 			goto enospc;
2052 		}
2053 		wret = balance_node_right(trans, root, mid, left);
2054 		if (wret < 0) {
2055 			ret = wret;
2056 			goto enospc;
2057 		}
2058 		if (wret == 1) {
2059 			wret = push_node_left(trans, root, left, mid, 1);
2060 			if (wret < 0)
2061 				ret = wret;
2062 		}
2063 		BUG_ON(wret == 1);
2064 	}
2065 	if (btrfs_header_nritems(mid) == 0) {
2066 		clean_tree_block(trans, root->fs_info, mid);
2067 		btrfs_tree_unlock(mid);
2068 		del_ptr(root, path, level + 1, pslot);
2069 		root_sub_used(root, mid->len);
2070 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2071 		free_extent_buffer_stale(mid);
2072 		mid = NULL;
2073 	} else {
2074 		/* update the parent key to reflect our changes */
2075 		struct btrfs_disk_key mid_key;
2076 		btrfs_node_key(mid, &mid_key, 0);
2077 		tree_mod_log_set_node_key(root->fs_info, parent,
2078 					  pslot, 0);
2079 		btrfs_set_node_key(parent, &mid_key, pslot);
2080 		btrfs_mark_buffer_dirty(parent);
2081 	}
2082 
2083 	/* update the path */
2084 	if (left) {
2085 		if (btrfs_header_nritems(left) > orig_slot) {
2086 			extent_buffer_get(left);
2087 			/* left was locked after cow */
2088 			path->nodes[level] = left;
2089 			path->slots[level + 1] -= 1;
2090 			path->slots[level] = orig_slot;
2091 			if (mid) {
2092 				btrfs_tree_unlock(mid);
2093 				free_extent_buffer(mid);
2094 			}
2095 		} else {
2096 			orig_slot -= btrfs_header_nritems(left);
2097 			path->slots[level] = orig_slot;
2098 		}
2099 	}
2100 	/* double check we haven't messed things up */
2101 	if (orig_ptr !=
2102 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2103 		BUG();
2104 enospc:
2105 	if (right) {
2106 		btrfs_tree_unlock(right);
2107 		free_extent_buffer(right);
2108 	}
2109 	if (left) {
2110 		if (path->nodes[level] != left)
2111 			btrfs_tree_unlock(left);
2112 		free_extent_buffer(left);
2113 	}
2114 	return ret;
2115 }
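
/*
 * Editorial note with illustrative numbers: the merge threshold above is
 * BTRFS_NODEPTRS_PER_BLOCK(root) / 4.  Assuming a 16KiB nodesize, a node
 * holds (16384 - 101) / 33 = 493 key pointers (101-byte header, 33-byte
 * struct btrfs_key_ptr), so balance_level() only rebalances nodes that
 * have dropped to 123 pointers or fewer.
 */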
2116 
2117 /* Node balancing for insertion.  Here we only split or push nodes around
2118  * when they are completely full.  This is also done top down, so we
2119  * have to be pessimistic.
2120  */
2121 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2122 					  struct btrfs_root *root,
2123 					  struct btrfs_path *path, int level)
2124 {
2125 	struct extent_buffer *right = NULL;
2126 	struct extent_buffer *mid;
2127 	struct extent_buffer *left = NULL;
2128 	struct extent_buffer *parent = NULL;
2129 	int ret = 0;
2130 	int wret;
2131 	int pslot;
2132 	int orig_slot = path->slots[level];
2133 
2134 	if (level == 0)
2135 		return 1;
2136 
2137 	mid = path->nodes[level];
2138 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2139 
2140 	if (level < BTRFS_MAX_LEVEL - 1) {
2141 		parent = path->nodes[level + 1];
2142 		pslot = path->slots[level + 1];
2143 	}
2144 
2145 	if (!parent)
2146 		return 1;
2147 
2148 	left = read_node_slot(root, parent, pslot - 1);
2149 	if (IS_ERR(left))
2150 		left = NULL;
2151 
2152 	/* first, try to make some room in the middle buffer */
2153 	if (left) {
2154 		u32 left_nr;
2155 
2156 		btrfs_tree_lock(left);
2157 		btrfs_set_lock_blocking(left);
2158 
2159 		left_nr = btrfs_header_nritems(left);
2160 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2161 			wret = 1;
2162 		} else {
2163 			ret = btrfs_cow_block(trans, root, left, parent,
2164 					      pslot - 1, &left);
2165 			if (ret)
2166 				wret = 1;
2167 			else {
2168 				wret = push_node_left(trans, root,
2169 						      left, mid, 0);
2170 			}
2171 		}
2172 		if (wret < 0)
2173 			ret = wret;
2174 		if (wret == 0) {
2175 			struct btrfs_disk_key disk_key;
2176 			orig_slot += left_nr;
2177 			btrfs_node_key(mid, &disk_key, 0);
2178 			tree_mod_log_set_node_key(root->fs_info, parent,
2179 						  pslot, 0);
2180 			btrfs_set_node_key(parent, &disk_key, pslot);
2181 			btrfs_mark_buffer_dirty(parent);
2182 			if (btrfs_header_nritems(left) > orig_slot) {
2183 				path->nodes[level] = left;
2184 				path->slots[level + 1] -= 1;
2185 				path->slots[level] = orig_slot;
2186 				btrfs_tree_unlock(mid);
2187 				free_extent_buffer(mid);
2188 			} else {
2189 				orig_slot -=
2190 					btrfs_header_nritems(left);
2191 				path->slots[level] = orig_slot;
2192 				btrfs_tree_unlock(left);
2193 				free_extent_buffer(left);
2194 			}
2195 			return 0;
2196 		}
2197 		btrfs_tree_unlock(left);
2198 		free_extent_buffer(left);
2199 	}
2200 	right = read_node_slot(root, parent, pslot + 1);
2201 	if (IS_ERR(right))
2202 		right = NULL;
2203 
2204 	/*
2205 	 * then try to empty the right most buffer into the middle
2206 	 */
2207 	if (right) {
2208 		u32 right_nr;
2209 
2210 		btrfs_tree_lock(right);
2211 		btrfs_set_lock_blocking(right);
2212 
2213 		right_nr = btrfs_header_nritems(right);
2214 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2215 			wret = 1;
2216 		} else {
2217 			ret = btrfs_cow_block(trans, root, right,
2218 					      parent, pslot + 1,
2219 					      &right);
2220 			if (ret)
2221 				wret = 1;
2222 			else {
2223 				wret = balance_node_right(trans, root,
2224 							  right, mid);
2225 			}
2226 		}
2227 		if (wret < 0)
2228 			ret = wret;
2229 		if (wret == 0) {
2230 			struct btrfs_disk_key disk_key;
2231 
2232 			btrfs_node_key(right, &disk_key, 0);
2233 			tree_mod_log_set_node_key(root->fs_info, parent,
2234 						  pslot + 1, 0);
2235 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2236 			btrfs_mark_buffer_dirty(parent);
2237 
2238 			if (btrfs_header_nritems(mid) <= orig_slot) {
2239 				path->nodes[level] = right;
2240 				path->slots[level + 1] += 1;
2241 				path->slots[level] = orig_slot -
2242 					btrfs_header_nritems(mid);
2243 				btrfs_tree_unlock(mid);
2244 				free_extent_buffer(mid);
2245 			} else {
2246 				btrfs_tree_unlock(right);
2247 				free_extent_buffer(right);
2248 			}
2249 			return 0;
2250 		}
2251 		btrfs_tree_unlock(right);
2252 		free_extent_buffer(right);
2253 	}
2254 	return 1;
2255 }
2256 
2257 /*
2258  * readahead one full node of leaves, finding things that are close
2259  * to the block in 'slot', and triggering readahead on them.
2260  */
2261 static void reada_for_search(struct btrfs_root *root,
2262 			     struct btrfs_path *path,
2263 			     int level, int slot, u64 objectid)
2264 {
2265 	struct extent_buffer *node;
2266 	struct btrfs_disk_key disk_key;
2267 	u32 nritems;
2268 	u64 search;
2269 	u64 target;
2270 	u64 nread = 0;
2271 	u64 gen;
2272 	struct extent_buffer *eb;
2273 	u32 nr;
2274 	u32 blocksize;
2275 	u32 nscan = 0;
2276 
2277 	if (level != 1)
2278 		return;
2279 
2280 	if (!path->nodes[level])
2281 		return;
2282 
2283 	node = path->nodes[level];
2284 
2285 	search = btrfs_node_blockptr(node, slot);
2286 	blocksize = root->nodesize;
2287 	eb = btrfs_find_tree_block(root->fs_info, search);
2288 	if (eb) {
2289 		free_extent_buffer(eb);
2290 		return;
2291 	}
2292 
2293 	target = search;
2294 
2295 	nritems = btrfs_header_nritems(node);
2296 	nr = slot;
2297 
2298 	while (1) {
2299 		if (path->reada == READA_BACK) {
2300 			if (nr == 0)
2301 				break;
2302 			nr--;
2303 		} else if (path->reada == READA_FORWARD) {
2304 			nr++;
2305 			if (nr >= nritems)
2306 				break;
2307 		}
2308 		if (path->reada == READA_BACK && objectid) {
2309 			btrfs_node_key(node, &disk_key, nr);
2310 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2311 				break;
2312 		}
2313 		search = btrfs_node_blockptr(node, nr);
2314 		if ((search <= target && target - search <= 65536) ||
2315 		    (search > target && search - target <= 65536)) {
2316 			gen = btrfs_node_ptr_generation(node, nr);
2317 			readahead_tree_block(root, search);
2318 			nread += blocksize;
2319 		}
2320 		nscan++;
2321 		if ((nread > 65536 || nscan > 32))
2322 			break;
2323 	}
2324 }
2325 
2326 static noinline void reada_for_balance(struct btrfs_root *root,
2327 				       struct btrfs_path *path, int level)
2328 {
2329 	int slot;
2330 	int nritems;
2331 	struct extent_buffer *parent;
2332 	struct extent_buffer *eb;
2333 	u64 gen;
2334 	u64 block1 = 0;
2335 	u64 block2 = 0;
2336 
2337 	parent = path->nodes[level + 1];
2338 	if (!parent)
2339 		return;
2340 
2341 	nritems = btrfs_header_nritems(parent);
2342 	slot = path->slots[level + 1];
2343 
2344 	if (slot > 0) {
2345 		block1 = btrfs_node_blockptr(parent, slot - 1);
2346 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2347 		eb = btrfs_find_tree_block(root->fs_info, block1);
2348 		/*
2349 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2350 		 * don't want to return -EAGAIN here.  That will loop
2351 		 * forever
2352 		 */
2353 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2354 			block1 = 0;
2355 		free_extent_buffer(eb);
2356 	}
2357 	if (slot + 1 < nritems) {
2358 		block2 = btrfs_node_blockptr(parent, slot + 1);
2359 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2360 		eb = btrfs_find_tree_block(root->fs_info, block2);
2361 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2362 			block2 = 0;
2363 		free_extent_buffer(eb);
2364 	}
2365 
2366 	if (block1)
2367 		readahead_tree_block(root, block1);
2368 	if (block2)
2369 		readahead_tree_block(root, block2);
2370 }
2371 
2372 
2373 /*
2374  * when we walk down the tree, it is usually safe to unlock the higher layers
2375  * in the tree.  The exceptions are when our path goes through slot 0, because
2376  * operations on the tree might require changing key pointers higher up in the
2377  * tree.
2378  *
2379  * callers might also have set path->keep_locks, which tells this code to keep
2380  * the lock if the path points to the last slot in the block.  This is part of
2381  * walking through the tree, and selecting the next slot in the higher block.
2382  *
2383  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2384  * if lowest_unlock is 1, level 0 won't be unlocked
2385  */
2386 static noinline void unlock_up(struct btrfs_path *path, int level,
2387 			       int lowest_unlock, int min_write_lock_level,
2388 			       int *write_lock_level)
2389 {
2390 	int i;
2391 	int skip_level = level;
2392 	int no_skips = 0;
2393 	struct extent_buffer *t;
2394 
2395 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2396 		if (!path->nodes[i])
2397 			break;
2398 		if (!path->locks[i])
2399 			break;
2400 		if (!no_skips && path->slots[i] == 0) {
2401 			skip_level = i + 1;
2402 			continue;
2403 		}
2404 		if (!no_skips && path->keep_locks) {
2405 			u32 nritems;
2406 			t = path->nodes[i];
2407 			nritems = btrfs_header_nritems(t);
2408 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2409 				skip_level = i + 1;
2410 				continue;
2411 			}
2412 		}
2413 		if (skip_level < i && i >= lowest_unlock)
2414 			no_skips = 1;
2415 
2416 		t = path->nodes[i];
2417 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2418 			btrfs_tree_unlock_rw(t, path->locks[i]);
2419 			path->locks[i] = 0;
2420 			if (write_lock_level &&
2421 			    i > min_write_lock_level &&
2422 			    i <= *write_lock_level) {
2423 				*write_lock_level = i - 1;
2424 			}
2425 		}
2426 	}
2427 }
2428 
2429 /*
2430  * This releases any locks held in the path starting at level and
2431  * going all the way up to the root.
2432  *
2433  * btrfs_search_slot will keep the lock held on higher nodes in a few
2434  * corner cases, such as COW of the block at slot zero in the node.  This
2435  * ignores those rules, and it should only be called when there are no
2436  * more updates to be done higher up in the tree.
2437  */
2438 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2439 {
2440 	int i;
2441 
2442 	if (path->keep_locks)
2443 		return;
2444 
2445 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2446 		if (!path->nodes[i])
2447 			continue;
2448 		if (!path->locks[i])
2449 			continue;
2450 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2451 		path->locks[i] = 0;
2452 	}
2453 }
2454 
2455 /*
2456  * helper function for btrfs_search_slot.  The goal is to find a block
2457  * in cache without setting the path to blocking.  If we find the block
2458  * we return zero and the path is unchanged.
2459  *
2460  * If we can't find the block, we set the path blocking and do some
2461  * readahead.  -EAGAIN is returned and the search must be repeated.
2462  */
2463 static int
2464 read_block_for_search(struct btrfs_trans_handle *trans,
2465 		       struct btrfs_root *root, struct btrfs_path *p,
2466 		       struct extent_buffer **eb_ret, int level, int slot,
2467 		       struct btrfs_key *key, u64 time_seq)
2468 {
2469 	u64 blocknr;
2470 	u64 gen;
2471 	struct extent_buffer *b = *eb_ret;
2472 	struct extent_buffer *tmp;
2473 	int ret;
2474 
2475 	blocknr = btrfs_node_blockptr(b, slot);
2476 	gen = btrfs_node_ptr_generation(b, slot);
2477 
2478 	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
2479 	if (tmp) {
2480 		/* first we do an atomic uptodate check */
2481 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2482 			*eb_ret = tmp;
2483 			return 0;
2484 		}
2485 
2486 		/* the pages were up to date, but we failed
2487 		 * the generation number check.  Do a full
2488 		 * read for the generation number that is correct.
2489 		 * We must do this without dropping locks so
2490 		 * we can trust our generation number
2491 		 */
2492 		btrfs_set_path_blocking(p);
2493 
2494 		/* now we're allowed to do a blocking uptodate check */
2495 		ret = btrfs_read_buffer(tmp, gen);
2496 		if (!ret) {
2497 			*eb_ret = tmp;
2498 			return 0;
2499 		}
2500 		free_extent_buffer(tmp);
2501 		btrfs_release_path(p);
2502 		return -EIO;
2503 	}
2504 
2505 	/*
2506 	 * reduce lock contention at high levels
2507 	 * of the btree by dropping locks before
2508 	 * we read.  Don't release the lock on the current
2509 	 * level because we need to walk this node to figure
2510 	 * out which blocks to read.
2511 	 */
2512 	btrfs_unlock_up_safe(p, level + 1);
2513 	btrfs_set_path_blocking(p);
2514 
2515 	free_extent_buffer(tmp);
2516 	if (p->reada != READA_NONE)
2517 		reada_for_search(root, p, level, slot, key->objectid);
2518 
2519 	btrfs_release_path(p);
2520 
2521 	ret = -EAGAIN;
2522 	tmp = read_tree_block(root, blocknr, 0);
2523 	if (!IS_ERR(tmp)) {
2524 		/*
2525 		 * If the read above didn't mark this buffer up to date,
2526 		 * it will never end up being up to date.  Set ret to EIO now
2527 		 * and give up so that our caller doesn't loop forever
2528 		 * on our EAGAINs.
2529 		 */
2530 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2531 			ret = -EIO;
2532 		free_extent_buffer(tmp);
2533 	} else {
2534 		ret = PTR_ERR(tmp);
2535 	}
2536 	return ret;
2537 }
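
/*
 * Editorial sketch (heavily simplified, hypothetical helper): how a caller
 * consumes the three outcomes of read_block_for_search().  Real callers --
 * btrfs_search_slot() and btrfs_search_old_slot() below -- also handle
 * COW, lock promotion and unlock_up(), all omitted here.
 */
static int example_walk_to_leaf(struct btrfs_root *root,
				struct btrfs_key *key, struct btrfs_path *p)
{
	struct extent_buffer *b;
	int level;
	int slot;
	int err;

again:
	b = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(b);
	p->nodes[level] = b;
	p->locks[level] = BTRFS_READ_LOCK;

	while (level > 0) {
		err = btrfs_bin_search(b, key, level, &slot);
		if (err < 0)
			return err;
		if (err && slot > 0)
			slot--;	/* follow the pointer covering the key */
		p->slots[level] = slot;

		err = read_block_for_search(NULL, root, p, &b, level,
					    slot, key, 0);
		if (err == -EAGAIN)
			goto again;	/* path was released; start over */
		if (err)
			return err;	/* hard error, e.g. -EIO */

		level--;
		btrfs_tree_read_lock(b);
		p->nodes[level] = b;
		p->locks[level] = BTRFS_READ_LOCK;
	}

	/* finally search the leaf itself */
	err = btrfs_bin_search(b, key, 0, &slot);
	if (err < 0)
		return err;
	p->slots[0] = slot;
	return err;	/* 0 == exact match, 1 == insertion point */
}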
2538 
2539 /*
2540  * helper function for btrfs_search_slot.  This does all of the checks
2541  * for node-level blocks and does any balancing required based on
2542  * the ins_len.
2543  *
2544  * If no extra work was required, zero is returned.  If we had to
2545  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2546  * start over
2547  */
2548 static int
2549 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2550 		       struct btrfs_root *root, struct btrfs_path *p,
2551 		       struct extent_buffer *b, int level, int ins_len,
2552 		       int *write_lock_level)
2553 {
2554 	int ret;
2555 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2556 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2557 		int sret;
2558 
2559 		if (*write_lock_level < level + 1) {
2560 			*write_lock_level = level + 1;
2561 			btrfs_release_path(p);
2562 			goto again;
2563 		}
2564 
2565 		btrfs_set_path_blocking(p);
2566 		reada_for_balance(root, p, level);
2567 		sret = split_node(trans, root, p, level);
2568 		btrfs_clear_path_blocking(p, NULL, 0);
2569 
2570 		BUG_ON(sret > 0);
2571 		if (sret) {
2572 			ret = sret;
2573 			goto done;
2574 		}
2575 		b = p->nodes[level];
2576 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2577 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2578 		int sret;
2579 
2580 		if (*write_lock_level < level + 1) {
2581 			*write_lock_level = level + 1;
2582 			btrfs_release_path(p);
2583 			goto again;
2584 		}
2585 
2586 		btrfs_set_path_blocking(p);
2587 		reada_for_balance(root, p, level);
2588 		sret = balance_level(trans, root, p, level);
2589 		btrfs_clear_path_blocking(p, NULL, 0);
2590 
2591 		if (sret) {
2592 			ret = sret;
2593 			goto done;
2594 		}
2595 		b = p->nodes[level];
2596 		if (!b) {
2597 			btrfs_release_path(p);
2598 			goto again;
2599 		}
2600 		BUG_ON(btrfs_header_nritems(b) == 1);
2601 	}
2602 	return 0;
2603 
2604 again:
2605 	ret = -EAGAIN;
2606 done:
2607 	return ret;
2608 }
2609 
2610 static void key_search_validate(struct extent_buffer *b,
2611 				struct btrfs_key *key,
2612 				int level)
2613 {
2614 #ifdef CONFIG_BTRFS_ASSERT
2615 	struct btrfs_disk_key disk_key;
2616 
2617 	btrfs_cpu_key_to_disk(&disk_key, key);
2618 
2619 	if (level == 0)
2620 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2621 		    offsetof(struct btrfs_leaf, items[0].key),
2622 		    sizeof(disk_key)));
2623 	else
2624 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2625 		    offsetof(struct btrfs_node, ptrs[0].key),
2626 		    sizeof(disk_key)));
2627 #endif
2628 }
2629 
2630 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2631 		      int level, int *prev_cmp, int *slot)
2632 {
2633 	if (*prev_cmp != 0) {
2634 		*prev_cmp = bin_search(b, key, level, slot);
2635 		return *prev_cmp;
2636 	}
2637 
2638 	key_search_validate(b, key, level);
2639 	*slot = 0;
2640 
2641 	return 0;
2642 }
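
/*
 * Editorial note on the prev_cmp short cut above: once a level compares
 * equal (prev_cmp == 0), the separator key in the parent is the search key
 * itself, and by construction that separator equals the first key of the
 * child block.  Every level below can therefore take slot 0 without a new
 * binary search; key_search_validate() asserts exactly that invariant
 * under CONFIG_BTRFS_ASSERT.
 */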
2643 
2644 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2645 		u64 iobjectid, u64 ioff, u8 key_type,
2646 		struct btrfs_key *found_key)
2647 {
2648 	int ret;
2649 	struct btrfs_key key;
2650 	struct extent_buffer *eb;
2651 
2652 	ASSERT(path);
2653 	ASSERT(found_key);
2654 
2655 	key.type = key_type;
2656 	key.objectid = iobjectid;
2657 	key.offset = ioff;
2658 
2659 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2660 	if (ret < 0)
2661 		return ret;
2662 
2663 	eb = path->nodes[0];
2664 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2665 		ret = btrfs_next_leaf(fs_root, path);
2666 		if (ret)
2667 			return ret;
2668 		eb = path->nodes[0];
2669 	}
2670 
2671 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2672 	if (found_key->type != key.type ||
2673 			found_key->objectid != key.objectid)
2674 		return 1;
2675 
2676 	return 0;
2677 }
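
/*
 * Editorial sketch: a simple btrfs_find_item() lookup.  The key values and
 * helper name are illustrative only.  Note the function matches on
 * objectid and type, treating ioff merely as a starting offset, and
 * returns 1 when nothing matching is found.
 */
static int example_find_item(struct btrfs_root *fs_root, u64 ino)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_item(fs_root, path, ino, 0,
			      BTRFS_INODE_ITEM_KEY, &found_key);
	/* ret == 0: found_key describes the item at path->slots[0] */
	btrfs_free_path(path);
	return ret;
}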
2678 
2679 /*
2680  * look for key in the tree.  path is filled in with nodes along the way.
2681  * If key is found, we return zero and you can find the item in the leaf
2682  * level of the path (level 0)
2683  *
2684  * If the key isn't found, the path points to the slot where it should
2685  * be inserted, and 1 is returned.  If there are other errors during the
2686  * search a negative error number is returned.
2687  *
2688  * if ins_len > 0, nodes and leaves will be split as we walk down the
2689  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2690  * possible)
2691  */
2692 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2693 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2694 		      ins_len, int cow)
2695 {
2696 	struct extent_buffer *b;
2697 	int slot;
2698 	int ret;
2699 	int err;
2700 	int level;
2701 	int lowest_unlock = 1;
2702 	int root_lock;
2703 	/* everything at write_lock_level or lower must be write locked */
2704 	int write_lock_level = 0;
2705 	u8 lowest_level = 0;
2706 	int min_write_lock_level;
2707 	int prev_cmp;
2708 
2709 	lowest_level = p->lowest_level;
2710 	WARN_ON(lowest_level && ins_len > 0);
2711 	WARN_ON(p->nodes[0] != NULL);
2712 	BUG_ON(!cow && ins_len);
2713 
2714 	if (ins_len < 0) {
2715 		lowest_unlock = 2;
2716 
2717 		/* when we are removing items, we might have to go up to level
2718 		 * two as we update tree pointers.  Make sure we keep write
2719 		 * locks on those levels as well
2720 		 */
2721 		write_lock_level = 2;
2722 	} else if (ins_len > 0) {
2723 		/*
2724 		 * for inserting items, make sure we have a write lock on
2725 		 * level 1 so we can update keys
2726 		 */
2727 		write_lock_level = 1;
2728 	}
2729 
2730 	if (!cow)
2731 		write_lock_level = -1;
2732 
2733 	if (cow && (p->keep_locks || p->lowest_level))
2734 		write_lock_level = BTRFS_MAX_LEVEL;
2735 
2736 	min_write_lock_level = write_lock_level;
2737 
2738 again:
2739 	prev_cmp = -1;
2740 	/*
2741 	 * we try very hard to do read locks on the root
2742 	 */
2743 	root_lock = BTRFS_READ_LOCK;
2744 	level = 0;
2745 	if (p->search_commit_root) {
2746 		/*
2747 		 * the commit roots are read only
2748 		 * so we always do read locks
2749 		 */
2750 		if (p->need_commit_sem)
2751 			down_read(&root->fs_info->commit_root_sem);
2752 		b = root->commit_root;
2753 		extent_buffer_get(b);
2754 		level = btrfs_header_level(b);
2755 		if (p->need_commit_sem)
2756 			up_read(&root->fs_info->commit_root_sem);
2757 		if (!p->skip_locking)
2758 			btrfs_tree_read_lock(b);
2759 	} else {
2760 		if (p->skip_locking) {
2761 			b = btrfs_root_node(root);
2762 			level = btrfs_header_level(b);
2763 		} else {
2764 			/* we don't know the level of the root node
2765 			 * until we actually have it read locked
2766 			 */
2767 			b = btrfs_read_lock_root_node(root);
2768 			level = btrfs_header_level(b);
2769 			if (level <= write_lock_level) {
2770 				/* whoops, must trade for write lock */
2771 				btrfs_tree_read_unlock(b);
2772 				free_extent_buffer(b);
2773 				b = btrfs_lock_root_node(root);
2774 				root_lock = BTRFS_WRITE_LOCK;
2775 
2776 				/* the level might have changed, check again */
2777 				level = btrfs_header_level(b);
2778 			}
2779 		}
2780 	}
2781 	p->nodes[level] = b;
2782 	if (!p->skip_locking)
2783 		p->locks[level] = root_lock;
2784 
2785 	while (b) {
2786 		level = btrfs_header_level(b);
2787 
2788 		/*
2789 		 * setup the path here so we can release it under lock
2790 		 * contention with the cow code
2791 		 */
2792 		if (cow) {
2793 			/*
2794 			 * if we don't really need to cow this block
2795 			 * then we don't want to set the path blocking,
2796 			 * so we test it here
2797 			 */
2798 			if (!should_cow_block(trans, root, b)) {
2799 				trans->dirty = true;
2800 				goto cow_done;
2801 			}
2802 
2803 			/*
2804 			 * must have write locks on this node and the
2805 			 * parent
2806 			 */
2807 			if (level > write_lock_level ||
2808 			    (level + 1 > write_lock_level &&
2809 			    level + 1 < BTRFS_MAX_LEVEL &&
2810 			    p->nodes[level + 1])) {
2811 				write_lock_level = level + 1;
2812 				btrfs_release_path(p);
2813 				goto again;
2814 			}
2815 
2816 			btrfs_set_path_blocking(p);
2817 			err = btrfs_cow_block(trans, root, b,
2818 					      p->nodes[level + 1],
2819 					      p->slots[level + 1], &b);
2820 			if (err) {
2821 				ret = err;
2822 				goto done;
2823 			}
2824 		}
2825 cow_done:
2826 		p->nodes[level] = b;
2827 		btrfs_clear_path_blocking(p, NULL, 0);
2828 
2829 		/*
2830 		 * we have a lock on b and as long as we aren't changing
2831 		 * the tree, there is no way for the items in b to change.
2832 		 * It is safe to drop the lock on our parent before we
2833 		 * go through the expensive btree search on b.
2834 		 *
2835 		 * If we're inserting or deleting (ins_len != 0), then we might
2836 		 * be changing slot zero, which may require changing the parent.
2837 		 * So, we can't drop the lock until after we know which slot
2838 		 * we're operating on.
2839 		 */
2840 		if (!ins_len && !p->keep_locks) {
2841 			int u = level + 1;
2842 
2843 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2844 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2845 				p->locks[u] = 0;
2846 			}
2847 		}
2848 
2849 		ret = key_search(b, key, level, &prev_cmp, &slot);
2850 		if (ret < 0)
2851 			goto done;
2852 
2853 		if (level != 0) {
2854 			int dec = 0;
2855 			if (ret && slot > 0) {
2856 				dec = 1;
2857 				slot -= 1;
2858 			}
2859 			p->slots[level] = slot;
2860 			err = setup_nodes_for_search(trans, root, p, b, level,
2861 					     ins_len, &write_lock_level);
2862 			if (err == -EAGAIN)
2863 				goto again;
2864 			if (err) {
2865 				ret = err;
2866 				goto done;
2867 			}
2868 			b = p->nodes[level];
2869 			slot = p->slots[level];
2870 
2871 			/*
2872 			 * slot 0 is special, if we change the key
2873 			 * we have to update the parent pointer
2874 			 * which means we must have a write lock
2875 			 * on the parent
2876 			 */
2877 			if (slot == 0 && ins_len &&
2878 			    write_lock_level < level + 1) {
2879 				write_lock_level = level + 1;
2880 				btrfs_release_path(p);
2881 				goto again;
2882 			}
2883 
2884 			unlock_up(p, level, lowest_unlock,
2885 				  min_write_lock_level, &write_lock_level);
2886 
2887 			if (level == lowest_level) {
2888 				if (dec)
2889 					p->slots[level]++;
2890 				goto done;
2891 			}
2892 
2893 			err = read_block_for_search(trans, root, p,
2894 						    &b, level, slot, key, 0);
2895 			if (err == -EAGAIN)
2896 				goto again;
2897 			if (err) {
2898 				ret = err;
2899 				goto done;
2900 			}
2901 
2902 			if (!p->skip_locking) {
2903 				level = btrfs_header_level(b);
2904 				if (level <= write_lock_level) {
2905 					err = btrfs_try_tree_write_lock(b);
2906 					if (!err) {
2907 						btrfs_set_path_blocking(p);
2908 						btrfs_tree_lock(b);
2909 						btrfs_clear_path_blocking(p, b,
2910 								  BTRFS_WRITE_LOCK);
2911 					}
2912 					p->locks[level] = BTRFS_WRITE_LOCK;
2913 				} else {
2914 					err = btrfs_tree_read_lock_atomic(b);
2915 					if (!err) {
2916 						btrfs_set_path_blocking(p);
2917 						btrfs_tree_read_lock(b);
2918 						btrfs_clear_path_blocking(p, b,
2919 								  BTRFS_READ_LOCK);
2920 					}
2921 					p->locks[level] = BTRFS_READ_LOCK;
2922 				}
2923 				p->nodes[level] = b;
2924 			}
2925 		} else {
2926 			p->slots[level] = slot;
2927 			if (ins_len > 0 &&
2928 			    btrfs_leaf_free_space(root, b) < ins_len) {
2929 				if (write_lock_level < 1) {
2930 					write_lock_level = 1;
2931 					btrfs_release_path(p);
2932 					goto again;
2933 				}
2934 
2935 				btrfs_set_path_blocking(p);
2936 				err = split_leaf(trans, root, key,
2937 						 p, ins_len, ret == 0);
2938 				btrfs_clear_path_blocking(p, NULL, 0);
2939 
2940 				BUG_ON(err > 0);
2941 				if (err) {
2942 					ret = err;
2943 					goto done;
2944 				}
2945 			}
2946 			if (!p->search_for_split)
2947 				unlock_up(p, level, lowest_unlock,
2948 					  min_write_lock_level, &write_lock_level);
2949 			goto done;
2950 		}
2951 	}
2952 	ret = 1;
2953 done:
2954 	/*
2955 	 * we don't really know what they plan on doing with the path
2956 	 * from here on, so for now just mark it as blocking
2957 	 */
2958 	if (!p->leave_spinning)
2959 		btrfs_set_path_blocking(p);
2960 	if (ret < 0 && !p->skip_release_on_error)
2961 		btrfs_release_path(p);
2962 	return ret;
2963 }
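
/*
 * Editorial sketch of the canonical read-only btrfs_search_slot() pattern:
 * no transaction, no COW, ins_len == 0.  The helper name and key values
 * are illustrative.
 */
static int example_lookup(struct btrfs_root *root, u64 objectid, u8 type,
			  u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret > 0)		/* not found; path points at insert slot */
		ret = -ENOENT;
	/* on ret == 0 the item lives at path->nodes[0], path->slots[0] */

	btrfs_free_path(path);
	return ret;
}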
2964 
2965 /*
2966  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2967  * current state of the tree together with the operations recorded in the tree
2968  * modification log to search for the key in a previous version of this tree, as
2969  * denoted by the time_seq parameter.
2970  *
2971  * Naturally, there is no support for insert, delete or cow operations.
2972  *
2973  * The resulting path and return value will be set up as if we called
2974  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2975  */
2976 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2977 			  struct btrfs_path *p, u64 time_seq)
2978 {
2979 	struct extent_buffer *b;
2980 	int slot;
2981 	int ret;
2982 	int err;
2983 	int level;
2984 	int lowest_unlock = 1;
2985 	u8 lowest_level = 0;
2986 	int prev_cmp = -1;
2987 
2988 	lowest_level = p->lowest_level;
2989 	WARN_ON(p->nodes[0] != NULL);
2990 
2991 	if (p->search_commit_root) {
2992 		BUG_ON(time_seq);
2993 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2994 	}
2995 
2996 again:
2997 	b = get_old_root(root, time_seq);
2998 	level = btrfs_header_level(b);
2999 	p->locks[level] = BTRFS_READ_LOCK;
3000 
3001 	while (b) {
3002 		level = btrfs_header_level(b);
3003 		p->nodes[level] = b;
3004 		btrfs_clear_path_blocking(p, NULL, 0);
3005 
3006 		/*
3007 		 * we have a lock on b and as long as we aren't changing
3008 		 * the tree, there is no way for the items in b to change.
3009 		 * It is safe to drop the lock on our parent before we
3010 		 * go through the expensive btree search on b.
3011 		 */
3012 		btrfs_unlock_up_safe(p, level + 1);
3013 
3014 		/*
3015 		 * Since we can unwind extent buffers (via the tree mod log)
3016 		 * we want to do a real search every time.
3017 		 */
3018 		prev_cmp = -1;
3019 		ret = key_search(b, key, level, &prev_cmp, &slot);
3020 
3021 		if (level != 0) {
3022 			int dec = 0;
3023 			if (ret && slot > 0) {
3024 				dec = 1;
3025 				slot -= 1;
3026 			}
3027 			p->slots[level] = slot;
3028 			unlock_up(p, level, lowest_unlock, 0, NULL);
3029 
3030 			if (level == lowest_level) {
3031 				if (dec)
3032 					p->slots[level]++;
3033 				goto done;
3034 			}
3035 
3036 			err = read_block_for_search(NULL, root, p, &b, level,
3037 						    slot, key, time_seq);
3038 			if (err == -EAGAIN)
3039 				goto again;
3040 			if (err) {
3041 				ret = err;
3042 				goto done;
3043 			}
3044 
3045 			level = btrfs_header_level(b);
3046 			err = btrfs_tree_read_lock_atomic(b);
3047 			if (!err) {
3048 				btrfs_set_path_blocking(p);
3049 				btrfs_tree_read_lock(b);
3050 				btrfs_clear_path_blocking(p, b,
3051 							  BTRFS_READ_LOCK);
3052 			}
3053 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3054 			if (!b) {
3055 				ret = -ENOMEM;
3056 				goto done;
3057 			}
3058 			p->locks[level] = BTRFS_READ_LOCK;
3059 			p->nodes[level] = b;
3060 		} else {
3061 			p->slots[level] = slot;
3062 			unlock_up(p, level, lowest_unlock, 0, NULL);
3063 			goto done;
3064 		}
3065 	}
3066 	ret = 1;
3067 done:
3068 	if (!p->leave_spinning)
3069 		btrfs_set_path_blocking(p);
3070 	if (ret < 0)
3071 		btrfs_release_path(p);
3072 
3073 	return ret;
3074 }
3075 
3076 /*
3077  * helper to use instead of search slot if no exact match is needed but
3078  * instead the next or previous item should be returned.
3079  * When find_higher is true, the next higher item is returned, the next lower
3080  * otherwise.
3081  * When return_any and find_higher are both true, and no higher item is found,
3082  * return the next lower instead.
3083  * When return_any is true and find_higher is false, and no lower item is found,
3084  * return the next higher instead.
3085  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3086  * < 0 on error
3087  */
3088 int btrfs_search_slot_for_read(struct btrfs_root *root,
3089 			       struct btrfs_key *key, struct btrfs_path *p,
3090 			       int find_higher, int return_any)
3091 {
3092 	int ret;
3093 	struct extent_buffer *leaf;
3094 
3095 again:
3096 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3097 	if (ret <= 0)
3098 		return ret;
3099 	/*
3100 	 * a return value of 1 means the path is at the position where the
3101 	 * item should be inserted. Normally this is the next bigger item,
3102 	 * but in case the previous item is the last in a leaf, path points
3103 	 * to the first free slot in the previous leaf, i.e. at an invalid
3104 	 * item.
3105 	 */
3106 	leaf = p->nodes[0];
3107 
3108 	if (find_higher) {
3109 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3110 			ret = btrfs_next_leaf(root, p);
3111 			if (ret <= 0)
3112 				return ret;
3113 			if (!return_any)
3114 				return 1;
3115 			/*
3116 			 * no higher item found, return the next
3117 			 * lower instead
3118 			 */
3119 			return_any = 0;
3120 			find_higher = 0;
3121 			btrfs_release_path(p);
3122 			goto again;
3123 		}
3124 	} else {
3125 		if (p->slots[0] == 0) {
3126 			ret = btrfs_prev_leaf(root, p);
3127 			if (ret < 0)
3128 				return ret;
3129 			if (!ret) {
3130 				leaf = p->nodes[0];
3131 				if (p->slots[0] == btrfs_header_nritems(leaf))
3132 					p->slots[0]--;
3133 				return 0;
3134 			}
3135 			if (!return_any)
3136 				return 1;
3137 			/*
3138 			 * no lower item found, return the next
3139 			 * higher instead
3140 			 */
3141 			return_any = 0;
3142 			find_higher = 1;
3143 			btrfs_release_path(p);
3144 			goto again;
3145 		} else {
3146 			--p->slots[0];
3147 		}
3148 	}
3149 	return 0;
3150 }
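
/*
 * Editorial sketch: using btrfs_search_slot_for_read() to find the first
 * item at or after a key, falling back to the last lower item if the tree
 * has nothing higher.  Helper name is hypothetical.
 */
static int example_find_next_item(struct btrfs_root *root,
				  struct btrfs_key *key,
				  struct btrfs_key *found)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found,
				      path->slots[0]);

	btrfs_free_path(path);
	return ret;	/* 1 means the tree is empty */
}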
3151 
3152 /*
3153  * adjust the pointers going up the tree, starting at level
3154  * making sure the right key of each node points to 'key'.
3155  * This is used after shifting pointers to the left, so it stops
3156  * fixing up pointers when a given leaf/node is not in slot 0 of the
3157  * higher levels
3158  *
3159  */
3160 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3161 			   struct btrfs_path *path,
3162 			   struct btrfs_disk_key *key, int level)
3163 {
3164 	int i;
3165 	struct extent_buffer *t;
3166 
3167 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3168 		int tslot = path->slots[i];
3169 		if (!path->nodes[i])
3170 			break;
3171 		t = path->nodes[i];
3172 		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3173 		btrfs_set_node_key(t, key, tslot);
3174 		btrfs_mark_buffer_dirty(path->nodes[i]);
3175 		if (tslot != 0)
3176 			break;
3177 	}
3178 }
3179 
3180 /*
3181  * update item key.
3182  *
3183  * This function isn't completely safe.  It's the caller's responsibility
3184  * to ensure that the new key won't break the key order
3185  */
3186 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3187 			     struct btrfs_path *path,
3188 			     struct btrfs_key *new_key)
3189 {
3190 	struct btrfs_disk_key disk_key;
3191 	struct extent_buffer *eb;
3192 	int slot;
3193 
3194 	eb = path->nodes[0];
3195 	slot = path->slots[0];
3196 	if (slot > 0) {
3197 		btrfs_item_key(eb, &disk_key, slot - 1);
3198 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3199 	}
3200 	if (slot < btrfs_header_nritems(eb) - 1) {
3201 		btrfs_item_key(eb, &disk_key, slot + 1);
3202 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3203 	}
3204 
3205 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3206 	btrfs_set_item_key(eb, &disk_key, slot);
3207 	btrfs_mark_buffer_dirty(eb);
3208 	if (slot == 0)
3209 		fixup_low_keys(fs_info, path, &disk_key, 1);
3210 }
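
/*
 * Editorial sketch (hypothetical helper): typical btrfs_set_item_key_safe()
 * use, rewriting only the offset of the key at path->slots[0].  The new
 * key must still sort between its neighbours or the BUG_ON()s above fire.
 */
static void example_bump_key_offset(struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;
	btrfs_set_item_key_safe(fs_info, path, &key);
}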
3211 
3212 /*
3213  * try to push data from one node into the next node left in the
3214  * tree.
3215  *
3216  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3217  * error, and > 0 if there was no room in the left hand block.
3218  */
3219 static int push_node_left(struct btrfs_trans_handle *trans,
3220 			  struct btrfs_root *root, struct extent_buffer *dst,
3221 			  struct extent_buffer *src, int empty)
3222 {
3223 	int push_items = 0;
3224 	int src_nritems;
3225 	int dst_nritems;
3226 	int ret = 0;
3227 
3228 	src_nritems = btrfs_header_nritems(src);
3229 	dst_nritems = btrfs_header_nritems(dst);
3230 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3231 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3232 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3233 
3234 	if (!empty && src_nritems <= 8)
3235 		return 1;
3236 
3237 	if (push_items <= 0)
3238 		return 1;
3239 
3240 	if (empty) {
3241 		push_items = min(src_nritems, push_items);
3242 		if (push_items < src_nritems) {
3243 			/* leave at least 8 pointers in the node if
3244 			 * we aren't going to empty it
3245 			 */
3246 			if (src_nritems - push_items < 8) {
3247 				if (push_items <= 8)
3248 					return 1;
3249 				push_items -= 8;
3250 			}
3251 		}
3252 	} else
3253 		push_items = min(src_nritems - 8, push_items);
3254 
3255 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3256 				   push_items);
3257 	if (ret) {
3258 		btrfs_abort_transaction(trans, ret);
3259 		return ret;
3260 	}
3261 	copy_extent_buffer(dst, src,
3262 			   btrfs_node_key_ptr_offset(dst_nritems),
3263 			   btrfs_node_key_ptr_offset(0),
3264 			   push_items * sizeof(struct btrfs_key_ptr));
3265 
3266 	if (push_items < src_nritems) {
3267 		/*
3268 		 * don't call tree_mod_log_eb_move here, key removal was already
3269 		 * fully logged by tree_mod_log_eb_copy above.
3270 		 */
3271 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3272 				      btrfs_node_key_ptr_offset(push_items),
3273 				      (src_nritems - push_items) *
3274 				      sizeof(struct btrfs_key_ptr));
3275 	}
3276 	btrfs_set_header_nritems(src, src_nritems - push_items);
3277 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3278 	btrfs_mark_buffer_dirty(src);
3279 	btrfs_mark_buffer_dirty(dst);
3280 
3281 	return ret;
3282 }
3283 
3284 /*
3285  * try to push data from one node into the next node right in the
3286  * tree.
3287  *
3288  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3289  * error, and > 0 if there was no room in the right hand block.
3290  *
3291  * this will only push up to 1/2 the contents of the left node over
3292  */
3293 static int balance_node_right(struct btrfs_trans_handle *trans,
3294 			      struct btrfs_root *root,
3295 			      struct extent_buffer *dst,
3296 			      struct extent_buffer *src)
3297 {
3298 	int push_items = 0;
3299 	int max_push;
3300 	int src_nritems;
3301 	int dst_nritems;
3302 	int ret = 0;
3303 
3304 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3305 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3306 
3307 	src_nritems = btrfs_header_nritems(src);
3308 	dst_nritems = btrfs_header_nritems(dst);
3309 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3310 	if (push_items <= 0)
3311 		return 1;
3312 
3313 	if (src_nritems < 4)
3314 		return 1;
3315 
3316 	max_push = src_nritems / 2 + 1;
3317 	/* don't try to empty the node */
3318 	if (max_push >= src_nritems)
3319 		return 1;
3320 
3321 	if (max_push < push_items)
3322 		push_items = max_push;
3323 
3324 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3325 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3326 				      btrfs_node_key_ptr_offset(0),
3327 				      (dst_nritems) *
3328 				      sizeof(struct btrfs_key_ptr));
3329 
3330 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3331 				   src_nritems - push_items, push_items);
3332 	if (ret) {
3333 		btrfs_abort_transaction(trans, ret);
3334 		return ret;
3335 	}
3336 	copy_extent_buffer(dst, src,
3337 			   btrfs_node_key_ptr_offset(0),
3338 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3339 			   push_items * sizeof(struct btrfs_key_ptr));
3340 
3341 	btrfs_set_header_nritems(src, src_nritems - push_items);
3342 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3343 
3344 	btrfs_mark_buffer_dirty(src);
3345 	btrfs_mark_buffer_dirty(dst);
3346 
3347 	return ret;
3348 }
3349 
3350 /*
3351  * helper function to insert a new root level in the tree.
3352  * A new node is allocated, and a single item is inserted to
3353  * point to the existing root
3354  *
3355  * returns zero on success or < 0 on failure.
3356  */
3357 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3358 			   struct btrfs_root *root,
3359 			   struct btrfs_path *path, int level)
3360 {
3361 	u64 lower_gen;
3362 	struct extent_buffer *lower;
3363 	struct extent_buffer *c;
3364 	struct extent_buffer *old;
3365 	struct btrfs_disk_key lower_key;
3366 
3367 	BUG_ON(path->nodes[level]);
3368 	BUG_ON(path->nodes[level-1] != root->node);
3369 
3370 	lower = path->nodes[level-1];
3371 	if (level == 1)
3372 		btrfs_item_key(lower, &lower_key, 0);
3373 	else
3374 		btrfs_node_key(lower, &lower_key, 0);
3375 
3376 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3377 				   &lower_key, level, root->node->start, 0);
3378 	if (IS_ERR(c))
3379 		return PTR_ERR(c);
3380 
3381 	root_add_used(root, root->nodesize);
3382 
3383 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3384 	btrfs_set_header_nritems(c, 1);
3385 	btrfs_set_header_level(c, level);
3386 	btrfs_set_header_bytenr(c, c->start);
3387 	btrfs_set_header_generation(c, trans->transid);
3388 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3389 	btrfs_set_header_owner(c, root->root_key.objectid);
3390 
3391 	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3392 			    BTRFS_FSID_SIZE);
3393 
3394 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3395 			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3396 
3397 	btrfs_set_node_key(c, &lower_key, 0);
3398 	btrfs_set_node_blockptr(c, 0, lower->start);
3399 	lower_gen = btrfs_header_generation(lower);
3400 	WARN_ON(lower_gen != trans->transid);
3401 
3402 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3403 
3404 	btrfs_mark_buffer_dirty(c);
3405 
3406 	old = root->node;
3407 	tree_mod_log_set_root_pointer(root, c, 0);
3408 	rcu_assign_pointer(root->node, c);
3409 
3410 	/* the super has an extra ref to root->node */
3411 	free_extent_buffer(old);
3412 
3413 	add_root_to_dirty_list(root);
3414 	extent_buffer_get(c);
3415 	path->nodes[level] = c;
3416 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3417 	path->slots[level] = 0;
3418 	return 0;
3419 }
3420 
3421 /*
3422  * worker function to insert a single pointer in a node.
3423  * the node should have enough room for the pointer already
3424  *
3425  * slot and level indicate where you want the key to go, and
3426  * blocknr is the block the key points to.
3427  */
3428 static void insert_ptr(struct btrfs_trans_handle *trans,
3429 		       struct btrfs_root *root, struct btrfs_path *path,
3430 		       struct btrfs_disk_key *key, u64 bytenr,
3431 		       int slot, int level)
3432 {
3433 	struct extent_buffer *lower;
3434 	int nritems;
3435 	int ret;
3436 
3437 	BUG_ON(!path->nodes[level]);
3438 	btrfs_assert_tree_locked(path->nodes[level]);
3439 	lower = path->nodes[level];
3440 	nritems = btrfs_header_nritems(lower);
3441 	BUG_ON(slot > nritems);
3442 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3443 	if (slot != nritems) {
3444 		if (level)
3445 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3446 					     slot, nritems - slot);
3447 		memmove_extent_buffer(lower,
3448 			      btrfs_node_key_ptr_offset(slot + 1),
3449 			      btrfs_node_key_ptr_offset(slot),
3450 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3451 	}
3452 	if (level) {
3453 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3454 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3455 		BUG_ON(ret < 0);
3456 	}
3457 	btrfs_set_node_key(lower, key, slot);
3458 	btrfs_set_node_blockptr(lower, slot, bytenr);
3459 	WARN_ON(trans->transid == 0);
3460 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3461 	btrfs_set_header_nritems(lower, nritems + 1);
3462 	btrfs_mark_buffer_dirty(lower);
3463 }
3464 
3465 /*
3466  * split the node at the specified level in path in two.
3467  * The path is corrected to point to the appropriate node after the split
3468  *
3469  * Before splitting this tries to make some room in the node by pushing
3470  * left and right, if either one works, it returns right away.
3471  *
3472  * returns 0 on success and < 0 on failure
3473  */
3474 static noinline int split_node(struct btrfs_trans_handle *trans,
3475 			       struct btrfs_root *root,
3476 			       struct btrfs_path *path, int level)
3477 {
3478 	struct extent_buffer *c;
3479 	struct extent_buffer *split;
3480 	struct btrfs_disk_key disk_key;
3481 	int mid;
3482 	int ret;
3483 	u32 c_nritems;
3484 
3485 	c = path->nodes[level];
3486 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3487 	if (c == root->node) {
3488 		/*
3489 		 * trying to split the root, let's make a new one
3490 		 *
3491 		 * tree mod log: We don't log removal of the old root in
3492 		 * insert_new_root, because that root buffer will be kept as a
3493 		 * normal node. We are going to log removal of half of the
3494 		 * elements below with tree_mod_log_eb_copy. We're holding a
3495 		 * tree lock on the buffer, which is why we cannot race with
3496 		 * other tree_mod_log users.
3497 		 */
3498 		ret = insert_new_root(trans, root, path, level + 1);
3499 		if (ret)
3500 			return ret;
3501 	} else {
3502 		ret = push_nodes_for_insert(trans, root, path, level);
3503 		c = path->nodes[level];
3504 		if (!ret && btrfs_header_nritems(c) <
3505 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3506 			return 0;
3507 		if (ret < 0)
3508 			return ret;
3509 	}
3510 
3511 	c_nritems = btrfs_header_nritems(c);
3512 	mid = (c_nritems + 1) / 2;
3513 	btrfs_node_key(c, &disk_key, mid);
3514 
3515 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3516 			&disk_key, level, c->start, 0);
3517 	if (IS_ERR(split))
3518 		return PTR_ERR(split);
3519 
3520 	root_add_used(root, root->nodesize);
3521 
3522 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3523 	btrfs_set_header_level(split, btrfs_header_level(c));
3524 	btrfs_set_header_bytenr(split, split->start);
3525 	btrfs_set_header_generation(split, trans->transid);
3526 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3527 	btrfs_set_header_owner(split, root->root_key.objectid);
3528 	write_extent_buffer(split, root->fs_info->fsid,
3529 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
3530 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3531 			    btrfs_header_chunk_tree_uuid(split),
3532 			    BTRFS_UUID_SIZE);
3533 
3534 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3535 				   mid, c_nritems - mid);
3536 	if (ret) {
3537 		btrfs_abort_transaction(trans, ret);
3538 		return ret;
3539 	}
3540 	copy_extent_buffer(split, c,
3541 			   btrfs_node_key_ptr_offset(0),
3542 			   btrfs_node_key_ptr_offset(mid),
3543 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3544 	btrfs_set_header_nritems(split, c_nritems - mid);
3545 	btrfs_set_header_nritems(c, mid);
3546 	ret = 0;
3547 
3548 	btrfs_mark_buffer_dirty(c);
3549 	btrfs_mark_buffer_dirty(split);
3550 
3551 	insert_ptr(trans, root, path, &disk_key, split->start,
3552 		   path->slots[level + 1] + 1, level + 1);
3553 
3554 	if (path->slots[level] >= mid) {
3555 		path->slots[level] -= mid;
3556 		btrfs_tree_unlock(c);
3557 		free_extent_buffer(c);
3558 		path->nodes[level] = split;
3559 		path->slots[level + 1] += 1;
3560 	} else {
3561 		btrfs_tree_unlock(split);
3562 		free_extent_buffer(split);
3563 	}
3564 	return ret;
3565 }
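
/*
 * Editorial note with illustrative numbers: for a full node of 493 key
 * pointers (16KiB nodesize assumed), mid = (493 + 1) / 2 = 247, so the
 * original node keeps the first 247 pointers and the new right sibling
 * receives the remaining 246, leaving both roughly half full.
 */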
3566 
3567 /*
3568  * how many bytes are required to store the items in a leaf.  start
3569  * and nr indicate which items in the leaf to check.  This totals up the
3570  * space used both by the item structs and the item data
3571  */
3572 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3573 {
3574 	struct btrfs_item *start_item;
3575 	struct btrfs_item *end_item;
3576 	struct btrfs_map_token token;
3577 	int data_len;
3578 	int nritems = btrfs_header_nritems(l);
3579 	int end = min(nritems, start + nr) - 1;
3580 
3581 	if (!nr)
3582 		return 0;
3583 	btrfs_init_map_token(&token);
3584 	start_item = btrfs_item_nr(start);
3585 	end_item = btrfs_item_nr(end);
3586 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3587 		btrfs_token_item_size(l, start_item, &token);
3588 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3589 	data_len += sizeof(struct btrfs_item) * nr;
3590 	WARN_ON(data_len < 0);
3591 	return data_len;
3592 }
3593 
3594 /*
3595  * The space between the end of the leaf items and
3596  * the start of the leaf data.  IOW, how much room
3597  * the leaf has left for both items and data
3598  */
3599 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3600 				   struct extent_buffer *leaf)
3601 {
3602 	int nritems = btrfs_header_nritems(leaf);
3603 	int ret;
3604 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3605 	if (ret < 0) {
3606 		btrfs_crit(root->fs_info,
3607 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3608 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3609 		       leaf_space_used(leaf, 0, nritems), nritems);
3610 	}
3611 	return ret;
3612 }
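
/*
 * Editorial note with illustrative numbers: assuming a 16KiB nodesize,
 * BTRFS_LEAF_DATA_SIZE is 16384 - 101 = 16283 bytes (the header is 101
 * bytes).  Each item costs sizeof(struct btrfs_item) == 25 bytes of header
 * plus its data, so e.g. a 160-byte inode item consumes 185 bytes of the
 * leaf.
 */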
3613 
3614 /*
3615  * min_slot controls the lowest index we're willing to push to the
3616  * right.  We'll push up to and including min_slot, but no lower
3617  */
3618 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3619 				      struct btrfs_root *root,
3620 				      struct btrfs_path *path,
3621 				      int data_size, int empty,
3622 				      struct extent_buffer *right,
3623 				      int free_space, u32 left_nritems,
3624 				      u32 min_slot)
3625 {
3626 	struct extent_buffer *left = path->nodes[0];
3627 	struct extent_buffer *upper = path->nodes[1];
3628 	struct btrfs_map_token token;
3629 	struct btrfs_disk_key disk_key;
3630 	int slot;
3631 	u32 i;
3632 	int push_space = 0;
3633 	int push_items = 0;
3634 	struct btrfs_item *item;
3635 	u32 nr;
3636 	u32 right_nritems;
3637 	u32 data_end;
3638 	u32 this_item_size;
3639 
3640 	btrfs_init_map_token(&token);
3641 
3642 	if (empty)
3643 		nr = 0;
3644 	else
3645 		nr = max_t(u32, 1, min_slot);
3646 
3647 	if (path->slots[0] >= left_nritems)
3648 		push_space += data_size;
3649 
3650 	slot = path->slots[1];
3651 	i = left_nritems - 1;
3652 	while (i >= nr) {
3653 		item = btrfs_item_nr(i);
3654 
3655 		if (!empty && push_items > 0) {
3656 			if (path->slots[0] > i)
3657 				break;
3658 			if (path->slots[0] == i) {
3659 				int space = btrfs_leaf_free_space(root, left);
3660 				if (space + push_space * 2 > free_space)
3661 					break;
3662 			}
3663 		}
3664 
3665 		if (path->slots[0] == i)
3666 			push_space += data_size;
3667 
3668 		this_item_size = btrfs_item_size(left, item);
3669 		if (this_item_size + sizeof(*item) + push_space > free_space)
3670 			break;
3671 
3672 		push_items++;
3673 		push_space += this_item_size + sizeof(*item);
3674 		if (i == 0)
3675 			break;
3676 		i--;
3677 	}
3678 
3679 	if (push_items == 0)
3680 		goto out_unlock;
3681 
3682 	WARN_ON(!empty && push_items == left_nritems);
3683 
3684 	/* push left to right */
3685 	right_nritems = btrfs_header_nritems(right);
3686 
3687 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3688 	push_space -= leaf_data_end(root, left);
3689 
3690 	/* make room in the right data area */
3691 	data_end = leaf_data_end(root, right);
3692 	memmove_extent_buffer(right,
3693 			      btrfs_leaf_data(right) + data_end - push_space,
3694 			      btrfs_leaf_data(right) + data_end,
3695 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3696 
3697 	/* copy from the left data area */
3698 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3699 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3700 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3701 		     push_space);
3702 
3703 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3704 			      btrfs_item_nr_offset(0),
3705 			      right_nritems * sizeof(struct btrfs_item));
3706 
3707 	/* copy the items from left to right */
3708 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3709 		   btrfs_item_nr_offset(left_nritems - push_items),
3710 		   push_items * sizeof(struct btrfs_item));
3711 
3712 	/* update the item pointers */
3713 	right_nritems += push_items;
3714 	btrfs_set_header_nritems(right, right_nritems);
3715 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3716 	for (i = 0; i < right_nritems; i++) {
3717 		item = btrfs_item_nr(i);
3718 		push_space -= btrfs_token_item_size(right, item, &token);
3719 		btrfs_set_token_item_offset(right, item, push_space, &token);
3720 	}
3721 
3722 	left_nritems -= push_items;
3723 	btrfs_set_header_nritems(left, left_nritems);
3724 
3725 	if (left_nritems)
3726 		btrfs_mark_buffer_dirty(left);
3727 	else
3728 		clean_tree_block(trans, root->fs_info, left);
3729 
3730 	btrfs_mark_buffer_dirty(right);
3731 
3732 	btrfs_item_key(right, &disk_key, 0);
3733 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3734 	btrfs_mark_buffer_dirty(upper);
3735 
3736 	/* then fixup the leaf pointer in the path */
3737 	if (path->slots[0] >= left_nritems) {
3738 		path->slots[0] -= left_nritems;
3739 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3740 			clean_tree_block(trans, root->fs_info, path->nodes[0]);
3741 		btrfs_tree_unlock(path->nodes[0]);
3742 		free_extent_buffer(path->nodes[0]);
3743 		path->nodes[0] = right;
3744 		path->slots[1] += 1;
3745 	} else {
3746 		btrfs_tree_unlock(right);
3747 		free_extent_buffer(right);
3748 	}
3749 	return 0;
3750 
3751 out_unlock:
3752 	btrfs_tree_unlock(right);
3753 	free_extent_buffer(right);
3754 	return 1;
3755 }
3756 
3757 /*
3758  * push some data in the path leaf to the right, trying to free up at
3759  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3760  *
3761  * returns 1 if the push failed because the other node didn't have enough
3762  * room, 0 if everything worked out and < 0 if there were major errors.
3763  *
3764  * this will push starting from min_slot to the end of the leaf.  It won't
3765  * push any slot lower than min_slot
3766  */
3767 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3768 			   *root, struct btrfs_path *path,
3769 			   int min_data_size, int data_size,
3770 			   int empty, u32 min_slot)
3771 {
3772 	struct extent_buffer *left = path->nodes[0];
3773 	struct extent_buffer *right;
3774 	struct extent_buffer *upper;
3775 	int slot;
3776 	int free_space;
3777 	u32 left_nritems;
3778 	int ret;
3779 
3780 	if (!path->nodes[1])
3781 		return 1;
3782 
3783 	slot = path->slots[1];
3784 	upper = path->nodes[1];
3785 	if (slot >= btrfs_header_nritems(upper) - 1)
3786 		return 1;
3787 
3788 	btrfs_assert_tree_locked(path->nodes[1]);
3789 
3790 	right = read_node_slot(root, upper, slot + 1);
3791 	/*
3792 	 * If slot + 1 is not valid or we failed to read the right node,
3793 	 * it's no big deal, just return.
3794 	 */
3795 	if (IS_ERR(right))
3796 		return 1;
3797 
3798 	btrfs_tree_lock(right);
3799 	btrfs_set_lock_blocking(right);
3800 
3801 	free_space = btrfs_leaf_free_space(root, right);
3802 	if (free_space < data_size)
3803 		goto out_unlock;
3804 
3805 	/* cow and double check */
3806 	ret = btrfs_cow_block(trans, root, right, upper,
3807 			      slot + 1, &right);
3808 	if (ret)
3809 		goto out_unlock;
3810 
3811 	free_space = btrfs_leaf_free_space(root, right);
3812 	if (free_space < data_size)
3813 		goto out_unlock;
3814 
3815 	left_nritems = btrfs_header_nritems(left);
3816 	if (left_nritems == 0)
3817 		goto out_unlock;
3818 
3819 	if (path->slots[0] == left_nritems && !empty) {
3820 		/* Key greater than all keys in the leaf, right neighbor has
3821 		 * enough room for it and we're not emptying our leaf to delete
3822 		 * it, therefore use right neighbor to insert the new item and
3823 		 * no need to touch/dirty our left leaf. */
3824 		btrfs_tree_unlock(left);
3825 		free_extent_buffer(left);
3826 		path->nodes[0] = right;
3827 		path->slots[0] = 0;
3828 		path->slots[1]++;
3829 		return 0;
3830 	}
3831 
3832 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3833 				right, free_space, left_nritems, min_slot);
3834 out_unlock:
3835 	btrfs_tree_unlock(right);
3836 	free_extent_buffer(right);
3837 	return 1;
3838 }
3839 
3840 /*
3841  * push some data in the path leaf to the left, trying to free up at
3842  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3843  *
3844  * max_slot can put a limit on how far into the leaf we'll push items.  The
3845  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3846  * items
3847  */
3848 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3849 				     struct btrfs_root *root,
3850 				     struct btrfs_path *path, int data_size,
3851 				     int empty, struct extent_buffer *left,
3852 				     int free_space, u32 right_nritems,
3853 				     u32 max_slot)
3854 {
3855 	struct btrfs_disk_key disk_key;
3856 	struct extent_buffer *right = path->nodes[0];
3857 	int i;
3858 	int push_space = 0;
3859 	int push_items = 0;
3860 	struct btrfs_item *item;
3861 	u32 old_left_nritems;
3862 	u32 nr;
3863 	int ret = 0;
3864 	u32 this_item_size;
3865 	u32 old_left_item_size;
3866 	struct btrfs_map_token token;
3867 
3868 	btrfs_init_map_token(&token);
3869 
3870 	if (empty)
3871 		nr = min(right_nritems, max_slot);
3872 	else
3873 		nr = min(right_nritems - 1, max_slot);
3874 
3875 	for (i = 0; i < nr; i++) {
3876 		item = btrfs_item_nr(i);
3877 
3878 		if (!empty && push_items > 0) {
3879 			if (path->slots[0] < i)
3880 				break;
3881 			if (path->slots[0] == i) {
3882 				int space = btrfs_leaf_free_space(root, right);
3883 				if (space + push_space * 2 > free_space)
3884 					break;
3885 			}
3886 		}
3887 
3888 		if (path->slots[0] == i)
3889 			push_space += data_size;
3890 
3891 		this_item_size = btrfs_item_size(right, item);
3892 		if (this_item_size + sizeof(*item) + push_space > free_space)
3893 			break;
3894 
3895 		push_items++;
3896 		push_space += this_item_size + sizeof(*item);
3897 	}
3898 
3899 	if (push_items == 0) {
3900 		ret = 1;
3901 		goto out;
3902 	}
3903 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3904 
3905 	/* push data from right to left */
3906 	copy_extent_buffer(left, right,
3907 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3908 			   btrfs_item_nr_offset(0),
3909 			   push_items * sizeof(struct btrfs_item));
3910 
3911 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3912 		     btrfs_item_offset_nr(right, push_items - 1);
3913 
3914 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3915 		     leaf_data_end(root, left) - push_space,
3916 		     btrfs_leaf_data(right) +
3917 		     btrfs_item_offset_nr(right, push_items - 1),
3918 		     push_space);
3919 	old_left_nritems = btrfs_header_nritems(left);
3920 	BUG_ON(old_left_nritems <= 0);
3921 
3922 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3923 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3924 		u32 ioff;
3925 
3926 		item = btrfs_item_nr(i);
3927 
3928 		ioff = btrfs_token_item_offset(left, item, &token);
3929 		btrfs_set_token_item_offset(left, item,
3930 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3931 		      &token);
3932 	}
3933 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3934 
3935 	/* fixup right node */
3936 	if (push_items > right_nritems)
3937 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3938 		       right_nritems);
3939 
3940 	if (push_items < right_nritems) {
3941 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3942 						  leaf_data_end(root, right);
3943 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3944 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3945 				      btrfs_leaf_data(right) +
3946 				      leaf_data_end(root, right), push_space);
3947 
3948 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3949 			      btrfs_item_nr_offset(push_items),
3950 			     (btrfs_header_nritems(right) - push_items) *
3951 			     sizeof(struct btrfs_item));
3952 	}
3953 	right_nritems -= push_items;
3954 	btrfs_set_header_nritems(right, right_nritems);
3955 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3956 	for (i = 0; i < right_nritems; i++) {
3957 		item = btrfs_item_nr(i);
3958 
3959 		push_space = push_space - btrfs_token_item_size(right,
3960 								item, &token);
3961 		btrfs_set_token_item_offset(right, item, push_space, &token);
3962 	}
3963 
3964 	btrfs_mark_buffer_dirty(left);
3965 	if (right_nritems)
3966 		btrfs_mark_buffer_dirty(right);
3967 	else
3968 		clean_tree_block(trans, root->fs_info, right);
3969 
3970 	btrfs_item_key(right, &disk_key, 0);
3971 	fixup_low_keys(root->fs_info, path, &disk_key, 1);
3972 
3973 	/* then fixup the leaf pointer in the path */
3974 	if (path->slots[0] < push_items) {
3975 		path->slots[0] += old_left_nritems;
3976 		btrfs_tree_unlock(path->nodes[0]);
3977 		free_extent_buffer(path->nodes[0]);
3978 		path->nodes[0] = left;
3979 		path->slots[1] -= 1;
3980 	} else {
3981 		btrfs_tree_unlock(left);
3982 		free_extent_buffer(left);
3983 		path->slots[0] -= push_items;
3984 	}
3985 	BUG_ON(path->slots[0] < 0);
3986 	return ret;
3987 out:
3988 	btrfs_tree_unlock(left);
3989 	free_extent_buffer(left);
3990 	return ret;
3991 }
3992 
3993 /*
3994  * push some data in the path leaf to the left, trying to free up at
3995  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3996  *
3997  * max_slot can put a limit on how far into the leaf we'll push items.  The
3998  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3999  * items
4000  */
4001 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4002 			  *root, struct btrfs_path *path, int min_data_size,
4003 			  int data_size, int empty, u32 max_slot)
4004 {
4005 	struct extent_buffer *right = path->nodes[0];
4006 	struct extent_buffer *left;
4007 	int slot;
4008 	int free_space;
4009 	u32 right_nritems;
4010 	int ret = 0;
4011 
4012 	slot = path->slots[1];
4013 	if (slot == 0)
4014 		return 1;
4015 	if (!path->nodes[1])
4016 		return 1;
4017 
4018 	right_nritems = btrfs_header_nritems(right);
4019 	if (right_nritems == 0)
4020 		return 1;
4021 
4022 	btrfs_assert_tree_locked(path->nodes[1]);
4023 
4024 	left = read_node_slot(root, path->nodes[1], slot - 1);
4025 	/*
4026 	 * If slot - 1 is not valid or we failed to read the left node,
4027 	 * it's no big deal, just return.
4028 	 */
4029 	if (IS_ERR(left))
4030 		return 1;
4031 
4032 	btrfs_tree_lock(left);
4033 	btrfs_set_lock_blocking(left);
4034 
4035 	free_space = btrfs_leaf_free_space(root, left);
4036 	if (free_space < data_size) {
4037 		ret = 1;
4038 		goto out;
4039 	}
4040 
4041 	/* cow and double check */
4042 	ret = btrfs_cow_block(trans, root, left,
4043 			      path->nodes[1], slot - 1, &left);
4044 	if (ret) {
4045 		/* we hit -ENOSPC, but it isn't fatal here */
4046 		if (ret == -ENOSPC)
4047 			ret = 1;
4048 		goto out;
4049 	}
4050 
4051 	free_space = btrfs_leaf_free_space(root, left);
4052 	if (free_space < data_size) {
4053 		ret = 1;
4054 		goto out;
4055 	}
4056 
4057 	return __push_leaf_left(trans, root, path, min_data_size,
4058 			       empty, left, free_space, right_nritems,
4059 			       max_slot);
4060 out:
4061 	btrfs_tree_unlock(left);
4062 	free_extent_buffer(left);
4063 	return ret;
4064 }
4065 
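/*
 * Hypothetical sketch of the usual caller pattern: try to make 'size'
 * bytes of room by pushing right, then left.  The same sequence, with
 * extra bookkeeping and a free-space recheck, appears in split_leaf()
 * below.  Returns 0 if a push worked, 1 if neither neighbor had room,
 * < 0 on error.
 */
static int try_push_for_space(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_path *path, int size)
{
	int ret;

	ret = push_leaf_right(trans, root, path, size, size, 0, 0);
	if (ret <= 0)
		return ret;
	return push_leaf_left(trans, root, path, size, size, 0, (u32)-1);
}
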
4066 /*
4067  * copy items from leaf l, starting at slot mid, into the new leaf 'right',
4068  * as the second stage of a leaf split, and fix up the path afterwards.
4069  */
4070 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4071 				    struct btrfs_root *root,
4072 				    struct btrfs_path *path,
4073 				    struct extent_buffer *l,
4074 				    struct extent_buffer *right,
4075 				    int slot, int mid, int nritems)
4076 {
4077 	int data_copy_size;
4078 	int rt_data_off;
4079 	int i;
4080 	struct btrfs_disk_key disk_key;
4081 	struct btrfs_map_token token;
4082 
4083 	btrfs_init_map_token(&token);
4084 
4085 	nritems = nritems - mid;
4086 	btrfs_set_header_nritems(right, nritems);
4087 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4088 
4089 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4090 			   btrfs_item_nr_offset(mid),
4091 			   nritems * sizeof(struct btrfs_item));
4092 
4093 	copy_extent_buffer(right, l,
4094 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4095 		     data_copy_size, btrfs_leaf_data(l) +
4096 		     leaf_data_end(root, l), data_copy_size);
4097 
4098 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4099 		      btrfs_item_end_nr(l, mid);
4100 
4101 	for (i = 0; i < nritems; i++) {
4102 		struct btrfs_item *item = btrfs_item_nr(i);
4103 		u32 ioff;
4104 
4105 		ioff = btrfs_token_item_offset(right, item, &token);
4106 		btrfs_set_token_item_offset(right, item,
4107 					    ioff + rt_data_off, &token);
4108 	}
4109 
4110 	btrfs_set_header_nritems(l, mid);
4111 	btrfs_item_key(right, &disk_key, 0);
4112 	insert_ptr(trans, root, path, &disk_key, right->start,
4113 		   path->slots[1] + 1, 1);
4114 
4115 	btrfs_mark_buffer_dirty(right);
4116 	btrfs_mark_buffer_dirty(l);
4117 	BUG_ON(path->slots[0] != slot);
4118 
4119 	if (mid <= slot) {
4120 		btrfs_tree_unlock(path->nodes[0]);
4121 		free_extent_buffer(path->nodes[0]);
4122 		path->nodes[0] = right;
4123 		path->slots[0] -= mid;
4124 		path->slots[1] += 1;
4125 	} else {
4126 		btrfs_tree_unlock(right);
4127 		free_extent_buffer(right);
4128 	}
4129 
4130 	BUG_ON(path->slots[0] < 0);
4131 }
4132 
4133 /*
4134  * double splits happen when we need to insert a big item in the middle
4135  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4136  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4137  *          A                 B                 C
4138  *
4139  * We avoid this by trying to push the items on either side of our target
4140  * into the adjacent leaves.  If all goes well we can avoid the double split
4141  * completely.
4142  */
4143 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4144 					  struct btrfs_root *root,
4145 					  struct btrfs_path *path,
4146 					  int data_size)
4147 {
4148 	int ret;
4149 	int progress = 0;
4150 	int slot;
4151 	u32 nritems;
4152 	int space_needed = data_size;
4153 
4154 	slot = path->slots[0];
4155 	if (slot < btrfs_header_nritems(path->nodes[0]))
4156 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4157 
4158 	/*
4159 	 * try to push all the items after our slot into the
4160 	 * right leaf
4161 	 */
4162 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4163 	if (ret < 0)
4164 		return ret;
4165 
4166 	if (ret == 0)
4167 		progress++;
4168 
4169 	nritems = btrfs_header_nritems(path->nodes[0]);
4170 	/*
4171 	 * our goal is to get our slot at the start or end of a leaf.  If
4172 	 * we've done so we're done
4173 	 */
4174 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4175 		return 0;
4176 
4177 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4178 		return 0;
4179 
4180 	/* try to push all the items before our slot into the left leaf */
4181 	slot = path->slots[0];
4182 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4183 	if (ret < 0)
4184 		return ret;
4185 
4186 	if (ret == 0)
4187 		progress++;
4188 
4189 	if (progress)
4190 		return 0;
4191 	return 1;
4192 }
4193 
4194 /*
4195  * split the path's leaf in two, making sure there is at least data_size
4196  * available for the resulting leaf level of the path.
4197  *
4198  * returns 0 if all went well and < 0 on failure.
4199  */
4200 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4201 			       struct btrfs_root *root,
4202 			       struct btrfs_key *ins_key,
4203 			       struct btrfs_path *path, int data_size,
4204 			       int extend)
4205 {
4206 	struct btrfs_disk_key disk_key;
4207 	struct extent_buffer *l;
4208 	u32 nritems;
4209 	int mid;
4210 	int slot;
4211 	struct extent_buffer *right;
4212 	struct btrfs_fs_info *fs_info = root->fs_info;
4213 	int ret = 0;
4214 	int wret;
4215 	int split;
4216 	int num_doubles = 0;
4217 	int tried_avoid_double = 0;
4218 
4219 	l = path->nodes[0];
4220 	slot = path->slots[0];
4221 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4222 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4223 		return -EOVERFLOW;
4224 
4225 	/* first try to make some room by pushing left and right */
4226 	if (data_size && path->nodes[1]) {
4227 		int space_needed = data_size;
4228 
4229 		if (slot < btrfs_header_nritems(l))
4230 			space_needed -= btrfs_leaf_free_space(root, l);
4231 
4232 		wret = push_leaf_right(trans, root, path, space_needed,
4233 				       space_needed, 0, 0);
4234 		if (wret < 0)
4235 			return wret;
4236 		if (wret) {
4237 			wret = push_leaf_left(trans, root, path, space_needed,
4238 					      space_needed, 0, (u32)-1);
4239 			if (wret < 0)
4240 				return wret;
4241 		}
4242 		l = path->nodes[0];
4243 
4244 		/* did the pushes work? */
4245 		if (btrfs_leaf_free_space(root, l) >= data_size)
4246 			return 0;
4247 	}
4248 
4249 	if (!path->nodes[1]) {
4250 		ret = insert_new_root(trans, root, path, 1);
4251 		if (ret)
4252 			return ret;
4253 	}
4254 again:
4255 	split = 1;
4256 	l = path->nodes[0];
4257 	slot = path->slots[0];
4258 	nritems = btrfs_header_nritems(l);
4259 	mid = (nritems + 1) / 2;
4260 
4261 	if (mid <= slot) {
4262 		if (nritems == 1 ||
4263 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4264 			BTRFS_LEAF_DATA_SIZE(root)) {
4265 			if (slot >= nritems) {
4266 				split = 0;
4267 			} else {
4268 				mid = slot;
4269 				if (mid != nritems &&
4270 				    leaf_space_used(l, mid, nritems - mid) +
4271 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4272 					if (data_size && !tried_avoid_double)
4273 						goto push_for_double;
4274 					split = 2;
4275 				}
4276 			}
4277 		}
4278 	} else {
4279 		if (leaf_space_used(l, 0, mid) + data_size >
4280 			BTRFS_LEAF_DATA_SIZE(root)) {
4281 			if (!extend && data_size && slot == 0) {
4282 				split = 0;
4283 			} else if ((extend || !data_size) && slot == 0) {
4284 				mid = 1;
4285 			} else {
4286 				mid = slot;
4287 				if (mid != nritems &&
4288 				    leaf_space_used(l, mid, nritems - mid) +
4289 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4290 					if (data_size && !tried_avoid_double)
4291 						goto push_for_double;
4292 					split = 2;
4293 				}
4294 			}
4295 		}
4296 	}
4297 
4298 	if (split == 0)
4299 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4300 	else
4301 		btrfs_item_key(l, &disk_key, mid);
4302 
4303 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4304 			&disk_key, 0, l->start, 0);
4305 	if (IS_ERR(right))
4306 		return PTR_ERR(right);
4307 
4308 	root_add_used(root, root->nodesize);
4309 
4310 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4311 	btrfs_set_header_bytenr(right, right->start);
4312 	btrfs_set_header_generation(right, trans->transid);
4313 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4314 	btrfs_set_header_owner(right, root->root_key.objectid);
4315 	btrfs_set_header_level(right, 0);
4316 	write_extent_buffer(right, fs_info->fsid,
4317 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
4318 
4319 	write_extent_buffer(right, fs_info->chunk_tree_uuid,
4320 			    btrfs_header_chunk_tree_uuid(right),
4321 			    BTRFS_UUID_SIZE);
4322 
4323 	if (split == 0) {
4324 		if (mid <= slot) {
4325 			btrfs_set_header_nritems(right, 0);
4326 			insert_ptr(trans, root, path, &disk_key, right->start,
4327 				   path->slots[1] + 1, 1);
4328 			btrfs_tree_unlock(path->nodes[0]);
4329 			free_extent_buffer(path->nodes[0]);
4330 			path->nodes[0] = right;
4331 			path->slots[0] = 0;
4332 			path->slots[1] += 1;
4333 		} else {
4334 			btrfs_set_header_nritems(right, 0);
4335 			insert_ptr(trans, root, path, &disk_key, right->start,
4336 					  path->slots[1], 1);
4337 			btrfs_tree_unlock(path->nodes[0]);
4338 			free_extent_buffer(path->nodes[0]);
4339 			path->nodes[0] = right;
4340 			path->slots[0] = 0;
4341 			if (path->slots[1] == 0)
4342 				fixup_low_keys(fs_info, path, &disk_key, 1);
4343 		}
4344 		btrfs_mark_buffer_dirty(right);
4345 		return ret;
4346 	}
4347 
4348 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4349 
4350 	if (split == 2) {
4351 		BUG_ON(num_doubles != 0);
4352 		num_doubles++;
4353 		goto again;
4354 	}
4355 
4356 	return 0;
4357 
4358 push_for_double:
4359 	push_for_double_split(trans, root, path, data_size);
4360 	tried_avoid_double = 1;
4361 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4362 		return 0;
4363 	goto again;
4364 }
4365 
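/*
 * Hypothetical sketch: callers normally do not invoke split_leaf()
 * directly.  Passing a non-zero ins_len to btrfs_search_slot() makes
 * the search split full leaves on the way down, so a subsequent
 * insert of data_size bytes (plus one item header) cannot fail for
 * lack of leaf space.
 */
static int search_with_room_sketch(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_key *key,
				   struct btrfs_path *path, u32 data_size)
{
	return btrfs_search_slot(trans, root, key, path,
				 (int)(data_size +
				       sizeof(struct btrfs_item)), 1);
}
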
4366 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4367 					 struct btrfs_root *root,
4368 					 struct btrfs_path *path, int ins_len)
4369 {
4370 	struct btrfs_key key;
4371 	struct extent_buffer *leaf;
4372 	struct btrfs_file_extent_item *fi;
4373 	u64 extent_len = 0;
4374 	u32 item_size;
4375 	int ret;
4376 
4377 	leaf = path->nodes[0];
4378 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4379 
4380 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4381 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4382 
4383 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4384 		return 0;
4385 
4386 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4387 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4388 		fi = btrfs_item_ptr(leaf, path->slots[0],
4389 				    struct btrfs_file_extent_item);
4390 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4391 	}
4392 	btrfs_release_path(path);
4393 
4394 	path->keep_locks = 1;
4395 	path->search_for_split = 1;
4396 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4397 	path->search_for_split = 0;
4398 	if (ret > 0)
4399 		ret = -EAGAIN;
4400 	if (ret < 0)
4401 		goto err;
4402 
4403 	ret = -EAGAIN;
4404 	leaf = path->nodes[0];
4405 	/* if our item isn't there, return now */
4406 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4407 		goto err;
4408 
4409 	/* the leaf has changed, it now has room.  return now */
4410 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4411 		goto err;
4412 
4413 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4414 		fi = btrfs_item_ptr(leaf, path->slots[0],
4415 				    struct btrfs_file_extent_item);
4416 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4417 			goto err;
4418 	}
4419 
4420 	btrfs_set_path_blocking(path);
4421 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4422 	if (ret)
4423 		goto err;
4424 
4425 	path->keep_locks = 0;
4426 	btrfs_unlock_up_safe(path, 1);
4427 	return 0;
4428 err:
4429 	path->keep_locks = 0;
4430 	return ret;
4431 }
4432 
4433 static noinline int split_item(struct btrfs_trans_handle *trans,
4434 			       struct btrfs_root *root,
4435 			       struct btrfs_path *path,
4436 			       struct btrfs_key *new_key,
4437 			       unsigned long split_offset)
4438 {
4439 	struct extent_buffer *leaf;
4440 	struct btrfs_item *item;
4441 	struct btrfs_item *new_item;
4442 	int slot;
4443 	char *buf;
4444 	u32 nritems;
4445 	u32 item_size;
4446 	u32 orig_offset;
4447 	struct btrfs_disk_key disk_key;
4448 
4449 	leaf = path->nodes[0];
4450 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4451 
4452 	btrfs_set_path_blocking(path);
4453 
4454 	item = btrfs_item_nr(path->slots[0]);
4455 	orig_offset = btrfs_item_offset(leaf, item);
4456 	item_size = btrfs_item_size(leaf, item);
4457 
4458 	buf = kmalloc(item_size, GFP_NOFS);
4459 	if (!buf)
4460 		return -ENOMEM;
4461 
4462 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4463 			    path->slots[0]), item_size);
4464 
4465 	slot = path->slots[0] + 1;
4466 	nritems = btrfs_header_nritems(leaf);
4467 	if (slot != nritems) {
4468 		/* shift the items */
4469 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4470 				btrfs_item_nr_offset(slot),
4471 				(nritems - slot) * sizeof(struct btrfs_item));
4472 	}
4473 
4474 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4475 	btrfs_set_item_key(leaf, &disk_key, slot);
4476 
4477 	new_item = btrfs_item_nr(slot);
4478 
4479 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4480 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4481 
4482 	btrfs_set_item_offset(leaf, item,
4483 			      orig_offset + item_size - split_offset);
4484 	btrfs_set_item_size(leaf, item, split_offset);
4485 
4486 	btrfs_set_header_nritems(leaf, nritems + 1);
4487 
4488 	/* write the data for the start of the original item */
4489 	write_extent_buffer(leaf, buf,
4490 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4491 			    split_offset);
4492 
4493 	/* write the data for the new item */
4494 	write_extent_buffer(leaf, buf + split_offset,
4495 			    btrfs_item_ptr_offset(leaf, slot),
4496 			    item_size - split_offset);
4497 	btrfs_mark_buffer_dirty(leaf);
4498 
4499 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4500 	kfree(buf);
4501 	return 0;
4502 }
4503 
4504 /*
4505  * This function splits a single item into two items,
4506  * giving 'new_key' to the new item and splitting the
4507  * old one at split_offset (from the start of the item).
4508  *
4509  * The path may be released by this operation.  After
4510  * the split, the path is pointing to the old item.  The
4511  * new item is going to be in the same node as the old one.
4512  *
4513  * Note, the item being split must be small enough to live alone on
4514  * a tree block with room for one extra struct btrfs_item
4515  *
4516  * This allows us to split the item in place, keeping a lock on the
4517  * leaf the entire time.
4518  */
4519 int btrfs_split_item(struct btrfs_trans_handle *trans,
4520 		     struct btrfs_root *root,
4521 		     struct btrfs_path *path,
4522 		     struct btrfs_key *new_key,
4523 		     unsigned long split_offset)
4524 {
4525 	int ret;
4526 	ret = setup_leaf_for_split(trans, root, path,
4527 				   sizeof(struct btrfs_item));
4528 	if (ret)
4529 		return ret;
4530 
4531 	ret = split_item(trans, root, path, new_key, split_offset);
4532 	return ret;
4533 }
4534 
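/*
 * Hypothetical sketch of the semantics above: keep the first
 * front_bytes of the item under its old key; the remaining bytes
 * become a new item under tail_key in the same leaf, and the path
 * stays pointed at the shortened original item.
 */
static int split_tail_sketch(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_key *tail_key,
			     unsigned long front_bytes)
{
	return btrfs_split_item(trans, root, path, tail_key, front_bytes);
}
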
4535 /*
4536  * This function duplicates an item, giving 'new_key' to the new item.
4537  * It guarantees both items live in the same tree leaf and the new item
4538  * is contiguous with the original item.
4539  *
4540  * This allows us to split a file extent in place, keeping a lock on the
4541  * leaf the entire time.
4542  */
4543 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4544 			 struct btrfs_root *root,
4545 			 struct btrfs_path *path,
4546 			 struct btrfs_key *new_key)
4547 {
4548 	struct extent_buffer *leaf;
4549 	int ret;
4550 	u32 item_size;
4551 
4552 	leaf = path->nodes[0];
4553 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4554 	ret = setup_leaf_for_split(trans, root, path,
4555 				   item_size + sizeof(struct btrfs_item));
4556 	if (ret)
4557 		return ret;
4558 
4559 	path->slots[0]++;
4560 	setup_items_for_insert(root, path, new_key, &item_size,
4561 			       item_size, item_size +
4562 			       sizeof(struct btrfs_item), 1);
4563 	leaf = path->nodes[0];
4564 	memcpy_extent_buffer(leaf,
4565 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4566 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4567 			     item_size);
4568 	return 0;
4569 }
4570 
4571 /*
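/*
 * Hypothetical sketch of the duplicate-then-trim pattern this helper
 * enables when splitting a file extent: clone the item under a new
 * key, then shrink each copy.  The field updates on the two extent
 * items are elided here.
 */
static int dup_extent_item_sketch(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key new_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;
	ret = btrfs_duplicate_item(trans, root, path, &new_key);
	if (ret)
		return ret;
	/* the path now points at the new copy; adjust both items here */
	return 0;
}
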
4572  * make the item pointed to by the path smaller.  new_size indicates
4573  * how small to make it, and from_end tells us if we just chop bytes
4574  * off the end of the item or if we shift the item to chop bytes off
4575  * the front.
4576  */
4577 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4578 			 u32 new_size, int from_end)
4579 {
4580 	int slot;
4581 	struct extent_buffer *leaf;
4582 	struct btrfs_item *item;
4583 	u32 nritems;
4584 	unsigned int data_end;
4585 	unsigned int old_data_start;
4586 	unsigned int old_size;
4587 	unsigned int size_diff;
4588 	int i;
4589 	struct btrfs_map_token token;
4590 
4591 	btrfs_init_map_token(&token);
4592 
4593 	leaf = path->nodes[0];
4594 	slot = path->slots[0];
4595 
4596 	old_size = btrfs_item_size_nr(leaf, slot);
4597 	if (old_size == new_size)
4598 		return;
4599 
4600 	nritems = btrfs_header_nritems(leaf);
4601 	data_end = leaf_data_end(root, leaf);
4602 
4603 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4604 
4605 	size_diff = old_size - new_size;
4606 
4607 	BUG_ON(slot < 0);
4608 	BUG_ON(slot >= nritems);
4609 
4610 	/*
4611 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4612 	 */
4613 	/* first correct the data pointers */
4614 	for (i = slot; i < nritems; i++) {
4615 		u32 ioff;
4616 		item = btrfs_item_nr(i);
4617 
4618 		ioff = btrfs_token_item_offset(leaf, item, &token);
4619 		btrfs_set_token_item_offset(leaf, item,
4620 					    ioff + size_diff, &token);
4621 	}
4622 
4623 	/* shift the data */
4624 	if (from_end) {
4625 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4626 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4627 			      data_end, old_data_start + new_size - data_end);
4628 	} else {
4629 		struct btrfs_disk_key disk_key;
4630 		u64 offset;
4631 
4632 		btrfs_item_key(leaf, &disk_key, slot);
4633 
4634 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4635 			unsigned long ptr;
4636 			struct btrfs_file_extent_item *fi;
4637 
4638 			fi = btrfs_item_ptr(leaf, slot,
4639 					    struct btrfs_file_extent_item);
4640 			fi = (struct btrfs_file_extent_item *)(
4641 			     (unsigned long)fi - size_diff);
4642 
4643 			if (btrfs_file_extent_type(leaf, fi) ==
4644 			    BTRFS_FILE_EXTENT_INLINE) {
4645 				ptr = btrfs_item_ptr_offset(leaf, slot);
4646 				memmove_extent_buffer(leaf, ptr,
4647 				      (unsigned long)fi,
4648 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4649 			}
4650 		}
4651 
4652 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4653 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4654 			      data_end, old_data_start - data_end);
4655 
4656 		offset = btrfs_disk_key_offset(&disk_key);
4657 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4658 		btrfs_set_item_key(leaf, &disk_key, slot);
4659 		if (slot == 0)
4660 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
4661 	}
4662 
4663 	item = btrfs_item_nr(slot);
4664 	btrfs_set_item_size(leaf, item, new_size);
4665 	btrfs_mark_buffer_dirty(leaf);
4666 
4667 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4668 		btrfs_print_leaf(root, leaf);
4669 		BUG();
4670 	}
4671 }
4672 
4673 /*
4674  * make the item pointed to by the path bigger, data_size is the added size.
4675  */
4676 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4677 		       u32 data_size)
4678 {
4679 	int slot;
4680 	struct extent_buffer *leaf;
4681 	struct btrfs_item *item;
4682 	u32 nritems;
4683 	unsigned int data_end;
4684 	unsigned int old_data;
4685 	unsigned int old_size;
4686 	int i;
4687 	struct btrfs_map_token token;
4688 
4689 	btrfs_init_map_token(&token);
4690 
4691 	leaf = path->nodes[0];
4692 
4693 	nritems = btrfs_header_nritems(leaf);
4694 	data_end = leaf_data_end(root, leaf);
4695 
4696 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4697 		btrfs_print_leaf(root, leaf);
4698 		BUG();
4699 	}
4700 	slot = path->slots[0];
4701 	old_data = btrfs_item_end_nr(leaf, slot);
4702 
4703 	BUG_ON(slot < 0);
4704 	if (slot >= nritems) {
4705 		btrfs_print_leaf(root, leaf);
4706 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4707 		       slot, nritems);
4708 		BUG_ON(1);
4709 	}
4710 
4711 	/*
4712 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4713 	 */
4714 	/* first correct the data pointers */
4715 	for (i = slot; i < nritems; i++) {
4716 		u32 ioff;
4717 		item = btrfs_item_nr(i);
4718 
4719 		ioff = btrfs_token_item_offset(leaf, item, &token);
4720 		btrfs_set_token_item_offset(leaf, item,
4721 					    ioff - data_size, &token);
4722 	}
4723 
4724 	/* shift the data */
4725 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4726 		      data_end - data_size, btrfs_leaf_data(leaf) +
4727 		      data_end, old_data - data_end);
4728 
4729 	data_end = old_data;
4730 	old_size = btrfs_item_size_nr(leaf, slot);
4731 	item = btrfs_item_nr(slot);
4732 	btrfs_set_item_size(leaf, item, old_size + data_size);
4733 	btrfs_mark_buffer_dirty(leaf);
4734 
4735 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4736 		btrfs_print_leaf(root, leaf);
4737 		BUG();
4738 	}
4739 }
4740 
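/*
 * Hypothetical sketch pairing the two helpers above: resize the item
 * the path points at.  Truncation always fits; extension assumes the
 * caller already reserved enough leaf free space.
 */
static void resize_item_sketch(struct btrfs_root *root,
			       struct btrfs_path *path, u32 new_size)
{
	u32 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

	if (new_size < old_size)
		btrfs_truncate_item(root, path, new_size, 1);
	else if (new_size > old_size)
		btrfs_extend_item(root, path, new_size - old_size);
}
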
4741 /*
4742  * this is a helper for btrfs_insert_empty_items, the main goal here is
4743  * to save stack depth by doing the bulk of the work in a function
4744  * that doesn't call btrfs_search_slot
4745  */
4746 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4747 			    struct btrfs_key *cpu_key, u32 *data_size,
4748 			    u32 total_data, u32 total_size, int nr)
4749 {
4750 	struct btrfs_item *item;
4751 	int i;
4752 	u32 nritems;
4753 	unsigned int data_end;
4754 	struct btrfs_disk_key disk_key;
4755 	struct extent_buffer *leaf;
4756 	int slot;
4757 	struct btrfs_map_token token;
4758 
4759 	if (path->slots[0] == 0) {
4760 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4761 		fixup_low_keys(root->fs_info, path, &disk_key, 1);
4762 	}
4763 	btrfs_unlock_up_safe(path, 1);
4764 
4765 	btrfs_init_map_token(&token);
4766 
4767 	leaf = path->nodes[0];
4768 	slot = path->slots[0];
4769 
4770 	nritems = btrfs_header_nritems(leaf);
4771 	data_end = leaf_data_end(root, leaf);
4772 
4773 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4774 		btrfs_print_leaf(root, leaf);
4775 		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4776 		       total_size, btrfs_leaf_free_space(root, leaf));
4777 		BUG();
4778 	}
4779 
4780 	if (slot != nritems) {
4781 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4782 
4783 		if (old_data < data_end) {
4784 			btrfs_print_leaf(root, leaf);
4785 			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4786 			       slot, old_data, data_end);
4787 			BUG_ON(1);
4788 		}
4789 		/*
4790 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4791 		 */
4792 		/* first correct the data pointers */
4793 		for (i = slot; i < nritems; i++) {
4794 			u32 ioff;
4795 
4796 			item = btrfs_item_nr(i);
4797 			ioff = btrfs_token_item_offset(leaf, item, &token);
4798 			btrfs_set_token_item_offset(leaf, item,
4799 						    ioff - total_data, &token);
4800 		}
4801 		/* shift the items */
4802 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4803 			      btrfs_item_nr_offset(slot),
4804 			      (nritems - slot) * sizeof(struct btrfs_item));
4805 
4806 		/* shift the data */
4807 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4808 			      data_end - total_data, btrfs_leaf_data(leaf) +
4809 			      data_end, old_data - data_end);
4810 		data_end = old_data;
4811 	}
4812 
4813 	/* setup the item for the new data */
4814 	for (i = 0; i < nr; i++) {
4815 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4816 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4817 		item = btrfs_item_nr(slot + i);
4818 		btrfs_set_token_item_offset(leaf, item,
4819 					    data_end - data_size[i], &token);
4820 		data_end -= data_size[i];
4821 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4822 	}
4823 
4824 	btrfs_set_header_nritems(leaf, nritems + nr);
4825 	btrfs_mark_buffer_dirty(leaf);
4826 
4827 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4828 		btrfs_print_leaf(root, leaf);
4829 		BUG();
4830 	}
4831 }
4832 
4833 /*
4834  * Given a key and some data, insert items into the tree.
4835  * This does all the path init required, making room in the tree if needed.
4836  */
4837 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4838 			    struct btrfs_root *root,
4839 			    struct btrfs_path *path,
4840 			    struct btrfs_key *cpu_key, u32 *data_size,
4841 			    int nr)
4842 {
4843 	int ret = 0;
4844 	int slot;
4845 	int i;
4846 	u32 total_size = 0;
4847 	u32 total_data = 0;
4848 
4849 	for (i = 0; i < nr; i++)
4850 		total_data += data_size[i];
4851 
4852 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4853 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4854 	if (ret == 0)
4855 		return -EEXIST;
4856 	if (ret < 0)
4857 		return ret;
4858 
4859 	slot = path->slots[0];
4860 	BUG_ON(slot < 0);
4861 
4862 	setup_items_for_insert(root, path, cpu_key, data_size,
4863 			       total_data, total_size, nr);
4864 	return 0;
4865 }
4866 
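/*
 * Hypothetical sketch: reserve two adjacent items with one search and
 * fill their data areas.  keys[] must already be in key order and
 * sizes[] must match the data in bufs[].
 */
static int insert_pair_sketch(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *keys,
			      void *bufs[2], u32 sizes[2])
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int i;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (!ret) {
		leaf = path->nodes[0];
		for (i = 0; i < 2; i++)
			write_extent_buffer(leaf, bufs[i],
					    btrfs_item_ptr_offset(leaf,
						path->slots[0] + i),
					    sizes[i]);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}
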
4867 /*
4868  * Given a key and some data, insert an item into the tree.
4869  * This does all the path init required, making room in the tree if needed.
4870  */
4871 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4872 		      *root, struct btrfs_key *cpu_key, void *data, u32
4873 		      data_size)
4874 {
4875 	int ret = 0;
4876 	struct btrfs_path *path;
4877 	struct extent_buffer *leaf;
4878 	unsigned long ptr;
4879 
4880 	path = btrfs_alloc_path();
4881 	if (!path)
4882 		return -ENOMEM;
4883 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4884 	if (!ret) {
4885 		leaf = path->nodes[0];
4886 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4887 		write_extent_buffer(leaf, data, ptr, data_size);
4888 		btrfs_mark_buffer_dirty(leaf);
4889 	}
4890 	btrfs_free_path(path);
4891 	return ret;
4892 }
4893 
4894 /*
4895  * delete the pointer from a given node.
4896  *
4897  * the tree should have been previously balanced so the deletion does not
4898  * empty a node.
4899  */
4900 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4901 		    int level, int slot)
4902 {
4903 	struct extent_buffer *parent = path->nodes[level];
4904 	u32 nritems;
4905 	int ret;
4906 
4907 	nritems = btrfs_header_nritems(parent);
4908 	if (slot != nritems - 1) {
4909 		if (level)
4910 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4911 					     slot + 1, nritems - slot - 1);
4912 		memmove_extent_buffer(parent,
4913 			      btrfs_node_key_ptr_offset(slot),
4914 			      btrfs_node_key_ptr_offset(slot + 1),
4915 			      sizeof(struct btrfs_key_ptr) *
4916 			      (nritems - slot - 1));
4917 	} else if (level) {
4918 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4919 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4920 		BUG_ON(ret < 0);
4921 	}
4922 
4923 	nritems--;
4924 	btrfs_set_header_nritems(parent, nritems);
4925 	if (nritems == 0 && parent == root->node) {
4926 		BUG_ON(btrfs_header_level(root->node) != 1);
4927 		/* just turn the root into a leaf and break */
4928 		btrfs_set_header_level(root->node, 0);
4929 	} else if (slot == 0) {
4930 		struct btrfs_disk_key disk_key;
4931 
4932 		btrfs_node_key(parent, &disk_key, 0);
4933 		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4934 	}
4935 	btrfs_mark_buffer_dirty(parent);
4936 }
4937 
4938 /*
4939  * a helper function to delete the leaf pointed to by path->slots[1] and
4940  * path->nodes[1].
4941  *
4942  * This deletes the pointer in path->nodes[1] and frees the leaf
4943  * block extent.
4944  *
4945  * The path must have already been setup for deleting the leaf, including
4946  * all the proper balancing.  path->nodes[1] must be locked.
4947  */
4948 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4949 				    struct btrfs_root *root,
4950 				    struct btrfs_path *path,
4951 				    struct extent_buffer *leaf)
4952 {
4953 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4954 	del_ptr(root, path, 1, path->slots[1]);
4955 
4956 	/*
4957 	 * btrfs_free_extent is expensive, we want to make sure we
4958 	 * aren't holding any locks when we call it
4959 	 */
4960 	btrfs_unlock_up_safe(path, 0);
4961 
4962 	root_sub_used(root, leaf->len);
4963 
4964 	extent_buffer_get(leaf);
4965 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4966 	free_extent_buffer_stale(leaf);
4967 }

4968 /*
4969  * delete the item at the leaf level in path.  If that empties
4970  * the leaf, remove it from the tree
4971  */
4972 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4973 		    struct btrfs_path *path, int slot, int nr)
4974 {
4975 	struct extent_buffer *leaf;
4976 	struct btrfs_item *item;
4977 	u32 last_off;
4978 	u32 dsize = 0;
4979 	int ret = 0;
4980 	int wret;
4981 	int i;
4982 	u32 nritems;
4983 	struct btrfs_map_token token;
4984 
4985 	btrfs_init_map_token(&token);
4986 
4987 	leaf = path->nodes[0];
4988 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4989 
4990 	for (i = 0; i < nr; i++)
4991 		dsize += btrfs_item_size_nr(leaf, slot + i);
4992 
4993 	nritems = btrfs_header_nritems(leaf);
4994 
4995 	if (slot + nr != nritems) {
4996 		int data_end = leaf_data_end(root, leaf);
4997 
4998 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4999 			      data_end + dsize,
5000 			      btrfs_leaf_data(leaf) + data_end,
5001 			      last_off - data_end);
5002 
5003 		for (i = slot + nr; i < nritems; i++) {
5004 			u32 ioff;
5005 
5006 			item = btrfs_item_nr(i);
5007 			ioff = btrfs_token_item_offset(leaf, item, &token);
5008 			btrfs_set_token_item_offset(leaf, item,
5009 						    ioff + dsize, &token);
5010 		}
5011 
5012 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5013 			      btrfs_item_nr_offset(slot + nr),
5014 			      sizeof(struct btrfs_item) *
5015 			      (nritems - slot - nr));
5016 	}
5017 	btrfs_set_header_nritems(leaf, nritems - nr);
5018 	nritems -= nr;
5019 
5020 	/* delete the leaf if we've emptied it */
5021 	if (nritems == 0) {
5022 		if (leaf == root->node) {
5023 			btrfs_set_header_level(leaf, 0);
5024 		} else {
5025 			btrfs_set_path_blocking(path);
5026 			clean_tree_block(trans, root->fs_info, leaf);
5027 			btrfs_del_leaf(trans, root, path, leaf);
5028 		}
5029 	} else {
5030 		int used = leaf_space_used(leaf, 0, nritems);
5031 		if (slot == 0) {
5032 			struct btrfs_disk_key disk_key;
5033 
5034 			btrfs_item_key(leaf, &disk_key, 0);
5035 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
5036 		}
5037 
5038 		/* delete the leaf if it is mostly empty */
5039 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5040 			/* push_leaf_left fixes the path.
5041 			 * make sure the path still points to our leaf
5042 			 * for possible call to del_ptr below
5043 			 */
5044 			slot = path->slots[1];
5045 			extent_buffer_get(leaf);
5046 
5047 			btrfs_set_path_blocking(path);
5048 			wret = push_leaf_left(trans, root, path, 1, 1,
5049 					      1, (u32)-1);
5050 			if (wret < 0 && wret != -ENOSPC)
5051 				ret = wret;
5052 
5053 			if (path->nodes[0] == leaf &&
5054 			    btrfs_header_nritems(leaf)) {
5055 				wret = push_leaf_right(trans, root, path, 1,
5056 						       1, 1, 0);
5057 				if (wret < 0 && wret != -ENOSPC)
5058 					ret = wret;
5059 			}
5060 
5061 			if (btrfs_header_nritems(leaf) == 0) {
5062 				path->slots[1] = slot;
5063 				btrfs_del_leaf(trans, root, path, leaf);
5064 				free_extent_buffer(leaf);
5065 				ret = 0;
5066 			} else {
5067 				/* if we're still in the path, make sure
5068 				 * we're dirty.  Otherwise, one of the
5069 				 * push_leaf functions must have already
5070 				 * dirtied this buffer
5071 				 */
5072 				if (path->nodes[0] == leaf)
5073 					btrfs_mark_buffer_dirty(leaf);
5074 				free_extent_buffer(leaf);
5075 			}
5076 		} else {
5077 			btrfs_mark_buffer_dirty(leaf);
5078 		}
5079 	}
5080 	return ret;
5081 }
5082 
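/*
 * Hypothetical sketch: delete the single item a prior search left the
 * path pointing at.  The btrfs_del_item() wrapper in ctree.h does
 * exactly this.
 */
static int del_one_item_sketch(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path)
{
	return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
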
5083 /*
5084  * search the tree again to find a leaf with lesser keys
5085  * returns 0 if it found something or 1 if there are no lesser leaves.
5086  * returns < 0 on io errors.
5087  *
5088  * This may release the path, and so you may lose any locks held at the
5089  * time you call it.
5090  */
5091 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5092 {
5093 	struct btrfs_key key;
5094 	struct btrfs_disk_key found_key;
5095 	int ret;
5096 
5097 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5098 
5099 	if (key.offset > 0) {
5100 		key.offset--;
5101 	} else if (key.type > 0) {
5102 		key.type--;
5103 		key.offset = (u64)-1;
5104 	} else if (key.objectid > 0) {
5105 		key.objectid--;
5106 		key.type = (u8)-1;
5107 		key.offset = (u64)-1;
5108 	} else {
5109 		return 1;
5110 	}
5111 
5112 	btrfs_release_path(path);
5113 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5114 	if (ret < 0)
5115 		return ret;
5116 	btrfs_item_key(path->nodes[0], &found_key, 0);
5117 	ret = comp_keys(&found_key, &key);
5118 	/*
5119 	 * We might have had an item with the previous key in the tree right
5120 	 * before we released our path. And after we released our path, that
5121 	 * item might have been pushed to the first slot (0) of the leaf we
5122 	 * were holding due to a tree balance. Alternatively, an item with the
5123 	 * previous key can exist as the only element of a leaf (big fat item).
5124 	 * Therefore account for these 2 cases, so that our callers (like
5125 	 * btrfs_previous_item) don't miss an existing item with a key matching
5126 	 * the previous key we computed above.
5127 	 */
5128 	if (ret <= 0)
5129 		return 0;
5130 	return 1;
5131 }
5132 
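/*
 * Hypothetical sketch: walk leaves backwards from the current path
 * position, counting them.  Each successful call leaves the path in
 * a lesser leaf, so the loop makes progress until the tree start.
 */
static int count_lesser_leaves_sketch(struct btrfs_root *root,
				      struct btrfs_path *path)
{
	int count = 0;
	int ret;

	while ((ret = btrfs_prev_leaf(root, path)) == 0)
		count++;
	return ret < 0 ? ret : count;
}
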
5133 /*
5134  * A helper function to walk down the tree starting at min_key, and looking
5135  * for nodes or leaves that have a minimum transaction id.
5136  * This is used by the btree defrag code, and tree logging
5137  *
5138  * This does not cow, but it does stuff the starting key it finds back
5139  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5140  * key and get a writable path.
5141  *
5142  * This does lock as it descends; path->keep_locks is forced to 1 for
5143  * the duration of the search and restored before returning.
5144  *
5145  * This honors path->lowest_level to prevent descent past a given level
5146  * of the tree.
5147  *
5148  * min_trans indicates the oldest transaction that you are interested
5149  * in walking through.  Any nodes or leaves older than min_trans are
5150  * skipped over (without reading them).
5151  *
5152  * returns zero if something useful was found, < 0 on error and 1 if there
5153  * was nothing in the tree that matched the search criteria.
5154  */
5155 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5156 			 struct btrfs_path *path,
5157 			 u64 min_trans)
5158 {
5159 	struct extent_buffer *cur;
5160 	struct btrfs_key found_key;
5161 	int slot;
5162 	int sret;
5163 	u32 nritems;
5164 	int level;
5165 	int ret = 1;
5166 	int keep_locks = path->keep_locks;
5167 
5168 	path->keep_locks = 1;
5169 again:
5170 	cur = btrfs_read_lock_root_node(root);
5171 	level = btrfs_header_level(cur);
5172 	WARN_ON(path->nodes[level]);
5173 	path->nodes[level] = cur;
5174 	path->locks[level] = BTRFS_READ_LOCK;
5175 
5176 	if (btrfs_header_generation(cur) < min_trans) {
5177 		ret = 1;
5178 		goto out;
5179 	}
5180 	while (1) {
5181 		nritems = btrfs_header_nritems(cur);
5182 		level = btrfs_header_level(cur);
5183 		sret = bin_search(cur, min_key, level, &slot);
5184 
5185 		/* at the lowest level, we're done, setup the path and exit */
5186 		if (level == path->lowest_level) {
5187 			if (slot >= nritems)
5188 				goto find_next_key;
5189 			ret = 0;
5190 			path->slots[level] = slot;
5191 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5192 			goto out;
5193 		}
5194 		if (sret && slot > 0)
5195 			slot--;
5196 		/*
5197 		 * check this node pointer against the min_trans parameter.
5198 		 * If it is too old, skip to the next one.
5199 		 */
5200 		while (slot < nritems) {
5201 			u64 gen;
5202 
5203 			gen = btrfs_node_ptr_generation(cur, slot);
5204 			if (gen < min_trans) {
5205 				slot++;
5206 				continue;
5207 			}
5208 			break;
5209 		}
5210 find_next_key:
5211 		/*
5212 		 * we didn't find a candidate key in this node, walk forward
5213 		 * and find another one
5214 		 */
5215 		if (slot >= nritems) {
5216 			path->slots[level] = slot;
5217 			btrfs_set_path_blocking(path);
5218 			sret = btrfs_find_next_key(root, path, min_key, level,
5219 						  min_trans);
5220 			if (sret == 0) {
5221 				btrfs_release_path(path);
5222 				goto again;
5223 			} else {
5224 				goto out;
5225 			}
5226 		}
5227 		/* save our key for returning back */
5228 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5229 		path->slots[level] = slot;
5230 		if (level == path->lowest_level) {
5231 			ret = 0;
5232 			goto out;
5233 		}
5234 		btrfs_set_path_blocking(path);
5235 		cur = read_node_slot(root, cur, slot);
5236 		if (IS_ERR(cur)) {
5237 			ret = PTR_ERR(cur);
5238 			goto out;
5239 		}
5240 
5241 		btrfs_tree_read_lock(cur);
5242 
5243 		path->locks[level - 1] = BTRFS_READ_LOCK;
5244 		path->nodes[level - 1] = cur;
5245 		unlock_up(path, level, 1, 0, NULL);
5246 		btrfs_clear_path_blocking(path, NULL, 0);
5247 	}
5248 out:
5249 	path->keep_locks = keep_locks;
5250 	if (ret == 0) {
5251 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5252 		btrfs_set_path_blocking(path);
5253 		memcpy(min_key, &found_key, sizeof(found_key));
5254 	}
5255 	return ret;
5256 }
5257 
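/*
 * Hypothetical sketch: visit every leaf item whose block is newer than
 * min_trans, the way the defrag and tree-log code drive this helper.
 * The key-advance step (bumping .offset) is one possible way to make
 * progress between calls.
 */
static int scan_newer_sketch(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };	/* start from the smallest key */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while (1) {
		ret = btrfs_search_forward(root, &key, path, min_trans);
		if (ret)	/* 1: nothing left, < 0: error */
			break;
		/* process the item at path->nodes[0], path->slots[0] */
		btrfs_release_path(path);
		if (key.offset == (u64)-1)
			break;
		key.offset++;	/* advance past the key just returned */
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
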
5258 static int tree_move_down(struct btrfs_root *root,
5259 			   struct btrfs_path *path,
5260 			   int *level, int root_level)
5261 {
5262 	struct extent_buffer *eb;
5263 
5264 	BUG_ON(*level == 0);
5265 	eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
5266 	if (IS_ERR(eb))
5267 		return PTR_ERR(eb);
5268 
5269 	path->nodes[*level - 1] = eb;
5270 	path->slots[*level - 1] = 0;
5271 	(*level)--;
5272 	return 0;
5273 }
5274 
5275 static int tree_move_next_or_upnext(struct btrfs_root *root,
5276 				    struct btrfs_path *path,
5277 				    int *level, int root_level)
5278 {
5279 	int ret = 0;
5280 	int nritems;
5281 	nritems = btrfs_header_nritems(path->nodes[*level]);
5282 
5283 	path->slots[*level]++;
5284 
5285 	while (path->slots[*level] >= nritems) {
5286 		if (*level == root_level)
5287 			return -1;
5288 
5289 		/* move upnext */
5290 		path->slots[*level] = 0;
5291 		free_extent_buffer(path->nodes[*level]);
5292 		path->nodes[*level] = NULL;
5293 		(*level)++;
5294 		path->slots[*level]++;
5295 
5296 		nritems = btrfs_header_nritems(path->nodes[*level]);
5297 		ret = 1;
5298 	}
5299 	return ret;
5300 }
5301 
5302 /*
5303  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5304  * or down.
5305  */
5306 static int tree_advance(struct btrfs_root *root,
5307 			struct btrfs_path *path,
5308 			int *level, int root_level,
5309 			int allow_down,
5310 			struct btrfs_key *key)
5311 {
5312 	int ret;
5313 
5314 	if (*level == 0 || !allow_down) {
5315 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5316 	} else {
5317 		ret = tree_move_down(root, path, level, root_level);
5318 	}
5319 	if (ret >= 0) {
5320 		if (*level == 0)
5321 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5322 					path->slots[*level]);
5323 		else
5324 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5325 					path->slots[*level]);
5326 	}
5327 	return ret;
5328 }
5329 
5330 static int tree_compare_item(struct btrfs_root *left_root,
5331 			     struct btrfs_path *left_path,
5332 			     struct btrfs_path *right_path,
5333 			     char *tmp_buf)
5334 {
5335 	int cmp;
5336 	int len1, len2;
5337 	unsigned long off1, off2;
5338 
5339 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5340 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5341 	if (len1 != len2)
5342 		return 1;
5343 
5344 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5345 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5346 				right_path->slots[0]);
5347 
5348 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5349 
5350 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5351 	if (cmp)
5352 		return 1;
5353 	return 0;
5354 }
5355 
5356 #define ADVANCE 1
5357 #define ADVANCE_ONLY_NEXT -1
5358 
5359 /*
5360  * This function compares two trees and calls the provided callback for
5361  * every changed/new/deleted item it finds.
5362  * If shared tree blocks are encountered, whole subtrees are skipped, making
5363  * the compare pretty fast on snapshotted subvolumes.
5364  *
5365  * This currently works on commit roots only. As commit roots are read only,
5366  * we don't do any locking. The commit roots are protected with transactions.
5367  * Transactions are ended and rejoined when a commit is tried in between.
5368  *
5369  * This function checks for modifications done to the trees while comparing.
5370  * If it detects a change, it aborts immediately.
5371  */
5372 int btrfs_compare_trees(struct btrfs_root *left_root,
5373 			struct btrfs_root *right_root,
5374 			btrfs_changed_cb_t changed_cb, void *ctx)
5375 {
5376 	int ret;
5377 	int cmp;
5378 	struct btrfs_path *left_path = NULL;
5379 	struct btrfs_path *right_path = NULL;
5380 	struct btrfs_key left_key;
5381 	struct btrfs_key right_key;
5382 	char *tmp_buf = NULL;
5383 	int left_root_level;
5384 	int right_root_level;
5385 	int left_level;
5386 	int right_level;
5387 	int left_end_reached;
5388 	int right_end_reached;
5389 	int advance_left;
5390 	int advance_right;
5391 	u64 left_blockptr;
5392 	u64 right_blockptr;
5393 	u64 left_gen;
5394 	u64 right_gen;
5395 
5396 	left_path = btrfs_alloc_path();
5397 	if (!left_path) {
5398 		ret = -ENOMEM;
5399 		goto out;
5400 	}
5401 	right_path = btrfs_alloc_path();
5402 	if (!right_path) {
5403 		ret = -ENOMEM;
5404 		goto out;
5405 	}
5406 
5407 	tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
5408 	if (!tmp_buf) {
5409 		tmp_buf = vmalloc(left_root->nodesize);
5410 		if (!tmp_buf) {
5411 			ret = -ENOMEM;
5412 			goto out;
5413 		}
5414 	}
5415 
5416 	left_path->search_commit_root = 1;
5417 	left_path->skip_locking = 1;
5418 	right_path->search_commit_root = 1;
5419 	right_path->skip_locking = 1;
5420 
5421 	/*
5422 	 * Strategy: Go to the first items of both trees. Then do
5423 	 *
5424 	 * If both trees are at level 0
5425 	 *   Compare keys of current items
5426 	 *     If left < right treat left item as new, advance left tree
5427 	 *       and repeat
5428 	 *     If left > right treat right item as deleted, advance right tree
5429 	 *       and repeat
5430 	 *     If left == right do deep compare of items, treat as changed if
5431 	 *       needed, advance both trees and repeat
5432 	 * If both trees are at the same level but not at level 0
5433 	 *   Compare keys of current nodes/leaves
5434 	 *     If left < right advance left tree and repeat
5435 	 *     If left > right advance right tree and repeat
5436 	 *     If left == right compare blockptrs of the next nodes/leaves
5437 	 *       If they match advance both trees but stay at the same level
5438 	 *         and repeat
5439 	 *       If they don't match advance both trees while allowing to go
5440 	 *         deeper and repeat
5441 	 * If tree levels are different
5442 	 *   Advance the tree that needs it and repeat
5443 	 *
5444 	 * Advancing a tree means:
5445 	 *   If we are at level 0, try to go to the next slot. If that's not
5446 	 *   possible, go one level up and repeat. Stop when we find a level
5447 	 *   where we can go to the next slot. We may at this point be on a
5448 	 *   node or a leaf.
5449 	 *
5450 	 *   If we are not at level 0 and not on shared tree blocks, go one
5451 	 *   level deeper.
5452 	 *
5453 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5454 	 *   the right if possible or go up and right.
5455 	 */
5456 
5457 	down_read(&left_root->fs_info->commit_root_sem);
5458 	left_level = btrfs_header_level(left_root->commit_root);
5459 	left_root_level = left_level;
5460 	left_path->nodes[left_level] = left_root->commit_root;
5461 	extent_buffer_get(left_path->nodes[left_level]);
5462 
5463 	right_level = btrfs_header_level(right_root->commit_root);
5464 	right_root_level = right_level;
5465 	right_path->nodes[right_level] = right_root->commit_root;
5466 	extent_buffer_get(right_path->nodes[right_level]);
5467 	up_read(&left_root->fs_info->commit_root_sem);
5468 
5469 	if (left_level == 0)
5470 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5471 				&left_key, left_path->slots[left_level]);
5472 	else
5473 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5474 				&left_key, left_path->slots[left_level]);
5475 	if (right_level == 0)
5476 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5477 				&right_key, right_path->slots[right_level]);
5478 	else
5479 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5480 				&right_key, right_path->slots[right_level]);
5481 
5482 	left_end_reached = right_end_reached = 0;
5483 	advance_left = advance_right = 0;
5484 
5485 	while (1) {
5486 		if (advance_left && !left_end_reached) {
5487 			ret = tree_advance(left_root, left_path, &left_level,
5488 					left_root_level,
5489 					advance_left != ADVANCE_ONLY_NEXT,
5490 					&left_key);
5491 			if (ret == -1)
5492 				left_end_reached = ADVANCE;
5493 			else if (ret < 0)
5494 				goto out;
5495 			advance_left = 0;
5496 		}
5497 		if (advance_right && !right_end_reached) {
5498 			ret = tree_advance(right_root, right_path, &right_level,
5499 					right_root_level,
5500 					advance_right != ADVANCE_ONLY_NEXT,
5501 					&right_key);
5502 			if (ret == -1)
5503 				right_end_reached = ADVANCE;
5504 			else if (ret < 0)
5505 				goto out;
5506 			advance_right = 0;
5507 		}
5508 
5509 		if (left_end_reached && right_end_reached) {
5510 			ret = 0;
5511 			goto out;
5512 		} else if (left_end_reached) {
5513 			if (right_level == 0) {
5514 				ret = changed_cb(left_root, right_root,
5515 						left_path, right_path,
5516 						&right_key,
5517 						BTRFS_COMPARE_TREE_DELETED,
5518 						ctx);
5519 				if (ret < 0)
5520 					goto out;
5521 			}
5522 			advance_right = ADVANCE;
5523 			continue;
5524 		} else if (right_end_reached) {
5525 			if (left_level == 0) {
5526 				ret = changed_cb(left_root, right_root,
5527 						left_path, right_path,
5528 						&left_key,
5529 						BTRFS_COMPARE_TREE_NEW,
5530 						ctx);
5531 				if (ret < 0)
5532 					goto out;
5533 			}
5534 			advance_left = ADVANCE;
5535 			continue;
5536 		}
5537 
5538 		if (left_level == 0 && right_level == 0) {
5539 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5540 			if (cmp < 0) {
5541 				ret = changed_cb(left_root, right_root,
5542 						left_path, right_path,
5543 						&left_key,
5544 						BTRFS_COMPARE_TREE_NEW,
5545 						ctx);
5546 				if (ret < 0)
5547 					goto out;
5548 				advance_left = ADVANCE;
5549 			} else if (cmp > 0) {
5550 				ret = changed_cb(left_root, right_root,
5551 						left_path, right_path,
5552 						&right_key,
5553 						BTRFS_COMPARE_TREE_DELETED,
5554 						ctx);
5555 				if (ret < 0)
5556 					goto out;
5557 				advance_right = ADVANCE;
5558 			} else {
5559 				enum btrfs_compare_tree_result result;
5560 
5561 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5562 				ret = tree_compare_item(left_root, left_path,
5563 						right_path, tmp_buf);
5564 				if (ret)
5565 					result = BTRFS_COMPARE_TREE_CHANGED;
5566 				else
5567 					result = BTRFS_COMPARE_TREE_SAME;
5568 				ret = changed_cb(left_root, right_root,
5569 						 left_path, right_path,
5570 						 &left_key, result, ctx);
5571 				if (ret < 0)
5572 					goto out;
5573 				advance_left = ADVANCE;
5574 				advance_right = ADVANCE;
5575 			}
5576 		} else if (left_level == right_level) {
5577 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5578 			if (cmp < 0) {
5579 				advance_left = ADVANCE;
5580 			} else if (cmp > 0) {
5581 				advance_right = ADVANCE;
5582 			} else {
5583 				left_blockptr = btrfs_node_blockptr(
5584 						left_path->nodes[left_level],
5585 						left_path->slots[left_level]);
5586 				right_blockptr = btrfs_node_blockptr(
5587 						right_path->nodes[right_level],
5588 						right_path->slots[right_level]);
5589 				left_gen = btrfs_node_ptr_generation(
5590 						left_path->nodes[left_level],
5591 						left_path->slots[left_level]);
5592 				right_gen = btrfs_node_ptr_generation(
5593 						right_path->nodes[right_level],
5594 						right_path->slots[right_level]);
5595 				if (left_blockptr == right_blockptr &&
5596 				    left_gen == right_gen) {
5597 					/*
5598 					 * As we're on a shared block, don't
5599 					 * allow descending any deeper.
5600 					 */
5601 					advance_left = ADVANCE_ONLY_NEXT;
5602 					advance_right = ADVANCE_ONLY_NEXT;
5603 				} else {
5604 					advance_left = ADVANCE;
5605 					advance_right = ADVANCE;
5606 				}
5607 			}
5608 		} else if (left_level < right_level) {
5609 			advance_right = ADVANCE;
5610 		} else {
5611 			advance_left = ADVANCE;
5612 		}
5613 	}
5614 
5615 out:
5616 	btrfs_free_path(left_path);
5617 	btrfs_free_path(right_path);
5618 	kvfree(tmp_buf);
5619 	return ret;
5620 }
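
/*
 * Illustrative sketch (not btrfs code): at level 0, the comparison loop
 * in btrfs_compare_trees() above reduces to a classic merge of two
 * key-sorted streams.  A minimal user-space analogue; struct demo_key,
 * demo_cmp(), demo_equal() and demo_emit() are hypothetical helpers, and
 * the DEMO_* results mirror the BTRFS_COMPARE_TREE_* values.
 */
static void demo_compare_sorted(struct demo_key *l, int nl,
				struct demo_key *r, int nr)
{
	int i = 0, j = 0;

	while (i < nl || j < nr) {
		if (j >= nr || (i < nl && demo_cmp(&l[i], &r[j]) < 0)) {
			demo_emit(&l[i++], DEMO_NEW);	  /* only in left */
		} else if (i >= nl || demo_cmp(&l[i], &r[j]) > 0) {
			demo_emit(&r[j++], DEMO_DELETED); /* only in right */
		} else {
			demo_emit(&l[i], demo_equal(&l[i], &r[j]) ?
				  DEMO_SAME : DEMO_CHANGED); /* in both */
			i++;
			j++;
		}
	}
}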
5621 
5622 /*
5623  * this is similar to btrfs_next_leaf, but does not try to preserve
5624  * and fix up the path.  It looks for and returns the next key in the
5625  * tree based on the current path and the min_trans parameter.
5626  *
5627  * 0 is returned if another key is found, < 0 if there are any errors,
5628  * and 1 is returned if there are no higher keys in the tree.
5629  *
5630  * path->keep_locks should be set to 1 on the search made before
5631  * calling this function.
5632  */
5633 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5634 			struct btrfs_key *key, int level, u64 min_trans)
5635 {
5636 	int slot;
5637 	struct extent_buffer *c;
5638 
5639 	WARN_ON(!path->keep_locks);
5640 	while (level < BTRFS_MAX_LEVEL) {
5641 		if (!path->nodes[level])
5642 			return 1;
5643 
5644 		slot = path->slots[level] + 1;
5645 		c = path->nodes[level];
5646 next:
5647 		if (slot >= btrfs_header_nritems(c)) {
5648 			int ret;
5649 			int orig_lowest;
5650 			struct btrfs_key cur_key;
5651 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5652 			    !path->nodes[level + 1])
5653 				return 1;
5654 
5655 			if (path->locks[level + 1]) {
5656 				level++;
5657 				continue;
5658 			}
5659 
5660 			slot = btrfs_header_nritems(c) - 1;
5661 			if (level == 0)
5662 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5663 			else
5664 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5665 
5666 			orig_lowest = path->lowest_level;
5667 			btrfs_release_path(path);
5668 			path->lowest_level = level;
5669 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5670 						0, 0);
5671 			path->lowest_level = orig_lowest;
5672 			if (ret < 0)
5673 				return ret;
5674 
5675 			c = path->nodes[level];
5676 			slot = path->slots[level];
5677 			if (ret == 0)
5678 				slot++;
5679 			goto next;
5680 		}
5681 
5682 		if (level == 0)
5683 			btrfs_item_key_to_cpu(c, key, slot);
5684 		else {
5685 			u64 gen = btrfs_node_ptr_generation(c, slot);
5686 
5687 			if (gen < min_trans) {
5688 				slot++;
5689 				goto next;
5690 			}
5691 			btrfs_node_key_to_cpu(c, key, slot);
5692 		}
5693 		return 0;
5694 	}
5695 	return 1;
5696 }
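
/*
 * Illustrative usage sketch (not btrfs code, demo_* names hypothetical):
 * position a path with keep_locks set, then ask btrfs_find_next_key()
 * for the key following @first_key at level 0.  Error handling trimmed.
 */
static int demo_find_next(struct btrfs_root *root,
			  struct btrfs_key *first_key,
			  struct btrfs_key *next_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->keep_locks = 1;	/* required before btrfs_find_next_key() */
	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	if (ret >= 0)
		/* 0: *next_key is valid, 1: no higher key, < 0: error */
		ret = btrfs_find_next_key(root, path, next_key, 0, 0);

	btrfs_free_path(path);
	return ret;
}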
5697 
5698 /*
5699  * search the tree again to find a leaf with greater keys.
5700  * returns 0 if it finds something, or 1 if there are no greater leaves.
5701  * returns < 0 on I/O errors.
5702  */
5703 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5704 {
5705 	return btrfs_next_old_leaf(root, path, 0);
5706 }
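
/*
 * Illustrative usage sketch (not btrfs code, demo_* names hypothetical):
 * the canonical pattern for iterating every item of a tree, advancing
 * leaf by leaf with btrfs_next_leaf().
 */
static int demo_walk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };	/* start at the smallest key */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		/* ... process the item in slot path->slots[0] ... */
		path->slots[0]++;
	}
	if (ret > 0)
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}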
5707 
5708 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5709 			u64 time_seq)
5710 {
5711 	int slot;
5712 	int level;
5713 	struct extent_buffer *c;
5714 	struct extent_buffer *next;
5715 	struct btrfs_key key;
5716 	u32 nritems;
5717 	int ret;
5718 	int old_spinning = path->leave_spinning;
5719 	int next_rw_lock = 0;
5720 
5721 	nritems = btrfs_header_nritems(path->nodes[0]);
5722 	if (nritems == 0)
5723 		return 1;
5724 
5725 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5726 again:
5727 	level = 1;
5728 	next = NULL;
5729 	next_rw_lock = 0;
5730 	btrfs_release_path(path);
5731 
5732 	path->keep_locks = 1;
5733 	path->leave_spinning = 1;
5734 
5735 	if (time_seq)
5736 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5737 	else
5738 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5739 	path->keep_locks = 0;
5740 
5741 	if (ret < 0)
5742 		return ret;
5743 
5744 	nritems = btrfs_header_nritems(path->nodes[0]);
5745 	/*
5746 	 * by releasing the path above we dropped all our locks.  A balance
5747 	 * could have added more items next to the key that used to be
5748 	 * at the very end of the block.  So, check again here and
5749 	 * advance the path if there are now more items available.
5750 	 */
5751 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5752 		if (ret == 0)
5753 			path->slots[0]++;
5754 		ret = 0;
5755 		goto done;
5756 	}
5757 	/*
5758 	 * So the above check misses one case:
5759 	 * - after releasing the path above, someone has removed the item that
5760 	 *   used to be at the very end of the block, and balance between
5761 	 *   leaves brought in another one with a bigger key.offset to replace it.
5762 	 *
5763 	 * This one should be returned as well, or we can get leaf corruption
5764 	 * later (especially in __btrfs_drop_extents()).
5765 	 *
5766 	 * A bit more explanation about this check:
5767 	 * with ret > 0, the key wasn't found, so the path points to the slot
5768 	 * where it would be inserted, and thus the item at path->slots[0] must
5769 	 * be the bigger one.
5770 	 */
5771 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5772 		ret = 0;
5773 		goto done;
5774 	}
5775 
5776 	while (level < BTRFS_MAX_LEVEL) {
5777 		if (!path->nodes[level]) {
5778 			ret = 1;
5779 			goto done;
5780 		}
5781 
5782 		slot = path->slots[level] + 1;
5783 		c = path->nodes[level];
5784 		if (slot >= btrfs_header_nritems(c)) {
5785 			level++;
5786 			if (level == BTRFS_MAX_LEVEL) {
5787 				ret = 1;
5788 				goto done;
5789 			}
5790 			continue;
5791 		}
5792 
5793 		if (next) {
5794 			btrfs_tree_unlock_rw(next, next_rw_lock);
5795 			free_extent_buffer(next);
5796 		}
5797 
5798 		next = c;
5799 		next_rw_lock = path->locks[level];
5800 		ret = read_block_for_search(NULL, root, path, &next, level,
5801 					    slot, &key, 0);
5802 		if (ret == -EAGAIN)
5803 			goto again;
5804 
5805 		if (ret < 0) {
5806 			btrfs_release_path(path);
5807 			goto done;
5808 		}
5809 
5810 		if (!path->skip_locking) {
5811 			ret = btrfs_try_tree_read_lock(next);
5812 			if (!ret && time_seq) {
5813 				/*
5814 				 * If we don't get the lock, we may be racing
5815 				 * with push_leaf_left, which holds that lock
5816 				 * while itself waiting for the leaf we have
5817 				 * currently locked.  To resolve this, we give
5818 				 * up our lock and cycle.
5819 				 */
5820 				free_extent_buffer(next);
5821 				btrfs_release_path(path);
5822 				cond_resched();
5823 				goto again;
5824 			}
5825 			if (!ret) {
5826 				btrfs_set_path_blocking(path);
5827 				btrfs_tree_read_lock(next);
5828 				btrfs_clear_path_blocking(path, next,
5829 							  BTRFS_READ_LOCK);
5830 			}
5831 			next_rw_lock = BTRFS_READ_LOCK;
5832 		}
5833 		break;
5834 	}
5835 	path->slots[level] = slot;
5836 	while (1) {
5837 		level--;
5838 		c = path->nodes[level];
5839 		if (path->locks[level])
5840 			btrfs_tree_unlock_rw(c, path->locks[level]);
5841 
5842 		free_extent_buffer(c);
5843 		path->nodes[level] = next;
5844 		path->slots[level] = 0;
5845 		if (!path->skip_locking)
5846 			path->locks[level] = next_rw_lock;
5847 		if (!level)
5848 			break;
5849 
5850 		ret = read_block_for_search(NULL, root, path, &next, level,
5851 					    0, &key, 0);
5852 		if (ret == -EAGAIN)
5853 			goto again;
5854 
5855 		if (ret < 0) {
5856 			btrfs_release_path(path);
5857 			goto done;
5858 		}
5859 
5860 		if (!path->skip_locking) {
5861 			ret = btrfs_try_tree_read_lock(next);
5862 			if (!ret) {
5863 				btrfs_set_path_blocking(path);
5864 				btrfs_tree_read_lock(next);
5865 				btrfs_clear_path_blocking(path, next,
5866 							  BTRFS_READ_LOCK);
5867 			}
5868 			next_rw_lock = BTRFS_READ_LOCK;
5869 		}
5870 	}
5871 	ret = 0;
5872 done:
5873 	unlock_up(path, 0, 1, 0, NULL);
5874 	path->leave_spinning = old_spinning;
5875 	if (!old_spinning)
5876 		btrfs_set_path_blocking(path);
5877 
5878 	return ret;
5879 }
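
/*
 * Illustrative sketch (not btrfs code, demo_* names hypothetical):
 * btrfs_next_old_leaf() pairs with btrfs_search_old_slot() to walk a
 * past version of a tree as recorded in the tree mod log.  Obtaining
 * @time_seq (see btrfs_get_tree_mod_seq()) is omitted here.
 */
static int demo_old_next_leaf(struct btrfs_root *root,
			      struct btrfs_key *key, u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_old_slot(root, key, path, time_seq);
	if (ret >= 0)
		/* same contract as btrfs_next_leaf(), replayed at @time_seq */
		ret = btrfs_next_old_leaf(root, path, time_seq);

	btrfs_free_path(path);
	return ret;
}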
5880 
5881 /*
5882  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5883  * searching until it gets past min_objectid or finds an item of 'type'.
5884  *
5885  * returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5886  */
5887 int btrfs_previous_item(struct btrfs_root *root,
5888 			struct btrfs_path *path, u64 min_objectid,
5889 			int type)
5890 {
5891 	struct btrfs_key found_key;
5892 	struct extent_buffer *leaf;
5893 	u32 nritems;
5894 	int ret;
5895 
5896 	while (1) {
5897 		if (path->slots[0] == 0) {
5898 			btrfs_set_path_blocking(path);
5899 			ret = btrfs_prev_leaf(root, path);
5900 			if (ret != 0)
5901 				return ret;
5902 		} else {
5903 			path->slots[0]--;
5904 		}
5905 		leaf = path->nodes[0];
5906 		nritems = btrfs_header_nritems(leaf);
5907 		if (nritems == 0)
5908 			return 1;
5909 		if (path->slots[0] == nritems)
5910 			path->slots[0]--;
5911 
5912 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5913 		if (found_key.objectid < min_objectid)
5914 			break;
5915 		if (found_key.type == type)
5916 			return 0;
5917 		if (found_key.objectid == min_objectid &&
5918 		    found_key.type < type)
5919 			break;
5920 	}
5921 	return 1;
5922 }
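
/*
 * Illustrative usage sketch (not btrfs code, demo_* names hypothetical;
 * BTRFS_INODE_REF_KEY is just an example type): position the path past
 * everything for @objectid, then scan backwards for the nearest item of
 * the wanted type.
 */
static int demo_find_prev(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = (u8)-1;	/* largest possible type ... */
	key.offset = (u64)-1;	/* ... and offset for @objectid */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret >= 0)
		/* 0: found, path points at it; 1: nothing before it */
		ret = btrfs_previous_item(root, path, objectid,
					  BTRFS_INODE_REF_KEY);

	btrfs_free_path(path);
	return ret;
}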
5923 
5924 /*
5925  * search the extent tree to find a previous Metadata/Data extent item
5926  * with a minimum objectid.
5927  *
5928  * returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5929  */
5930 int btrfs_previous_extent_item(struct btrfs_root *root,
5931 			struct btrfs_path *path, u64 min_objectid)
5932 {
5933 	struct btrfs_key found_key;
5934 	struct extent_buffer *leaf;
5935 	u32 nritems;
5936 	int ret;
5937 
5938 	while (1) {
5939 		if (path->slots[0] == 0) {
5940 			btrfs_set_path_blocking(path);
5941 			ret = btrfs_prev_leaf(root, path);
5942 			if (ret != 0)
5943 				return ret;
5944 		} else {
5945 			path->slots[0]--;
5946 		}
5947 		leaf = path->nodes[0];
5948 		nritems = btrfs_header_nritems(leaf);
5949 		if (nritems == 0)
5950 			return 1;
5951 		if (path->slots[0] == nritems)
5952 			path->slots[0]--;
5953 
5954 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5955 		if (found_key.objectid < min_objectid)
5956 			break;
5957 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5958 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5959 			return 0;
5960 		if (found_key.objectid == min_objectid &&
5961 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5962 			break;
5963 	}
5964 	return 1;
5965 }
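
/*
 * Illustrative usage sketch (not btrfs code, demo_* names hypothetical),
 * modeled on how scrub positions itself: land just past @logical in the
 * extent tree, then step back to the extent item that may cover it.
 */
static int demo_prev_extent(struct btrfs_root *extent_root, u64 logical)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = logical;
	key.type = BTRFS_METADATA_ITEM_KEY;	/* sorts after EXTENT_ITEM */
	key.offset = (u64)-1;
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret >= 0)
		/* 0: an EXTENT_ITEM/METADATA_ITEM at or before @logical */
		ret = btrfs_previous_extent_item(extent_root, path, 0);

	btrfs_free_path(path);
	return ret;
}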
5966