xref: /linux/fs/btrfs/ctree.c (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d)
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27 
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29 		      *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31 		      *root, struct btrfs_key *ins_key,
32 		      struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34 			  struct btrfs_root *root, struct extent_buffer *dst,
35 			  struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37 			      struct btrfs_root *root,
38 			      struct extent_buffer *dst_buf,
39 			      struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 		    struct btrfs_path *path, int level, int slot,
42 		    int tree_mod_log);
43 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 				 struct extent_buffer *eb);
45 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46 					  u32 blocksize, u64 parent_transid,
47 					  u64 time_seq);
48 struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49 						u64 bytenr, u32 blocksize,
50 						u64 time_seq);
51 
52 struct btrfs_path *btrfs_alloc_path(void)
53 {
54 	struct btrfs_path *path;
55 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
56 	return path;
57 }
58 
59 /*
60  * set all locked nodes in the path to blocking locks.  This should
61  * be done before scheduling
62  */
63 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
64 {
65 	int i;
66 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
67 		if (!p->nodes[i] || !p->locks[i])
68 			continue;
69 		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
70 		if (p->locks[i] == BTRFS_READ_LOCK)
71 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
72 		else if (p->locks[i] == BTRFS_WRITE_LOCK)
73 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
74 	}
75 }
76 
77 /*
 * reset all the locked nodes in the path to spinning locks.
79  *
80  * held is used to keep lockdep happy, when lockdep is enabled
81  * we set held to a blocking lock before we go around and
82  * retake all the spinlocks in the path.  You can safely use NULL
83  * for held
84  */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		/* track the promotion so we can undo it at the end */
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	/* retake the spinlocks top-down (highest level first) */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* return 'held' to the spinning state it arrived in */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
122 
123 /* this also releases the path */
124 void btrfs_free_path(struct btrfs_path *p)
125 {
126 	if (!p)
127 		return;
128 	btrfs_release_path(p);
129 	kmem_cache_free(btrfs_path_cachep, p);
130 }
131 
132 /*
133  * path release drops references on the extent buffers in the path
134  * and it drops any locks held by this path
135  *
 * It is safe to call this on paths on which no locks or extent buffers are held.
137  */
138 noinline void btrfs_release_path(struct btrfs_path *p)
139 {
140 	int i;
141 
142 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
143 		p->slots[i] = 0;
144 		if (!p->nodes[i])
145 			continue;
146 		if (p->locks[i]) {
147 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
148 			p->locks[i] = 0;
149 		}
150 		free_extent_buffer(p->nodes[i]);
151 		p->nodes[i] = NULL;
152 	}
153 }
154 
155 /*
156  * safely gets a reference on the root node of a tree.  A lock
157  * is not taken, so a concurrent writer may put a different node
158  * at the root of the tree.  See btrfs_lock_root_node for the
159  * looping required.
160  *
161  * The extent buffer returned by this has a reference taken, so
162  * it won't disappear.  It may stop being the root of the tree
163  * at any time because there are no locks held.
164  */
165 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
166 {
167 	struct extent_buffer *eb;
168 
169 	while (1) {
170 		rcu_read_lock();
171 		eb = rcu_dereference(root->node);
172 
173 		/*
174 		 * RCU really hurts here, we could free up the root node because
175 		 * it was cow'ed but we may not get the new root node yet so do
176 		 * the inc_not_zero dance and if it doesn't work then
177 		 * synchronize_rcu and try again.
178 		 */
179 		if (atomic_inc_not_zero(&eb->refs)) {
180 			rcu_read_unlock();
181 			break;
182 		}
183 		rcu_read_unlock();
184 		synchronize_rcu();
185 	}
186 	return eb;
187 }
188 
189 /* loop around taking references on and locking the root node of the
190  * tree until you end up with a lock on the root.  A locked buffer
191  * is returned, with a reference held.
192  */
193 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
194 {
195 	struct extent_buffer *eb;
196 
197 	while (1) {
198 		eb = btrfs_root_node(root);
199 		btrfs_tree_lock(eb);
200 		if (eb == root->node)
201 			break;
202 		btrfs_tree_unlock(eb);
203 		free_extent_buffer(eb);
204 	}
205 	return eb;
206 }
207 
/* loop around taking references on and read locking the root node of the
 * tree until you end up with a read lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
212 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
213 {
214 	struct extent_buffer *eb;
215 
216 	while (1) {
217 		eb = btrfs_root_node(root);
218 		btrfs_tree_read_lock(eb);
219 		if (eb == root->node)
220 			break;
221 		btrfs_tree_read_unlock(eb);
222 		free_extent_buffer(eb);
223 	}
224 	return eb;
225 }
226 
227 /* cowonly root (everything not a reference counted cow subvolume), just get
228  * put onto a simple dirty list.  transaction.c walks this to make sure they
229  * get properly updated on disk.
230  */
231 static void add_root_to_dirty_list(struct btrfs_root *root)
232 {
233 	spin_lock(&root->fs_info->trans_lock);
234 	if (root->track_dirty && list_empty(&root->dirty_list)) {
235 		list_add(&root->dirty_list,
236 			 &root->fs_info->dirty_cowonly_roots);
237 	}
238 	spin_unlock(&root->fs_info->trans_lock);
239 }
240 
241 /*
242  * used by snapshot creation to make a copy of a root for a tree with
243  * a given objectid.  The buffer with the new root node is returned in
244  * cow_ret, and this func returns zero on success or a negative error code.
245  */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* ref counted roots may only be copied within the transaction
	 * that last modified them */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* the block's first key is passed to the allocator as a hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the old root and fix up the copy's header */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* take refs on everything the copy points to; reloc trees use
	 * full backrefs (see the rules in update_ref_for_cow) */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
300 
/*
 * types of modifications recorded in the tree mod log; they are replayed
 * backwards by __tree_mod_log_rewind to reconstruct old tree versions
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	/* removal logged while a whole eb is freed (tree_mod_log_free_eb) */
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	/* removal of a slot overwritten by a move (tree_mod_log_insert_move) */
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
310 
/* payload of a MOD_LOG_MOVE_KEYS operation; the source slot is kept in
 * tree_mod_elem->slot */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};
315 
/* identifies the replaced (old) root in a MOD_LOG_ROOT_REPLACE operation */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
320 
/*
 * one entry of the tree modification log, kept in fs_info->tree_mod_log
 * and ordered by (index, seq); see __tree_mod_log_insert
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	struct seq_list elem;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
343 
/*
 * hand out the next tree mod sequence number to elem and queue it on the
 * fs-wide list.  Callers must hold fs_info->tree_mod_seq_lock.
 */
static inline void
__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
{
	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
350 
/*
 * register elem as a blocker: while it sits on tree_mod_seq_list, log
 * entries it may still need are not garbage collected (see
 * btrfs_put_tree_mod_seq).  flags bit 0 marks elem as an active blocker.
 */
void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	elem->flags = 1;
	spin_lock(&fs_info->tree_mod_seq_lock);
	__get_tree_mod_seq(fs_info, elem);
	spin_unlock(&fs_info->tree_mod_seq_lock);
}
359 
/*
 * drop a blocker previously registered with btrfs_get_tree_mod_seq and
 * garbage collect all tree mod log entries that no remaining blocker needs
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* seq 0 means elem was never registered */
	if (!seq_putting)
		return;

	BUG_ON(!(elem->flags & 1));
	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);

	/* find the lowest sequence number still blocking log removal */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				goto out;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* fetch the successor before erasing this node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->elem.seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		list_del(&tm->elem.list);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
out:
	spin_unlock(&fs_info->tree_mod_seq_lock);
}
410 
411 /*
412  * key order of the log:
413  *       index -> sequence
414  *
415  * the index is the shifted logical of the *new* root node for root replace
416  * operations, or the shifted logical of the affected block for all other
417  * operations.
418  */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;
	int ret = 0;

	BUG_ON(!tm || !tm->elem.seq);

	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		/* NOTE: smaller keys descend left here, the inverse of the
		 * usual rb tree convention -- __tree_mod_log_search walks
		 * with the same comparisons, so lookups stay consistent */
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->elem.seq < tm->elem.seq)
			new = &((*new)->rb_left);
		else if (cur->elem.seq > tm->elem.seq)
			new = &((*new)->rb_right);
		else {
			/* (index, seq) must be unique; drop the duplicate */
			kfree(tm);
			ret = -EEXIST;
			goto unlock;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	return ret;
}
457 
458 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 				    struct extent_buffer *eb) {
460 	smp_mb();
461 	if (list_empty(&(fs_info)->tree_mod_seq_list))
462 		return 1;
463 	if (!eb)
464 		return 0;
465 	if (btrfs_header_level(eb) == 0)
466 		return 1;
467 	return 0;
468 }
469 
470 /*
471  * This allocates memory and gets a tree modification sequence number when
472  * needed.
473  *
474  * Returns 0 when no sequence number is needed, < 0 on error.
475  * Returns 1 when a sequence number was added. In this case,
476  * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
477  * after inserting into the rb tree.
478  */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
				 struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;
	int seq;

	/* unlocked fast path: no blockers registered, no logging needed */
	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return -ENOMEM;

	tm->elem.flags = 0;
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/*
		 * someone emptied the list while we were waiting for the lock.
		 * we must not add to the list, because no blocker exists. items
		 * are removed from the list only when the existing blocker is
		 * removed from the list.
		 */
		kfree(tm);
		/* note: *tm_ret is now stale; callers must not touch it when
		 * 0 is returned */
		seq = 0;
		spin_unlock(&fs_info->tree_mod_seq_lock);
	} else {
		/* tree_mod_seq_lock stays held on the > 0 return path; the
		 * caller drops it after inserting tm into the rb tree */
		__get_tree_mod_seq(fs_info, &tm->elem);
		seq = tm->elem.seq;
	}

	return seq;
}
511 
/*
 * log a single key add/remove/replace in eb at the given slot.
 * Returns 0 when logging is disabled, < 0 on error.
 */
static noinline int
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* ret > 0: tm is allocated and tree_mod_seq_lock is held */
	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret <= 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		/* for removals/replacements, record the old contents */
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	ret = __tree_mod_log_insert(fs_info, tm);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	return ret;
}
537 
/* convenience wrapper around tree_mod_log_insert_key_mask using GFP_NOFS */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
			int slot, enum mod_log_op op)
{
	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}
544 
/*
 * log a move of nr_items node pointers within eb from src_slot to dst_slot.
 * Slots the move overwrites are logged individually first.  Returns 0 when
 * logging is disabled, < 0 on error.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/* log the old contents of every slot the move will overwrite */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		BUG_ON(ret < 0);
	}

	/* ret > 0: tm is allocated and tree_mod_seq_lock is held */
	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret <= 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	ret = __tree_mod_log_insert(fs_info, tm);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	return ret;
}
577 
/*
 * log the replacement of a tree root.  The element is keyed by the *new*
 * root's logical address, which lets __tree_mod_log_oldest_root walk the
 * replacement chain backwards.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* ret > 0: tm is allocated and tree_mod_seq_lock is held */
	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret <= 0)
		return ret;

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	ret = __tree_mod_log_insert(fs_info, tm);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	return ret;
}
600 
/*
 * find a log element for the block at 'start' with seq >= min_seq.  With
 * smallest set, the match with the lowest seq is returned, otherwise the
 * one with the highest seq.  Returns NULL when there is no match.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* comparisons mirror the (inverted) ordering of __tree_mod_log_insert */
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->elem.seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->elem.seq > cur->elem.seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->elem.seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->elem.seq < cur->elem.seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq, can't do better */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
643 
644 /*
645  * this returns the element from the log with the smallest time sequence
646  * value that's in the log (the oldest log item). any element with a time
647  * sequence lower than min_seq will be ignored.
648  */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest == 1: pick the matching element with the lowest seq */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
655 
656 /*
657  * this returns the element from the log with the largest time sequence
658  * value that's in the log (the most recent log item). any element with
659  * a time sequence lower than min_seq will be ignored.
660  */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest == 0: pick the matching element with the highest seq */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
666 
/*
 * log the copy of nr_items node pointers from src to dst as per-slot
 * REMOVE (src) + ADD (dst) pairs.  No-op when logging is disabled or when
 * both buffers are leaves.
 */
static inline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	/* leaf items are not tracked, only node pointers */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return;

	/* speed this up by single seq for all operations? */
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
					      MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
					      MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}
}
691 
/* log a pointer move inside a single buffer; BUGs on failure */
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}
701 
/*
 * log the replacement of the key at 'slot' in eb.  With atomic set the
 * allocation must not sleep (GFP_ATOMIC).  BUGs on failure.
 */
static inline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb,
			  struct btrfs_disk_key *disk_key, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
					   MOD_LOG_KEY_REPLACE,
					   atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}
714 
/*
 * eb is about to be freed; log the removal of all of its node pointers,
 * highest slot first, so its contents can be reconstructed on rewind
 */
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb)
{
	int i;
	int ret;
	u32 nritems;

	/* also skips leaves: only node pointer changes are tracked */
	if (tree_mod_dont_log(fs_info, eb))
		return;

	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert_key(fs_info, eb, i,
					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		BUG_ON(ret < 0);
	}
}
732 
/*
 * root->node is being replaced by new_root_node: log the freeing of the
 * old root's pointers, then record the root replacement itself
 */
static inline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node)
{
	int ret;
	tree_mod_log_free_eb(root->fs_info, root->node);
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS);
	BUG_ON(ret < 0);
}
743 
744 /*
745  * check if the tree block can be shared by multiple trees
746  */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* old-style backrefs give no way to prove exclusivity */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
769 
/*
 * fix up extent backrefs when buf is cow'ed into cow.  Sets *last_ref when
 * buf's last reference goes away with this cow.  Returns 0 or a negative
 * error code.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	/* possibly shared blocks: read the real refcount and flags from
	 * the extent tree */
	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a block being cow'ed must be referenced; this
			 * means the extent tree is corrupted */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		/* unshared: exactly one ref, held by this root */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* buf stays referenced by other trees after the cow */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert buf's extent to full backrefs (rule 2/3
			 * of the comment above) */
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			/* persist the full-backref conversion on disk */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			if (ret)
				return ret;
		}
	} else {
		/* this cow drops buf's last reference */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		/*
		 * don't log freeing in case we're freeing the root node, this
		 * is done by tree_mod_log_set_root_pointer later
		 */
		if (buf != root->node && btrfs_header_level(buf) != 0)
			tree_mod_log_free_eb(root->fs_info, buf);
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
876 
877 /*
878  * does the dirty work in cow of a single block.  The parent block (if
879  * supplied) is updated to point to the new cow copy.  The new buffer is marked
880  * dirty and returned locked.  If you modify the block it needs to be marked
881  * dirty again.
882  *
883  * search_start -- an allocation hint for the new block
884  *
885  * empty_size -- a hint that you plan on doing more cow.  This is the size in
886  * bytes the allocator should try to find free next to the block it returns.
887  * This is just a hint and may be ignored by the allocator.
888  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* the caller handed us buf itself via *cow_ret; unlock it once the
	 * cow copy has taken its place */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* ref counted roots may only be cow'ed inside the transaction that
	 * last modified them */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* the block's first key serves as an allocation hint */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* fix up backrefs; last_ref tells us whether buf is now unused */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		/* cow of the root: swing root->node over to the copy */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* interior cow: point the parent's slot at the copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
999 
1000 /*
1001  * returns the logical address of the oldest predecessor of the given root.
1002  * entries older than time_seq are ignored.
1003  */
1004 static struct tree_mod_elem *
1005 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1006 			   struct btrfs_root *root, u64 time_seq)
1007 {
1008 	struct tree_mod_elem *tm;
1009 	struct tree_mod_elem *found = NULL;
1010 	u64 root_logical = root->node->start;
1011 	int looped = 0;
1012 
1013 	if (!time_seq)
1014 		return 0;
1015 
1016 	/*
1017 	 * the very last operation that's logged for a root is the replacement
1018 	 * operation (if it is replaced at all). this has the index of the *new*
1019 	 * root, making it the very first operation that's logged for this root.
1020 	 */
1021 	while (1) {
1022 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
1023 						time_seq);
1024 		if (!looped && !tm)
1025 			return 0;
1026 		/*
1027 		 * if there are no tree operation for the oldest root, we simply
1028 		 * return it. this should only happen if that (old) root is at
1029 		 * level 0.
1030 		 */
1031 		if (!tm)
1032 			break;
1033 
1034 		/*
1035 		 * if there's an operation that's not a root replacement, we
1036 		 * found the oldest version of our root. normally, we'll find a
1037 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1038 		 */
1039 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1040 			break;
1041 
1042 		found = tm;
1043 		root_logical = tm->old_root.logical;
1044 		BUG_ON(root_logical == root->node->start);
1045 		looped = 1;
1046 	}
1047 
1048 	/* if there's no old root to return, return what we found instead */
1049 	if (!found)
1050 		found = tm;
1051 
1052 	return found;
1053 }
1054 
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewinded (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
		      struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* n tracks the item count while add/remove operations are undone */
	n = btrfs_header_nritems(eb);
	while (tm && tm->elem.seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* fall through: undo the removal by restoring the key */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo the move by copying the pointers back */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		/* stop once we leave the log entries for this tree block */
		if (tm->index != first_tm->index)
			break;
	}
	btrfs_set_header_nritems(eb, n);
}
1127 
/*
 * Return a version of @eb rewound to @time_seq.  If no rewind is needed
 * (time_seq == 0, @eb is a leaf, or there are no logged operations for it),
 * @eb itself is returned.  Otherwise the rewind is performed on a private
 * copy (or on a fresh dummy buffer when the block was freed) and the caller's
 * reference on @eb is dropped.
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		    u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	/* leaves are not rewound here; only node pointers are logged */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * the block was freed; its contents must be rebuilt entirely
		 * from the log, so start from an empty dummy buffer and copy
		 * only the header fields over from @eb.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		BUG_ON(!eb_rewin);
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* rewind on a private copy so the live block stays intact */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		BUG_ON(!eb_rewin);
	}

	/* take a ref for the caller, drop the one we held on the original */
	extent_buffer_get(eb_rewin);
	free_extent_buffer(eb);

	__tree_mod_log_rewind(eb_rewin, time_seq, tm);

	return eb_rewin;
}
1167 
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
	/* no logged changes: hand back the live root, still read locked */
	if (!tm)
		return root->node;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* the root block itself was replaced; rewind the old one */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = root->node->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root)
		/* old block is gone from the live tree; rebuild from the log */
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	else
		eb = btrfs_clone_extent_buffer(root->node);
	/* we work on a private buffer from here on; release the live root */
	btrfs_tree_read_unlock(root->node);
	free_extent_buffer(root->node);
	if (!eb)
		return NULL;
	btrfs_tree_read_lock(eb);
	if (old_root) {
		/* the dummy buffer starts blank; fake a header for the old root */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, root->root_key.objectid);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(eb, time_seq, tm);
	else
		/* no ops logged for this block; only valid for a leaf */
		WARN_ON(btrfs_header_level(eb) != 0);
	extent_buffer_get(eb);

	return eb;
}
1222 
1223 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1224 				   struct btrfs_root *root,
1225 				   struct extent_buffer *buf)
1226 {
1227 	/* ensure we can see the force_cow */
1228 	smp_rmb();
1229 
1230 	/*
1231 	 * We do not need to cow a block if
1232 	 * 1) this block is not created or changed in this transaction;
1233 	 * 2) this block does not belong to TREE_RELOC tree;
1234 	 * 3) the root is not forced COW.
1235 	 *
1236 	 * What is forced COW:
1237 	 *    when we create snapshot during commiting the transaction,
1238 	 *    after we've finished coping src root, we must COW the shared
1239 	 *    block to ensure the metadata consistency.
1240 	 */
1241 	if (btrfs_header_generation(buf) == trans->transid &&
1242 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1243 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1244 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1245 	    !root->force_cow)
1246 		return 0;
1247 	return 1;
1248 }
1249 
1250 /*
1251  * cows a single block, see __btrfs_cow_block for the real work.
1252  * This version of it has extra checks so that a block isn't cow'd more than
1253  * once per transaction, as long as it hasn't been written yet
1254  */
1255 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1256 		    struct btrfs_root *root, struct extent_buffer *buf,
1257 		    struct extent_buffer *parent, int parent_slot,
1258 		    struct extent_buffer **cow_ret)
1259 {
1260 	u64 search_start;
1261 	int ret;
1262 
1263 	if (trans->transaction != root->fs_info->running_transaction) {
1264 		printk(KERN_CRIT "trans %llu running %llu\n",
1265 		       (unsigned long long)trans->transid,
1266 		       (unsigned long long)
1267 		       root->fs_info->running_transaction->transid);
1268 		WARN_ON(1);
1269 	}
1270 	if (trans->transid != root->fs_info->generation) {
1271 		printk(KERN_CRIT "trans %llu running %llu\n",
1272 		       (unsigned long long)trans->transid,
1273 		       (unsigned long long)root->fs_info->generation);
1274 		WARN_ON(1);
1275 	}
1276 
1277 	if (!should_cow_block(trans, root, buf)) {
1278 		*cow_ret = buf;
1279 		return 0;
1280 	}
1281 
1282 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1283 
1284 	if (parent)
1285 		btrfs_set_lock_blocking(parent);
1286 	btrfs_set_lock_blocking(buf);
1287 
1288 	ret = __btrfs_cow_block(trans, root, buf, parent,
1289 				 parent_slot, cow_ret, search_start, 0);
1290 
1291 	trace_btrfs_cow_block(root, buf, *cow_ret);
1292 
1293 	return ret;
1294 }
1295 
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by: within 32k of each other on disk,
 * measured from the end of the lower block to the start of the higher.
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	u64 low = (blocknr < other) ? blocknr : other;
	u64 high = (blocknr < other) ? other : blocknr;

	if (low != high && high - (low + blocksize) < 32768)
		return 1;
	return 0;
}
1308 
1309 /*
1310  * compare two keys in a memcmp fashion
1311  */
1312 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1313 {
1314 	struct btrfs_key k1;
1315 
1316 	btrfs_disk_key_to_cpu(&k1, disk);
1317 
1318 	return btrfs_comp_cpu_keys(&k1, k2);
1319 }
1320 
1321 /*
1322  * same as comp_keys only with two btrfs_key's
1323  */
1324 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1325 {
1326 	if (k1->objectid > k2->objectid)
1327 		return 1;
1328 	if (k1->objectid < k2->objectid)
1329 		return -1;
1330 	if (k1->type > k2->type)
1331 		return 1;
1332 	if (k1->type < k2->type)
1333 		return -1;
1334 	if (k1->offset > k2->offset)
1335 		return 1;
1336 	if (k1->offset < k2->offset)
1337 		return -1;
1338 	return 0;
1339 }
1340 
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @start_slot:	first slot in @parent to consider
 * @cache_only:	only touch child blocks already in the block cache
 * @last_ret:	in/out allocation hint, carried across calls
 * @progress:	skip pointers whose key sorts before this key
 *
 * returns 0 on success or a negative errno on failure.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);
	/* in cache_only mode, only nodes directly above leaves are handled */
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* a block already close to either neighbor is left in place */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		/*
		 * NOTE(review): i < end_slot - 2 also skips the forward
		 * check for slot end_slot - 2 even though i + 1 would still
		 * be in range there -- confirm this is intended.
		 */
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (cache_only) {
				/* not cached: skip it (free(NULL) is a no-op) */
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur)
					return -EIO;
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/* COW the child to relocate it near search_start */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1456 
1457 /*
1458  * The leaf data grows from end-to-front in the node.
1459  * this returns the address of the start of the last item,
1460  * which is the stop of the leaf data stack
1461  */
1462 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1463 					 struct extent_buffer *leaf)
1464 {
1465 	u32 nr = btrfs_header_nritems(leaf);
1466 	if (nr == 0)
1467 		return BTRFS_LEAF_DATA_SIZE(root);
1468 	return btrfs_item_offset_nr(leaf, nr - 1);
1469 }
1470 
1471 
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 if the exact key was found, 1 otherwise.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;		/* cached mapping into the eb */
	unsigned long map_start = 0;	/* eb offset where the mapping starts */
	unsigned long map_len = 0;	/* length of the cached mapping */
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when the key falls outside the cached window */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/*
				 * mapping failed (presumably the key is not
				 * contiguous in memory); copy it out instead
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
1538 
1539 /*
1540  * simple bin_search frontend that does the right thing for
1541  * leaves vs nodes
1542  */
1543 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1544 		      int level, int *slot)
1545 {
1546 	if (level == 0)
1547 		return generic_bin_search(eb,
1548 					  offsetof(struct btrfs_leaf, items),
1549 					  sizeof(struct btrfs_item),
1550 					  key, btrfs_header_nritems(eb),
1551 					  slot);
1552 	else
1553 		return generic_bin_search(eb,
1554 					  offsetof(struct btrfs_node, ptrs),
1555 					  sizeof(struct btrfs_key_ptr),
1556 					  key, btrfs_header_nritems(eb),
1557 					  slot);
1558 }
1559 
/* exported wrapper around bin_search() for callers outside this file */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1565 
1566 static void root_add_used(struct btrfs_root *root, u32 size)
1567 {
1568 	spin_lock(&root->accounting_lock);
1569 	btrfs_set_root_used(&root->root_item,
1570 			    btrfs_root_used(&root->root_item) + size);
1571 	spin_unlock(&root->accounting_lock);
1572 }
1573 
1574 static void root_sub_used(struct btrfs_root *root, u32 size)
1575 {
1576 	spin_lock(&root->accounting_lock);
1577 	btrfs_set_root_used(&root->root_item,
1578 			    btrfs_root_used(&root->root_item) - size);
1579 	spin_unlock(&root->accounting_lock);
1580 }
1581 
1582 /* given a node and slot number, this reads the blocks it points to.  The
1583  * extent buffer is returned with a reference taken (but unlocked).
1584  * NULL is returned on error.
1585  */
1586 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1587 				   struct extent_buffer *parent, int slot)
1588 {
1589 	int level = btrfs_header_level(parent);
1590 	if (slot < 0)
1591 		return NULL;
1592 	if (slot >= btrfs_header_nritems(parent))
1593 		return NULL;
1594 
1595 	BUG_ON(level == 0);
1596 
1597 	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
1598 		       btrfs_level_size(root, level - 1),
1599 		       btrfs_node_ptr_generation(parent, slot));
1600 }
1601 
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * returns 0 on success, or a negative errno on failure.  On success the
 * path may have been updated to point at a neighboring block.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can re-find our slot after pointers move around */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/* mid stays valid here: the old root pointer still holds a ref */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* nothing to do while mid is at least a quarter full */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* read in and COW both neighbors so we can shift pointers around */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty; drop it from the tree entirely */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(trans, root, path, level + 1, pslot + 1, 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* keys moved out of right; refresh its key in parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &right_key, pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is empty now; remove it from the tree */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(trans, root, path, level + 1, pslot, 1);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* our original slot ended up in left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
1820 
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * returns 0 if some room was made (the path then points at the block that
 * now holds our slot), 1 if nothing could be pushed.
 *
 * NOTE(review): negative errors from the push helpers are captured in 'ret'
 * but never returned; callers only ever see 0 or 1 -- confirm intent.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full itself; can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* keys moved into left; update mid's key in parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &disk_key, pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot moved into left; repoint the path */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full itself; can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* keys moved into right; update its key in parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &disk_key, pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot moved into right; repoint the path */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
1956 
/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 *
 * the direction of the scan comes from path->reada (negative = backwards,
 * positive = forwards); when scanning backwards with a non-zero @objectid,
 * the scan stops at the first key with a different objectid.
 */
static void reada_for_search(struct btrfs_root *root,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 gen;
	int direction = path->reada;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	/* only readahead for nodes directly above the leaves */
	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
	if (eb) {
		/* the target block is already cached; nothing to do */
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		/* only readahead blocks within 64k of the target on disk */
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			gen = btrfs_node_ptr_generation(node, nr);
			readahead_tree_block(root, search, blocksize, gen);
			nread += blocksize;
		}
		nscan++;
		/* cap the work: at most 64k read ahead, 32 slots scanned */
		if ((nread > 65536 || nscan > 32))
			break;
	}
}
2026 
/*
 * returns -EAGAIN if it had to drop the path, or zero if everything was in
 * cache
 *
 * reads ahead (and then synchronously reads) the two siblings of the block
 * at path->slots[level + 1], since balancing may need either of them.
 */
static noinline int reada_for_balance(struct btrfs_root *root,
				      struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;
	int ret = 0;
	int blocksize;

	parent = path->nodes[level + 1];
	if (!parent)
		return 0;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];
	blocksize = btrfs_level_size(root, level);

	if (slot > 0) {
		/* left sibling */
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = btrfs_find_tree_block(root, block1, blocksize);
		/*
		 * if we get -eagain from btrfs_buffer_uptodate, we
		 * don't want to return eagain here.  That will loop
		 * forever
		 */
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot + 1 < nritems) {
		/* right sibling */
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = btrfs_find_tree_block(root, block2, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block2 = 0;
		free_extent_buffer(eb);
	}
	if (block1 || block2) {
		/* at least one sibling is missing from cache; read it in */
		ret = -EAGAIN;

		/* release the whole path */
		btrfs_release_path(path);

		/* read the blocks */
		if (block1)
			readahead_tree_block(root, block1, blocksize, 0);
		if (block2)
			readahead_tree_block(root, block2, blocksize, 0);

		if (block1) {
			eb = read_tree_block(root, block1, blocksize, 0);
			free_extent_buffer(eb);
		}
		if (block2) {
			eb = read_tree_block(root, block2, blocksize, 0);
			free_extent_buffer(eb);
		}
	}
	return ret;
}
2096 
2097 
2098 /*
2099  * when we walk down the tree, it is usually safe to unlock the higher layers
2100  * in the tree.  The exceptions are when our path goes through slot 0, because
2101  * operations on the tree might require changing key pointers higher up in the
2102  * tree.
2103  *
2104  * callers might also have set path->keep_locks, which tells this code to keep
2105  * the lock if the path points to the last slot in the block.  This is part of
2106  * walking through the tree, and selecting the next slot in the higher block.
2107  *
2108  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2109  * if lowest_unlock is 1, level 0 won't be unlocked
2110  */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels <= skip_level must stay locked (key fixups may reach them) */
	int skip_level = level;
	/* once set, stop growing the skip range on higher levels */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: changing the key here means updating the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: hold the lock when we sit on the last slot */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/*
			 * if the caller tracks write locks, lower the
			 * write-lock watermark to just below this level
			 */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2153 
2154 /*
2155  * This releases any locks held in the path starting at level and
2156  * going all the way up to the root.
2157  *
2158  * btrfs_search_slot will keep the lock held on higher nodes in a few
2159  * corner cases, such as COW of the block at slot zero in the node.  This
2160  * ignores those rules, and it should only be called when there are no
2161  * more updates to be done higher up in the tree.
2162  */
2163 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2164 {
2165 	int i;
2166 
2167 	if (path->keep_locks)
2168 		return;
2169 
2170 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2171 		if (!path->nodes[i])
2172 			continue;
2173 		if (!path->locks[i])
2174 			continue;
2175 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2176 		path->locks[i] = 0;
2177 	}
2178 }
2179 
2180 /*
2181  * helper function for btrfs_search_slot.  The goal is to find a block
2182  * in cache without setting the path to blocking.  If we find the block
2183  * we return zero and the path is unchanged.
2184  *
2185  * If we can't find the block, we set the path blocking and do some
2186  * reada.  -EAGAIN is returned and the search must be repeated.
2187  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key, u64 time_seq)
{
	/* NOTE(review): trans and time_seq are not referenced in this body */
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	/* the child we want is described by @slot in the parent node *eb_ret */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			btrfs_set_path_blocking(p);

			/* now we're allowed to do a blocking uptodate check */
			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
				*eb_ret = tmp;
				return 0;
			}
			/* still the wrong generation: corrupt or stale block */
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	/* path is dropped; the caller must restart the search */
	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
2270 
2271 /*
2272  * helper function for btrfs_search_slot.  This does all of the checks
2273  * for node-level blocks and does any balancing required based on
2274  * the ins_len.
2275  *
2276  * If no extra work was required, zero is returned.  If we had to
2277  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2278  * start over
2279  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/* node is nearly full and we may insert: split it ahead of time */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting touches the parent, so it must be write locked */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		/* nonzero means the path was dropped for readahead IO */
		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* node is under half full and we are deleting: rebalance */
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have collapsed this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2347 
2348 /*
2349  * look for key in the tree.  path is filled in with nodes along the way
2350  * if key is found, we return zero and you can find the item in the leaf
2351  * level of the path (level 0)
2352  *
2353  * If the key isn't found, the path points to the slot where it should
2354  * be inserted, and 1 is returned.  If there are other errors during the
2355  * search a negative error number is returned.
2356  *
2357  * if ins_len > 0, nodes and leaves will be split as we walk down the
2358  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2359  * possible)
2360  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* read-only search: never trade up to write locks */
	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* main descent loop: one iteration per tree level */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level + 1 > write_lock_level) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);

		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			/* dec: key not found, step back to the covering slot */
			int dec = 0;
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* node may have been split/balanced; reload */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && cow &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try non-blocking first to avoid
					 * setting the whole path blocking */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_try_tree_read_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* level 0: we're at the leaf */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);
	return ret;
}
2616 
2617 /*
2618  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2619  * current state of the tree together with the operations recorded in the tree
2620  * modification log to search for the key in a previous version of this tree, as
2621  * denoted by the time_seq parameter.
2622  *
2623  * Naturally, there is no support for insert, delete or cow operations.
2624  *
2625  * The resulting path and return value will be set up as if we called
2626  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2627  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* the commit root is already a frozen past view; search it directly */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	/* read-only descent; everything is taken with read locks */
	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			/* dec: key not found, step back to the covering slot */
			int dec = 0;
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_try_tree_read_lock(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
			/*
			 * rewind the block to how it looked at time_seq;
			 * the rewind may hand back a private copy, in which
			 * case the lock on the live buffer is dropped
			 */
			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
			if (b != p->nodes[level]) {
				btrfs_tree_unlock_rw(p->nodes[level],
						     p->locks[level]);
				p->locks[level] = 0;
				p->nodes[level] = b;
			}
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2723 
2724 /*
2725  * adjust the pointers going up the tree, starting at level
2726  * making sure the right key of each node is points to 'key'.
2727  * This is used after shifting pointers to the left, so it stops
2728  * fixing up pointers when a given leaf/node is not in slot 0 of the
2729  * higher levels
2730  *
2731  */
2732 static void fixup_low_keys(struct btrfs_trans_handle *trans,
2733 			   struct btrfs_root *root, struct btrfs_path *path,
2734 			   struct btrfs_disk_key *key, int level)
2735 {
2736 	int i;
2737 	struct extent_buffer *t;
2738 
2739 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2740 		int tslot = path->slots[i];
2741 		if (!path->nodes[i])
2742 			break;
2743 		t = path->nodes[i];
2744 		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
2745 		btrfs_set_node_key(t, key, tslot);
2746 		btrfs_mark_buffer_dirty(path->nodes[i]);
2747 		if (tslot != 0)
2748 			break;
2749 	}
2750 }
2751 
2752 /*
2753  * update item key.
2754  *
2755  * This function isn't completely safe. It's the caller's responsibility
2756  * that the new key won't break the order
2757  */
2758 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2759 			     struct btrfs_root *root, struct btrfs_path *path,
2760 			     struct btrfs_key *new_key)
2761 {
2762 	struct btrfs_disk_key disk_key;
2763 	struct extent_buffer *eb;
2764 	int slot;
2765 
2766 	eb = path->nodes[0];
2767 	slot = path->slots[0];
2768 	if (slot > 0) {
2769 		btrfs_item_key(eb, &disk_key, slot - 1);
2770 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2771 	}
2772 	if (slot < btrfs_header_nritems(eb) - 1) {
2773 		btrfs_item_key(eb, &disk_key, slot + 1);
2774 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2775 	}
2776 
2777 	btrfs_cpu_key_to_disk(&disk_key, new_key);
2778 	btrfs_set_item_key(eb, &disk_key, slot);
2779 	btrfs_mark_buffer_dirty(eb);
2780 	if (slot == 0)
2781 		fixup_low_keys(trans, root, path, &disk_key, 1);
2782 }
2783 
2784 /*
2785  * try to push data from one node into the next node left in the
2786  * tree.
2787  *
2788  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2789  * error, and > 0 if there was no room in the left hand block.
2790  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* unless we're emptying src, don't bother with tiny nodes */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* record the copy in the mod log before touching dst */
	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
			     push_items);
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/* close the gap in src, logging the move first */
		tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
				     src_nritems - push_items);
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2849 
2850 /*
2851  * try to push data from one node into the next node right in the
2852  * tree.
2853  *
2854  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2855  * error, and > 0 if there was no room in the right hand block.
2856  *
2857  * this will  only push up to 1/2 the contents of the left node over
2858  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* src too small to be worth balancing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing pointers right to make room at the front */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* copy the tail of src into the freed space at the head of dst */
	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
			     src_nritems - push_items, push_items);
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2911 
2912 /*
2913  * helper function to insert a new root level in the tree.
2914  * A new node is allocated, and a single item is inserted to
2915  * point to the existing root
2916  *
2917  * returns zero on success or < 0 on failure.
2918  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the current root becomes the only child of the new root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new root node */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* single pointer: key 0 of the old root -> old root block */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* log the root swap, then publish the new root for readers */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* hang the new root on the path (with its own reference) */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2985 
2986 /*
2987  * worker function to insert a single pointer in a node.
2988  * the node should have enough room for the pointer already
2989  *
2990  * slot and level indicate where you want the key to go, and
2991  * blocknr is the block the key points to.
2992  */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* caller guarantees the node has room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != nritems) {
		/* shift later pointers right, logging the move first */
		if (level)
			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
					     slot, nritems - slot);
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
					      MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3029 
3030 /*
3031  * split the node at the specified level in path in two.
3032  * The path is corrected to point to the appropriate node after the split
3033  *
3034  * Before splitting this tries to make some room in the node by pushing
3035  * left and right, if either one works, it returns right away.
3036  *
3037  * returns 0 on success and < 0 on failure
3038  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try pushing into siblings first to avoid the split */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* the upper half of c moves into a freshly allocated node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new node */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* log and copy the upper half of the pointers into split */
	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* correct the path to point at whichever node holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3119 
3120 /*
3121  * how many bytes are required to store the items in a leaf.  start
3122  * and nr indicate which items in the leaf to check.  This totals up the
3123  * space used both by the item structs and the item data
3124  */
3125 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3126 {
3127 	int data_len;
3128 	int nritems = btrfs_header_nritems(l);
3129 	int end = min(nritems, start + nr) - 1;
3130 
3131 	if (!nr)
3132 		return 0;
3133 	data_len = btrfs_item_end_nr(l, start);
3134 	data_len = data_len - btrfs_item_offset_nr(l, end);
3135 	data_len += sizeof(struct btrfs_item) * nr;
3136 	WARN_ON(data_len < 0);
3137 	return data_len;
3138 }
3139 
3140 /*
3141  * The space between the end of the leaf items and
3142  * the start of the leaf data.  IOW, how much room
3143  * the leaf has left for both items and data
3144  */
3145 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3146 				   struct extent_buffer *leaf)
3147 {
3148 	int nritems = btrfs_header_nritems(leaf);
3149 	int ret;
3150 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3151 	if (ret < 0) {
3152 		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3153 		       "used %d nritems %d\n",
3154 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3155 		       leaf_space_used(leaf, 0, nritems), nritems);
3156 	}
3157 	return ret;
3158 }
3159 
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Caller passes in 'right' locked with a reference; on all return paths
 * it is either unlocked and released, or installed as path->nodes[0].
 * Returns 0 if items were pushed, 1 if nothing could be moved.
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* unless emptying the leaf, never push below slot 1 or min_slot */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if our slot would move, reserve data_size for the pending insert */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* walk backwards from the last item, counting how many fit */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			/* don't push past the slot we're inserting at */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				/*
				 * stop early if the left leaf would keep a
				 * comfortable amount of free room
				 */
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	if (!empty && push_items == left_nritems)
		WARN_ON(1);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total bytes of item data being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to open push_items slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	/* recompute every data offset in right from the end downwards */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update its pointer in the parent */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* our slot moved into 'right'; make it the path leaf */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3303 
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means we are at the root; nothing to push into */
	if (!path->nodes[1])
		return 1;

	/* no right sibling if we are the last pointer in the parent */
	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(root, upper, slot + 1);
	if (right == NULL)
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	/* cheap check before paying for the COW below */
	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the leaf; re-check the free space */
	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	/* __push_leaf_right takes over the lock and ref on 'right' */
	return __push_leaf_right(trans, root, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3369 
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * Caller passes in 'left' locked with a reference; on all return paths it
 * is either unlocked and released, or installed as path->nodes[0].
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* unless emptying the leaf, always leave at least one item behind */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* walk forward from item 0, counting how many fit in 'left' */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);

		if (!empty && push_items > 0) {
			/* don't push past the slot we're inserting at */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				/*
				 * stop early if the right leaf would keep a
				 * comfortable amount of free room
				 */
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room for the pending insert at our slot */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* bytes of item data being moved (data grows down from leaf end) */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the copied items' data offsets into left's coordinates */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems) {
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		/* slide the remaining item data back to the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		/* and slide the remaining item headers to the front */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	/* recompute every data offset in right from the end downwards */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root, right);

	/* right's first key changed; propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(trans, root, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into 'left'; make it the path leaf */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3525 
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* no left sibling if we are the first pointer in the parent */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* no parent means we are at the root; nothing to push into */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(root, path->nodes[1], slot - 1);
	if (left == NULL)
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* cheap check before paying for the COW below */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the leaf; re-check the free space */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left takes over the lock and ref on 'left' */
	return __push_leaf_left(trans, root, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3594 
/*
 * copy_for_split does the work for split_leaf once a destination leaf
 * has been allocated: move items [mid, nritems) from 'l' into the empty
 * leaf 'right', link 'right' into the parent just after 'l', and fix up
 * the path to point at whichever half now holds path->slots[0].
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* number of items that move to the new right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	/* bytes of item data belonging to items [mid, end) */
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* the data moved toward the leaf end by this many bytes */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	/* rebase each copied item's data offset into right's coordinates */
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	/* link the new leaf into the parent, right after 'l' */
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* fix the path to point at the half that contains our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
3661 
3662 /*
3663  * double splits happen when we need to insert a big item in the middle
3664  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3665  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3666  *          A                 B                 C
3667  *
3668  * We avoid this by trying to push the items on either side of our target
3669  * into the adjacent leaves.  If all goes well we can avoid the double split
3670  * completely.
3671  */
3672 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3673 					  struct btrfs_root *root,
3674 					  struct btrfs_path *path,
3675 					  int data_size)
3676 {
3677 	int ret;
3678 	int progress = 0;
3679 	int slot;
3680 	u32 nritems;
3681 
3682 	slot = path->slots[0];
3683 
3684 	/*
3685 	 * try to push all the items after our slot into the
3686 	 * right leaf
3687 	 */
3688 	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3689 	if (ret < 0)
3690 		return ret;
3691 
3692 	if (ret == 0)
3693 		progress++;
3694 
3695 	nritems = btrfs_header_nritems(path->nodes[0]);
3696 	/*
3697 	 * our goal is to get our slot at the start or end of a leaf.  If
3698 	 * we've done so we're done
3699 	 */
3700 	if (path->slots[0] == 0 || path->slots[0] == nritems)
3701 		return 0;
3702 
3703 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3704 		return 0;
3705 
3706 	/* try to push all the items before our slot into the next leaf */
3707 	slot = path->slots[0];
3708 	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3709 	if (ret < 0)
3710 		return ret;
3711 
3712 	if (ret == 0)
3713 		progress++;
3714 
3715 	if (progress)
3716 		return 0;
3717 	return 1;
3718 }
3719 
3720 /*
3721  * split the path's leaf in two, making sure there is at least data_size
3722  * available for the resulting leaf level of the path.
3723  *
3724  * returns 0 if all went well and < 0 on failure.
3725  */
3726 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3727 			       struct btrfs_root *root,
3728 			       struct btrfs_key *ins_key,
3729 			       struct btrfs_path *path, int data_size,
3730 			       int extend)
3731 {
3732 	struct btrfs_disk_key disk_key;
3733 	struct extent_buffer *l;
3734 	u32 nritems;
3735 	int mid;
3736 	int slot;
3737 	struct extent_buffer *right;
3738 	int ret = 0;
3739 	int wret;
3740 	int split;
3741 	int num_doubles = 0;
3742 	int tried_avoid_double = 0;
3743 
3744 	l = path->nodes[0];
3745 	slot = path->slots[0];
3746 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
3747 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3748 		return -EOVERFLOW;
3749 
3750 	/* first try to make some room by pushing left and right */
3751 	if (data_size) {
3752 		wret = push_leaf_right(trans, root, path, data_size,
3753 				       data_size, 0, 0);
3754 		if (wret < 0)
3755 			return wret;
3756 		if (wret) {
3757 			wret = push_leaf_left(trans, root, path, data_size,
3758 					      data_size, 0, (u32)-1);
3759 			if (wret < 0)
3760 				return wret;
3761 		}
3762 		l = path->nodes[0];
3763 
3764 		/* did the pushes work? */
3765 		if (btrfs_leaf_free_space(root, l) >= data_size)
3766 			return 0;
3767 	}
3768 
3769 	if (!path->nodes[1]) {
3770 		ret = insert_new_root(trans, root, path, 1);
3771 		if (ret)
3772 			return ret;
3773 	}
3774 again:
3775 	split = 1;
3776 	l = path->nodes[0];
3777 	slot = path->slots[0];
3778 	nritems = btrfs_header_nritems(l);
3779 	mid = (nritems + 1) / 2;
3780 
3781 	if (mid <= slot) {
3782 		if (nritems == 1 ||
3783 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3784 			BTRFS_LEAF_DATA_SIZE(root)) {
3785 			if (slot >= nritems) {
3786 				split = 0;
3787 			} else {
3788 				mid = slot;
3789 				if (mid != nritems &&
3790 				    leaf_space_used(l, mid, nritems - mid) +
3791 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3792 					if (data_size && !tried_avoid_double)
3793 						goto push_for_double;
3794 					split = 2;
3795 				}
3796 			}
3797 		}
3798 	} else {
3799 		if (leaf_space_used(l, 0, mid) + data_size >
3800 			BTRFS_LEAF_DATA_SIZE(root)) {
3801 			if (!extend && data_size && slot == 0) {
3802 				split = 0;
3803 			} else if ((extend || !data_size) && slot == 0) {
3804 				mid = 1;
3805 			} else {
3806 				mid = slot;
3807 				if (mid != nritems &&
3808 				    leaf_space_used(l, mid, nritems - mid) +
3809 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3810 					if (data_size && !tried_avoid_double)
3811 						goto push_for_double;
3812 					split = 2 ;
3813 				}
3814 			}
3815 		}
3816 	}
3817 
3818 	if (split == 0)
3819 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
3820 	else
3821 		btrfs_item_key(l, &disk_key, mid);
3822 
3823 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3824 					root->root_key.objectid,
3825 					&disk_key, 0, l->start, 0);
3826 	if (IS_ERR(right))
3827 		return PTR_ERR(right);
3828 
3829 	root_add_used(root, root->leafsize);
3830 
3831 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
3832 	btrfs_set_header_bytenr(right, right->start);
3833 	btrfs_set_header_generation(right, trans->transid);
3834 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
3835 	btrfs_set_header_owner(right, root->root_key.objectid);
3836 	btrfs_set_header_level(right, 0);
3837 	write_extent_buffer(right, root->fs_info->fsid,
3838 			    (unsigned long)btrfs_header_fsid(right),
3839 			    BTRFS_FSID_SIZE);
3840 
3841 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
3842 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
3843 			    BTRFS_UUID_SIZE);
3844 
3845 	if (split == 0) {
3846 		if (mid <= slot) {
3847 			btrfs_set_header_nritems(right, 0);
3848 			insert_ptr(trans, root, path, &disk_key, right->start,
3849 				   path->slots[1] + 1, 1);
3850 			btrfs_tree_unlock(path->nodes[0]);
3851 			free_extent_buffer(path->nodes[0]);
3852 			path->nodes[0] = right;
3853 			path->slots[0] = 0;
3854 			path->slots[1] += 1;
3855 		} else {
3856 			btrfs_set_header_nritems(right, 0);
3857 			insert_ptr(trans, root, path, &disk_key, right->start,
3858 					  path->slots[1], 1);
3859 			btrfs_tree_unlock(path->nodes[0]);
3860 			free_extent_buffer(path->nodes[0]);
3861 			path->nodes[0] = right;
3862 			path->slots[0] = 0;
3863 			if (path->slots[1] == 0)
3864 				fixup_low_keys(trans, root, path,
3865 					       &disk_key, 1);
3866 		}
3867 		btrfs_mark_buffer_dirty(right);
3868 		return ret;
3869 	}
3870 
3871 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3872 
3873 	if (split == 2) {
3874 		BUG_ON(num_doubles != 0);
3875 		num_doubles++;
3876 		goto again;
3877 	}
3878 
3879 	return 0;
3880 
3881 push_for_double:
3882 	push_for_double_split(trans, root, path, data_size);
3883 	tried_avoid_double = 1;
3884 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3885 		return 0;
3886 	goto again;
3887 }
3888 
3889 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3890 					 struct btrfs_root *root,
3891 					 struct btrfs_path *path, int ins_len)
3892 {
3893 	struct btrfs_key key;
3894 	struct extent_buffer *leaf;
3895 	struct btrfs_file_extent_item *fi;
3896 	u64 extent_len = 0;
3897 	u32 item_size;
3898 	int ret;
3899 
3900 	leaf = path->nodes[0];
3901 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3902 
3903 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3904 	       key.type != BTRFS_EXTENT_CSUM_KEY);
3905 
3906 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3907 		return 0;
3908 
3909 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3910 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3911 		fi = btrfs_item_ptr(leaf, path->slots[0],
3912 				    struct btrfs_file_extent_item);
3913 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3914 	}
3915 	btrfs_release_path(path);
3916 
3917 	path->keep_locks = 1;
3918 	path->search_for_split = 1;
3919 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3920 	path->search_for_split = 0;
3921 	if (ret < 0)
3922 		goto err;
3923 
3924 	ret = -EAGAIN;
3925 	leaf = path->nodes[0];
3926 	/* if our item isn't there or got smaller, return now */
3927 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3928 		goto err;
3929 
3930 	/* the leaf has  changed, it now has room.  return now */
3931 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3932 		goto err;
3933 
3934 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3935 		fi = btrfs_item_ptr(leaf, path->slots[0],
3936 				    struct btrfs_file_extent_item);
3937 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3938 			goto err;
3939 	}
3940 
3941 	btrfs_set_path_blocking(path);
3942 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
3943 	if (ret)
3944 		goto err;
3945 
3946 	path->keep_locks = 0;
3947 	btrfs_unlock_up_safe(path, 1);
3948 	return 0;
3949 err:
3950 	path->keep_locks = 0;
3951 	return ret;
3952 }
3953 
/*
 * split_item does the in-leaf work for btrfs_split_item: the item at
 * path->slots[0] is cut at split_offset; the first split_offset bytes stay
 * in the original item, the remainder goes into a new item inserted in the
 * next slot under 'new_key'.  The caller guarantees one extra struct
 * btrfs_item of free space in the leaf.  Returns 0 or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	/* we may block on kmalloc below; drop out of spinning-lock mode */
	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	/* stash a copy of the whole item's data before rearranging */
	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* open a slot for the new item right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/* the new item covers the tail of the original data region */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* shrink the original item to the first split_offset bytes */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4024 
4025 /*
4026  * This function splits a single item into two items,
4027  * giving 'new_key' to the new item and splitting the
4028  * old one at split_offset (from the start of the item).
4029  *
4030  * The path may be released by this operation.  After
4031  * the split, the path is pointing to the old item.  The
4032  * new item is going to be in the same node as the old one.
4033  *
4034  * Note, the item being split must be smaller enough to live alone on
4035  * a tree block with room for one extra struct btrfs_item
4036  *
4037  * This allows us to split the item in place, keeping a lock on the
4038  * leaf the entire time.
4039  */
4040 int btrfs_split_item(struct btrfs_trans_handle *trans,
4041 		     struct btrfs_root *root,
4042 		     struct btrfs_path *path,
4043 		     struct btrfs_key *new_key,
4044 		     unsigned long split_offset)
4045 {
4046 	int ret;
4047 	ret = setup_leaf_for_split(trans, root, path,
4048 				   sizeof(struct btrfs_item));
4049 	if (ret)
4050 		return ret;
4051 
4052 	ret = split_item(trans, root, path, new_key, split_offset);
4053 	return ret;
4054 }
4055 
4056 /*
4057  * This function duplicate a item, giving 'new_key' to the new item.
4058  * It guarantees both items live in the same tree leaf and the new item
4059  * is contiguous with the original item.
4060  *
4061  * This allows us to split file extent in place, keeping a lock on the
4062  * leaf the entire time.
4063  */
4064 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4065 			 struct btrfs_root *root,
4066 			 struct btrfs_path *path,
4067 			 struct btrfs_key *new_key)
4068 {
4069 	struct extent_buffer *leaf;
4070 	int ret;
4071 	u32 item_size;
4072 
4073 	leaf = path->nodes[0];
4074 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4075 	ret = setup_leaf_for_split(trans, root, path,
4076 				   item_size + sizeof(struct btrfs_item));
4077 	if (ret)
4078 		return ret;
4079 
4080 	path->slots[0]++;
4081 	setup_items_for_insert(trans, root, path, new_key, &item_size,
4082 			       item_size, item_size +
4083 			       sizeof(struct btrfs_item), 1);
4084 	leaf = path->nodes[0];
4085 	memcpy_extent_buffer(leaf,
4086 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4087 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4088 			     item_size);
4089 	return 0;
4090 }
4091 
4092 /*
4093  * make the item pointed to by the path smaller.  new_size indicates
4094  * how small to make it, and from_end tells us if we just chop bytes
4095  * off the end of the item or if we shift the item to chop bytes off
4096  * the front.
4097  */
4098 void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4099 			 struct btrfs_root *root,
4100 			 struct btrfs_path *path,
4101 			 u32 new_size, int from_end)
4102 {
4103 	int slot;
4104 	struct extent_buffer *leaf;
4105 	struct btrfs_item *item;
4106 	u32 nritems;
4107 	unsigned int data_end;
4108 	unsigned int old_data_start;
4109 	unsigned int old_size;
4110 	unsigned int size_diff;
4111 	int i;
4112 	struct btrfs_map_token token;
4113 
4114 	btrfs_init_map_token(&token);
4115 
4116 	leaf = path->nodes[0];
4117 	slot = path->slots[0];
4118 
4119 	old_size = btrfs_item_size_nr(leaf, slot);
4120 	if (old_size == new_size)
4121 		return;
4122 
4123 	nritems = btrfs_header_nritems(leaf);
4124 	data_end = leaf_data_end(root, leaf);
4125 
4126 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4127 
4128 	size_diff = old_size - new_size;
4129 
4130 	BUG_ON(slot < 0);
4131 	BUG_ON(slot >= nritems);
4132 
4133 	/*
4134 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4135 	 */
4136 	/* first correct the data pointers */
4137 	for (i = slot; i < nritems; i++) {
4138 		u32 ioff;
4139 		item = btrfs_item_nr(leaf, i);
4140 
4141 		ioff = btrfs_token_item_offset(leaf, item, &token);
4142 		btrfs_set_token_item_offset(leaf, item,
4143 					    ioff + size_diff, &token);
4144 	}
4145 
4146 	/* shift the data */
4147 	if (from_end) {
4148 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4149 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4150 			      data_end, old_data_start + new_size - data_end);
4151 	} else {
4152 		struct btrfs_disk_key disk_key;
4153 		u64 offset;
4154 
4155 		btrfs_item_key(leaf, &disk_key, slot);
4156 
4157 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4158 			unsigned long ptr;
4159 			struct btrfs_file_extent_item *fi;
4160 
4161 			fi = btrfs_item_ptr(leaf, slot,
4162 					    struct btrfs_file_extent_item);
4163 			fi = (struct btrfs_file_extent_item *)(
4164 			     (unsigned long)fi - size_diff);
4165 
4166 			if (btrfs_file_extent_type(leaf, fi) ==
4167 			    BTRFS_FILE_EXTENT_INLINE) {
4168 				ptr = btrfs_item_ptr_offset(leaf, slot);
4169 				memmove_extent_buffer(leaf, ptr,
4170 				      (unsigned long)fi,
4171 				      offsetof(struct btrfs_file_extent_item,
4172 						 disk_bytenr));
4173 			}
4174 		}
4175 
4176 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4177 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4178 			      data_end, old_data_start - data_end);
4179 
4180 		offset = btrfs_disk_key_offset(&disk_key);
4181 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4182 		btrfs_set_item_key(leaf, &disk_key, slot);
4183 		if (slot == 0)
4184 			fixup_low_keys(trans, root, path, &disk_key, 1);
4185 	}
4186 
4187 	item = btrfs_item_nr(leaf, slot);
4188 	btrfs_set_item_size(leaf, item, new_size);
4189 	btrfs_mark_buffer_dirty(leaf);
4190 
4191 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4192 		btrfs_print_leaf(root, leaf);
4193 		BUG();
4194 	}
4195 }
4196 
4197 /*
4198  * make the item pointed to by the path bigger, data_size is the new size.
4199  */
4200 void btrfs_extend_item(struct btrfs_trans_handle *trans,
4201 		       struct btrfs_root *root, struct btrfs_path *path,
4202 		       u32 data_size)
4203 {
4204 	int slot;
4205 	struct extent_buffer *leaf;
4206 	struct btrfs_item *item;
4207 	u32 nritems;
4208 	unsigned int data_end;
4209 	unsigned int old_data;
4210 	unsigned int old_size;
4211 	int i;
4212 	struct btrfs_map_token token;
4213 
4214 	btrfs_init_map_token(&token);
4215 
4216 	leaf = path->nodes[0];
4217 
4218 	nritems = btrfs_header_nritems(leaf);
4219 	data_end = leaf_data_end(root, leaf);
4220 
4221 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4222 		btrfs_print_leaf(root, leaf);
4223 		BUG();
4224 	}
4225 	slot = path->slots[0];
4226 	old_data = btrfs_item_end_nr(leaf, slot);
4227 
4228 	BUG_ON(slot < 0);
4229 	if (slot >= nritems) {
4230 		btrfs_print_leaf(root, leaf);
4231 		printk(KERN_CRIT "slot %d too large, nritems %d\n",
4232 		       slot, nritems);
4233 		BUG_ON(1);
4234 	}
4235 
4236 	/*
4237 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4238 	 */
4239 	/* first correct the data pointers */
4240 	for (i = slot; i < nritems; i++) {
4241 		u32 ioff;
4242 		item = btrfs_item_nr(leaf, i);
4243 
4244 		ioff = btrfs_token_item_offset(leaf, item, &token);
4245 		btrfs_set_token_item_offset(leaf, item,
4246 					    ioff - data_size, &token);
4247 	}
4248 
4249 	/* shift the data */
4250 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4251 		      data_end - data_size, btrfs_leaf_data(leaf) +
4252 		      data_end, old_data - data_end);
4253 
4254 	data_end = old_data;
4255 	old_size = btrfs_item_size_nr(leaf, slot);
4256 	item = btrfs_item_nr(leaf, slot);
4257 	btrfs_set_item_size(leaf, item, old_size + data_size);
4258 	btrfs_mark_buffer_dirty(leaf);
4259 
4260 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4261 		btrfs_print_leaf(root, leaf);
4262 		BUG();
4263 	}
4264 }
4265 
4266 /*
4267  * Given a key and some data, insert items into the tree.
4268  * This does all the path init required, making room in the tree if needed.
4269  * Returns the number of keys that were inserted.
4270  */
4271 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4272 			    struct btrfs_root *root,
4273 			    struct btrfs_path *path,
4274 			    struct btrfs_key *cpu_key, u32 *data_size,
4275 			    int nr)
4276 {
4277 	struct extent_buffer *leaf;
4278 	struct btrfs_item *item;
4279 	int ret = 0;
4280 	int slot;
4281 	int i;
4282 	u32 nritems;
4283 	u32 total_data = 0;
4284 	u32 total_size = 0;
4285 	unsigned int data_end;
4286 	struct btrfs_disk_key disk_key;
4287 	struct btrfs_key found_key;
4288 	struct btrfs_map_token token;
4289 
4290 	btrfs_init_map_token(&token);
4291 
4292 	for (i = 0; i < nr; i++) {
4293 		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4294 		    BTRFS_LEAF_DATA_SIZE(root)) {
4295 			break;
4296 			nr = i;
4297 		}
4298 		total_data += data_size[i];
4299 		total_size += data_size[i] + sizeof(struct btrfs_item);
4300 	}
4301 	BUG_ON(nr == 0);
4302 
4303 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4304 	if (ret == 0)
4305 		return -EEXIST;
4306 	if (ret < 0)
4307 		goto out;
4308 
4309 	leaf = path->nodes[0];
4310 
4311 	nritems = btrfs_header_nritems(leaf);
4312 	data_end = leaf_data_end(root, leaf);
4313 
4314 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4315 		for (i = nr; i >= 0; i--) {
4316 			total_data -= data_size[i];
4317 			total_size -= data_size[i] + sizeof(struct btrfs_item);
4318 			if (total_size < btrfs_leaf_free_space(root, leaf))
4319 				break;
4320 		}
4321 		nr = i;
4322 	}
4323 
4324 	slot = path->slots[0];
4325 	BUG_ON(slot < 0);
4326 
4327 	if (slot != nritems) {
4328 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4329 
4330 		item = btrfs_item_nr(leaf, slot);
4331 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
4332 
4333 		/* figure out how many keys we can insert in here */
4334 		total_data = data_size[0];
4335 		for (i = 1; i < nr; i++) {
4336 			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4337 				break;
4338 			total_data += data_size[i];
4339 		}
4340 		nr = i;
4341 
4342 		if (old_data < data_end) {
4343 			btrfs_print_leaf(root, leaf);
4344 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4345 			       slot, old_data, data_end);
4346 			BUG_ON(1);
4347 		}
4348 		/*
4349 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4350 		 */
4351 		/* first correct the data pointers */
4352 		for (i = slot; i < nritems; i++) {
4353 			u32 ioff;
4354 
4355 			item = btrfs_item_nr(leaf, i);
4356 			ioff = btrfs_token_item_offset(leaf, item, &token);
4357 			btrfs_set_token_item_offset(leaf, item,
4358 						    ioff - total_data, &token);
4359 		}
4360 		/* shift the items */
4361 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4362 			      btrfs_item_nr_offset(slot),
4363 			      (nritems - slot) * sizeof(struct btrfs_item));
4364 
4365 		/* shift the data */
4366 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4367 			      data_end - total_data, btrfs_leaf_data(leaf) +
4368 			      data_end, old_data - data_end);
4369 		data_end = old_data;
4370 	} else {
4371 		/*
4372 		 * this sucks but it has to be done, if we are inserting at
4373 		 * the end of the leaf only insert 1 of the items, since we
4374 		 * have no way of knowing whats on the next leaf and we'd have
4375 		 * to drop our current locks to figure it out
4376 		 */
4377 		nr = 1;
4378 	}
4379 
4380 	/* setup the item for the new data */
4381 	for (i = 0; i < nr; i++) {
4382 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4383 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4384 		item = btrfs_item_nr(leaf, slot + i);
4385 		btrfs_set_token_item_offset(leaf, item,
4386 					    data_end - data_size[i], &token);
4387 		data_end -= data_size[i];
4388 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4389 	}
4390 	btrfs_set_header_nritems(leaf, nritems + nr);
4391 	btrfs_mark_buffer_dirty(leaf);
4392 
4393 	ret = 0;
4394 	if (slot == 0) {
4395 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4396 		fixup_low_keys(trans, root, path, &disk_key, 1);
4397 	}
4398 
4399 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4400 		btrfs_print_leaf(root, leaf);
4401 		BUG();
4402 	}
4403 out:
4404 	if (!ret)
4405 		ret = nr;
4406 	return ret;
4407 }
4408 
4409 /*
4410  * this is a helper for btrfs_insert_empty_items, the main goal here is
4411  * to save stack depth by doing the bulk of the work in a function
4412  * that doesn't call btrfs_search_slot
4413  */
4414 void setup_items_for_insert(struct btrfs_trans_handle *trans,
4415 			    struct btrfs_root *root, struct btrfs_path *path,
4416 			    struct btrfs_key *cpu_key, u32 *data_size,
4417 			    u32 total_data, u32 total_size, int nr)
4418 {
4419 	struct btrfs_item *item;
4420 	int i;
4421 	u32 nritems;
4422 	unsigned int data_end;
4423 	struct btrfs_disk_key disk_key;
4424 	struct extent_buffer *leaf;
4425 	int slot;
4426 	struct btrfs_map_token token;
4427 
4428 	btrfs_init_map_token(&token);
4429 
4430 	leaf = path->nodes[0];
4431 	slot = path->slots[0];
4432 
4433 	nritems = btrfs_header_nritems(leaf);
4434 	data_end = leaf_data_end(root, leaf);
4435 
4436 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4437 		btrfs_print_leaf(root, leaf);
4438 		printk(KERN_CRIT "not enough freespace need %u have %d\n",
4439 		       total_size, btrfs_leaf_free_space(root, leaf));
4440 		BUG();
4441 	}
4442 
4443 	if (slot != nritems) {
4444 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4445 
4446 		if (old_data < data_end) {
4447 			btrfs_print_leaf(root, leaf);
4448 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4449 			       slot, old_data, data_end);
4450 			BUG_ON(1);
4451 		}
4452 		/*
4453 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4454 		 */
4455 		/* first correct the data pointers */
4456 		for (i = slot; i < nritems; i++) {
4457 			u32 ioff;
4458 
4459 			item = btrfs_item_nr(leaf, i);
4460 			ioff = btrfs_token_item_offset(leaf, item, &token);
4461 			btrfs_set_token_item_offset(leaf, item,
4462 						    ioff - total_data, &token);
4463 		}
4464 		/* shift the items */
4465 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4466 			      btrfs_item_nr_offset(slot),
4467 			      (nritems - slot) * sizeof(struct btrfs_item));
4468 
4469 		/* shift the data */
4470 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4471 			      data_end - total_data, btrfs_leaf_data(leaf) +
4472 			      data_end, old_data - data_end);
4473 		data_end = old_data;
4474 	}
4475 
4476 	/* setup the item for the new data */
4477 	for (i = 0; i < nr; i++) {
4478 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4479 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4480 		item = btrfs_item_nr(leaf, slot + i);
4481 		btrfs_set_token_item_offset(leaf, item,
4482 					    data_end - data_size[i], &token);
4483 		data_end -= data_size[i];
4484 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4485 	}
4486 
4487 	btrfs_set_header_nritems(leaf, nritems + nr);
4488 
4489 	if (slot == 0) {
4490 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4491 		fixup_low_keys(trans, root, path, &disk_key, 1);
4492 	}
4493 	btrfs_unlock_up_safe(path, 1);
4494 	btrfs_mark_buffer_dirty(leaf);
4495 
4496 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4497 		btrfs_print_leaf(root, leaf);
4498 		BUG();
4499 	}
4500 }
4501 
4502 /*
4503  * Given a key and some data, insert items into the tree.
4504  * This does all the path init required, making room in the tree if needed.
4505  */
4506 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4507 			    struct btrfs_root *root,
4508 			    struct btrfs_path *path,
4509 			    struct btrfs_key *cpu_key, u32 *data_size,
4510 			    int nr)
4511 {
4512 	int ret = 0;
4513 	int slot;
4514 	int i;
4515 	u32 total_size = 0;
4516 	u32 total_data = 0;
4517 
4518 	for (i = 0; i < nr; i++)
4519 		total_data += data_size[i];
4520 
4521 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4522 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4523 	if (ret == 0)
4524 		return -EEXIST;
4525 	if (ret < 0)
4526 		return ret;
4527 
4528 	slot = path->slots[0];
4529 	BUG_ON(slot < 0);
4530 
4531 	setup_items_for_insert(trans, root, path, cpu_key, data_size,
4532 			       total_data, total_size, nr);
4533 	return 0;
4534 }
4535 
4536 /*
4537  * Given a key and some data, insert an item into the tree.
4538  * This does all the path init required, making room in the tree if needed.
4539  */
4540 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4541 		      *root, struct btrfs_key *cpu_key, void *data, u32
4542 		      data_size)
4543 {
4544 	int ret = 0;
4545 	struct btrfs_path *path;
4546 	struct extent_buffer *leaf;
4547 	unsigned long ptr;
4548 
4549 	path = btrfs_alloc_path();
4550 	if (!path)
4551 		return -ENOMEM;
4552 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4553 	if (!ret) {
4554 		leaf = path->nodes[0];
4555 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4556 		write_extent_buffer(leaf, data, ptr, data_size);
4557 		btrfs_mark_buffer_dirty(leaf);
4558 	}
4559 	btrfs_free_path(path);
4560 	return ret;
4561 }
4562 
4563 /*
4564  * delete the pointer from a given node.
4565  *
4566  * the tree should have been previously balanced so the deletion does not
4567  * empty a node.
4568  */
4569 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4570 		    struct btrfs_path *path, int level, int slot,
4571 		    int tree_mod_log)
4572 {
4573 	struct extent_buffer *parent = path->nodes[level];
4574 	u32 nritems;
4575 	int ret;
4576 
4577 	nritems = btrfs_header_nritems(parent);
4578 	if (slot != nritems - 1) {
4579 		if (tree_mod_log && level)
4580 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4581 					     slot + 1, nritems - slot - 1);
4582 		memmove_extent_buffer(parent,
4583 			      btrfs_node_key_ptr_offset(slot),
4584 			      btrfs_node_key_ptr_offset(slot + 1),
4585 			      sizeof(struct btrfs_key_ptr) *
4586 			      (nritems - slot - 1));
4587 	} else if (tree_mod_log && level) {
4588 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4589 					      MOD_LOG_KEY_REMOVE);
4590 		BUG_ON(ret < 0);
4591 	}
4592 
4593 	nritems--;
4594 	btrfs_set_header_nritems(parent, nritems);
4595 	if (nritems == 0 && parent == root->node) {
4596 		BUG_ON(btrfs_header_level(root->node) != 1);
4597 		/* just turn the root into a leaf and break */
4598 		btrfs_set_header_level(root->node, 0);
4599 	} else if (slot == 0) {
4600 		struct btrfs_disk_key disk_key;
4601 
4602 		btrfs_node_key(parent, &disk_key, 0);
4603 		fixup_low_keys(trans, root, path, &disk_key, level + 1);
4604 	}
4605 	btrfs_mark_buffer_dirty(parent);
4606 }
4607 
4608 /*
4609  * a helper function to delete the leaf pointed to by path->slots[1] and
4610  * path->nodes[1].
4611  *
4612  * This deletes the pointer in path->nodes[1] and frees the leaf
4613  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4614  *
4615  * The path must have already been setup for deleting the leaf, including
4616  * all the proper balancing.  path->nodes[1] must be locked.
4617  */
4618 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4619 				    struct btrfs_root *root,
4620 				    struct btrfs_path *path,
4621 				    struct extent_buffer *leaf)
4622 {
4623 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4624 	del_ptr(trans, root, path, 1, path->slots[1], 1);
4625 
4626 	/*
4627 	 * btrfs_free_extent is expensive, we want to make sure we
4628 	 * aren't holding any locks when we call it
4629 	 */
4630 	btrfs_unlock_up_safe(path, 0);
4631 
4632 	root_sub_used(root, leaf->len);
4633 
4634 	extent_buffer_get(leaf);
4635 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4636 	free_extent_buffer_stale(leaf);
4637 }
4638 /*
4639  * delete the item at the leaf level in path.  If that empties
4640  * the leaf, remove it from the tree
4641  */
4642 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4643 		    struct btrfs_path *path, int slot, int nr)
4644 {
4645 	struct extent_buffer *leaf;
4646 	struct btrfs_item *item;
4647 	int last_off;
4648 	int dsize = 0;
4649 	int ret = 0;
4650 	int wret;
4651 	int i;
4652 	u32 nritems;
4653 	struct btrfs_map_token token;
4654 
4655 	btrfs_init_map_token(&token);
4656 
4657 	leaf = path->nodes[0];
4658 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4659 
4660 	for (i = 0; i < nr; i++)
4661 		dsize += btrfs_item_size_nr(leaf, slot + i);
4662 
4663 	nritems = btrfs_header_nritems(leaf);
4664 
4665 	if (slot + nr != nritems) {
4666 		int data_end = leaf_data_end(root, leaf);
4667 
4668 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4669 			      data_end + dsize,
4670 			      btrfs_leaf_data(leaf) + data_end,
4671 			      last_off - data_end);
4672 
4673 		for (i = slot + nr; i < nritems; i++) {
4674 			u32 ioff;
4675 
4676 			item = btrfs_item_nr(leaf, i);
4677 			ioff = btrfs_token_item_offset(leaf, item, &token);
4678 			btrfs_set_token_item_offset(leaf, item,
4679 						    ioff + dsize, &token);
4680 		}
4681 
4682 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4683 			      btrfs_item_nr_offset(slot + nr),
4684 			      sizeof(struct btrfs_item) *
4685 			      (nritems - slot - nr));
4686 	}
4687 	btrfs_set_header_nritems(leaf, nritems - nr);
4688 	nritems -= nr;
4689 
4690 	/* delete the leaf if we've emptied it */
4691 	if (nritems == 0) {
4692 		if (leaf == root->node) {
4693 			btrfs_set_header_level(leaf, 0);
4694 		} else {
4695 			btrfs_set_path_blocking(path);
4696 			clean_tree_block(trans, root, leaf);
4697 			btrfs_del_leaf(trans, root, path, leaf);
4698 		}
4699 	} else {
4700 		int used = leaf_space_used(leaf, 0, nritems);
4701 		if (slot == 0) {
4702 			struct btrfs_disk_key disk_key;
4703 
4704 			btrfs_item_key(leaf, &disk_key, 0);
4705 			fixup_low_keys(trans, root, path, &disk_key, 1);
4706 		}
4707 
4708 		/* delete the leaf if it is mostly empty */
4709 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4710 			/* push_leaf_left fixes the path.
4711 			 * make sure the path still points to our leaf
4712 			 * for possible call to del_ptr below
4713 			 */
4714 			slot = path->slots[1];
4715 			extent_buffer_get(leaf);
4716 
4717 			btrfs_set_path_blocking(path);
4718 			wret = push_leaf_left(trans, root, path, 1, 1,
4719 					      1, (u32)-1);
4720 			if (wret < 0 && wret != -ENOSPC)
4721 				ret = wret;
4722 
4723 			if (path->nodes[0] == leaf &&
4724 			    btrfs_header_nritems(leaf)) {
4725 				wret = push_leaf_right(trans, root, path, 1,
4726 						       1, 1, 0);
4727 				if (wret < 0 && wret != -ENOSPC)
4728 					ret = wret;
4729 			}
4730 
4731 			if (btrfs_header_nritems(leaf) == 0) {
4732 				path->slots[1] = slot;
4733 				btrfs_del_leaf(trans, root, path, leaf);
4734 				free_extent_buffer(leaf);
4735 				ret = 0;
4736 			} else {
4737 				/* if we're still in the path, make sure
4738 				 * we're dirty.  Otherwise, one of the
4739 				 * push_leaf functions must have already
4740 				 * dirtied this buffer
4741 				 */
4742 				if (path->nodes[0] == leaf)
4743 					btrfs_mark_buffer_dirty(leaf);
4744 				free_extent_buffer(leaf);
4745 			}
4746 		} else {
4747 			btrfs_mark_buffer_dirty(leaf);
4748 		}
4749 	}
4750 	return ret;
4751 }
4752 
4753 /*
4754  * search the tree again to find a leaf with lesser keys
4755  * returns 0 if it found something or 1 if there are no lesser leaves.
4756  * returns < 0 on io errors.
4757  *
4758  * This may release the path, and so you may lose any locks held at the
4759  * time you call it.
4760  */
4761 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4762 {
4763 	struct btrfs_key key;
4764 	struct btrfs_disk_key found_key;
4765 	int ret;
4766 
4767 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4768 
4769 	if (key.offset > 0)
4770 		key.offset--;
4771 	else if (key.type > 0)
4772 		key.type--;
4773 	else if (key.objectid > 0)
4774 		key.objectid--;
4775 	else
4776 		return 1;
4777 
4778 	btrfs_release_path(path);
4779 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4780 	if (ret < 0)
4781 		return ret;
4782 	btrfs_item_key(path->nodes[0], &found_key, 0);
4783 	ret = comp_keys(&found_key, &key);
4784 	if (ret < 0)
4785 		return 0;
4786 	return 1;
4787 }
4788 
4789 /*
4790  * A helper function to walk down the tree starting at min_key, and looking
4791  * for nodes or leaves that are either in cache or have a minimum
4792  * transaction id.  This is used by the btree defrag code, and tree logging
4793  *
4794  * This does not cow, but it does stuff the starting key it finds back
4795  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4796  * key and get a writable path.
4797  *
4798  * This does lock as it descends, and path->keep_locks should be set
4799  * to 1 by the caller.
4800  *
4801  * This honors path->lowest_level to prevent descent past a given level
4802  * of the tree.
4803  *
4804  * min_trans indicates the oldest transaction that you are interested
4805  * in walking through.  Any nodes or leaves older than min_trans are
4806  * skipped over (without reading them).
4807  *
4808  * returns zero if something useful was found, < 0 on error and 1 if there
4809  * was nothing in the tree that matched the search criteria.
4810  */
4811 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4812 			 struct btrfs_key *max_key,
4813 			 struct btrfs_path *path, int cache_only,
4814 			 u64 min_trans)
4815 {
4816 	struct extent_buffer *cur;
4817 	struct btrfs_key found_key;
4818 	int slot;
4819 	int sret;
4820 	u32 nritems;
4821 	int level;
4822 	int ret = 1;
4823 
4824 	WARN_ON(!path->keep_locks);
4825 again:
4826 	cur = btrfs_read_lock_root_node(root);
4827 	level = btrfs_header_level(cur);
4828 	WARN_ON(path->nodes[level]);
4829 	path->nodes[level] = cur;
4830 	path->locks[level] = BTRFS_READ_LOCK;
4831 
4832 	if (btrfs_header_generation(cur) < min_trans) {
4833 		ret = 1;
4834 		goto out;
4835 	}
4836 	while (1) {
4837 		nritems = btrfs_header_nritems(cur);
4838 		level = btrfs_header_level(cur);
4839 		sret = bin_search(cur, min_key, level, &slot);
4840 
4841 		/* at the lowest level, we're done, setup the path and exit */
4842 		if (level == path->lowest_level) {
4843 			if (slot >= nritems)
4844 				goto find_next_key;
4845 			ret = 0;
4846 			path->slots[level] = slot;
4847 			btrfs_item_key_to_cpu(cur, &found_key, slot);
4848 			goto out;
4849 		}
4850 		if (sret && slot > 0)
4851 			slot--;
4852 		/*
4853 		 * check this node pointer against the cache_only and
4854 		 * min_trans parameters.  If it isn't in cache or is too
4855 		 * old, skip to the next one.
4856 		 */
4857 		while (slot < nritems) {
4858 			u64 blockptr;
4859 			u64 gen;
4860 			struct extent_buffer *tmp;
4861 			struct btrfs_disk_key disk_key;
4862 
4863 			blockptr = btrfs_node_blockptr(cur, slot);
4864 			gen = btrfs_node_ptr_generation(cur, slot);
4865 			if (gen < min_trans) {
4866 				slot++;
4867 				continue;
4868 			}
4869 			if (!cache_only)
4870 				break;
4871 
4872 			if (max_key) {
4873 				btrfs_node_key(cur, &disk_key, slot);
4874 				if (comp_keys(&disk_key, max_key) >= 0) {
4875 					ret = 1;
4876 					goto out;
4877 				}
4878 			}
4879 
4880 			tmp = btrfs_find_tree_block(root, blockptr,
4881 					    btrfs_level_size(root, level - 1));
4882 
4883 			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
4884 				free_extent_buffer(tmp);
4885 				break;
4886 			}
4887 			if (tmp)
4888 				free_extent_buffer(tmp);
4889 			slot++;
4890 		}
4891 find_next_key:
4892 		/*
4893 		 * we didn't find a candidate key in this node, walk forward
4894 		 * and find another one
4895 		 */
4896 		if (slot >= nritems) {
4897 			path->slots[level] = slot;
4898 			btrfs_set_path_blocking(path);
4899 			sret = btrfs_find_next_key(root, path, min_key, level,
4900 						  cache_only, min_trans);
4901 			if (sret == 0) {
4902 				btrfs_release_path(path);
4903 				goto again;
4904 			} else {
4905 				goto out;
4906 			}
4907 		}
4908 		/* save our key for returning back */
4909 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4910 		path->slots[level] = slot;
4911 		if (level == path->lowest_level) {
4912 			ret = 0;
4913 			unlock_up(path, level, 1, 0, NULL);
4914 			goto out;
4915 		}
4916 		btrfs_set_path_blocking(path);
4917 		cur = read_node_slot(root, cur, slot);
4918 		BUG_ON(!cur); /* -ENOMEM */
4919 
4920 		btrfs_tree_read_lock(cur);
4921 
4922 		path->locks[level - 1] = BTRFS_READ_LOCK;
4923 		path->nodes[level - 1] = cur;
4924 		unlock_up(path, level, 1, 0, NULL);
4925 		btrfs_clear_path_blocking(path, NULL, 0);
4926 	}
4927 out:
4928 	if (ret == 0)
4929 		memcpy(min_key, &found_key, sizeof(found_key));
4930 	btrfs_set_path_blocking(path);
4931 	return ret;
4932 }
4933 
4934 /*
4935  * this is similar to btrfs_next_leaf, but does not try to preserve
4936  * and fixup the path.  It looks for and returns the next key in the
4937  * tree based on the current path and the cache_only and min_trans
4938  * parameters.
4939  *
4940  * 0 is returned if another key is found, < 0 if there are any errors
4941  * and 1 is returned if there are no higher keys in the tree
4942  *
4943  * path->keep_locks should be set to 1 on the search made before
4944  * calling this function.
4945  */
4946 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4947 			struct btrfs_key *key, int level,
4948 			int cache_only, u64 min_trans)
4949 {
4950 	int slot;
4951 	struct extent_buffer *c;
4952 
4953 	WARN_ON(!path->keep_locks);
4954 	while (level < BTRFS_MAX_LEVEL) {
4955 		if (!path->nodes[level])
4956 			return 1;
4957 
4958 		slot = path->slots[level] + 1;
4959 		c = path->nodes[level];
4960 next:
4961 		if (slot >= btrfs_header_nritems(c)) {
4962 			int ret;
4963 			int orig_lowest;
4964 			struct btrfs_key cur_key;
4965 			if (level + 1 >= BTRFS_MAX_LEVEL ||
4966 			    !path->nodes[level + 1])
4967 				return 1;
4968 
4969 			if (path->locks[level + 1]) {
4970 				level++;
4971 				continue;
4972 			}
4973 
4974 			slot = btrfs_header_nritems(c) - 1;
4975 			if (level == 0)
4976 				btrfs_item_key_to_cpu(c, &cur_key, slot);
4977 			else
4978 				btrfs_node_key_to_cpu(c, &cur_key, slot);
4979 
4980 			orig_lowest = path->lowest_level;
4981 			btrfs_release_path(path);
4982 			path->lowest_level = level;
4983 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
4984 						0, 0);
4985 			path->lowest_level = orig_lowest;
4986 			if (ret < 0)
4987 				return ret;
4988 
4989 			c = path->nodes[level];
4990 			slot = path->slots[level];
4991 			if (ret == 0)
4992 				slot++;
4993 			goto next;
4994 		}
4995 
4996 		if (level == 0)
4997 			btrfs_item_key_to_cpu(c, key, slot);
4998 		else {
4999 			u64 blockptr = btrfs_node_blockptr(c, slot);
5000 			u64 gen = btrfs_node_ptr_generation(c, slot);
5001 
5002 			if (cache_only) {
5003 				struct extent_buffer *cur;
5004 				cur = btrfs_find_tree_block(root, blockptr,
5005 					    btrfs_level_size(root, level - 1));
5006 				if (!cur ||
5007 				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
5008 					slot++;
5009 					if (cur)
5010 						free_extent_buffer(cur);
5011 					goto next;
5012 				}
5013 				free_extent_buffer(cur);
5014 			}
5015 			if (gen < min_trans) {
5016 				slot++;
5017 				goto next;
5018 			}
5019 			btrfs_node_key_to_cpu(c, key, slot);
5020 		}
5021 		return 0;
5022 	}
5023 	return 1;
5024 }
5025 
5026 /*
5027  * search the tree again to find a leaf with greater keys
5028  * returns 0 if it found something or 1 if there are no greater leaves.
5029  * returns < 0 on io errors.
5030  */
5031 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5032 {
5033 	return btrfs_next_old_leaf(root, path, 0);
5034 }
5035 
/*
 * like btrfs_next_leaf(), but when time_seq is non-zero the search is
 * done with btrfs_search_old_slot(); presumably this walks the tree as
 * it was at that tree mod log sequence number — confirm against the
 * tree mod log code.
 *
 * returns 0 when a next leaf was found, 1 when there is none, < 0 on
 * error.  The path is re-searched from scratch, so any locks held at
 * entry are dropped.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key in the current leaf; we re-search for it */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}

	/* walk up until we find a node with a slot to the right of us */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* walk back down the left-most edge of the next subtree */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5189 
5190 /*
5191  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5192  * searching until it gets past min_objectid or finds an item of 'type'
5193  *
5194  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5195  */
5196 int btrfs_previous_item(struct btrfs_root *root,
5197 			struct btrfs_path *path, u64 min_objectid,
5198 			int type)
5199 {
5200 	struct btrfs_key found_key;
5201 	struct extent_buffer *leaf;
5202 	u32 nritems;
5203 	int ret;
5204 
5205 	while (1) {
5206 		if (path->slots[0] == 0) {
5207 			btrfs_set_path_blocking(path);
5208 			ret = btrfs_prev_leaf(root, path);
5209 			if (ret != 0)
5210 				return ret;
5211 		} else {
5212 			path->slots[0]--;
5213 		}
5214 		leaf = path->nodes[0];
5215 		nritems = btrfs_header_nritems(leaf);
5216 		if (nritems == 0)
5217 			return 1;
5218 		if (path->slots[0] == nritems)
5219 			path->slots[0]--;
5220 
5221 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5222 		if (found_key.objectid < min_objectid)
5223 			break;
5224 		if (found_key.type == type)
5225 			return 0;
5226 		if (found_key.objectid == min_objectid &&
5227 		    found_key.type < type)
5228 			break;
5229 	}
5230 	return 1;
5231 }
5232