xref: /freebsd/sys/contrib/openzfs/module/avl/avl.c (revision 4731124cace5e7a0224e29784617d2856e5c59ab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
28  * Copyright (c) 2015 by Delphix. All rights reserved.
29  */
30 
31 /*
32  * AVL - generic AVL tree implementation for kernel use
33  *
34  * A complete description of AVL trees can be found in many CS textbooks.
35  *
36  * Here is a very brief overview. An AVL tree is a binary search tree that is
37  * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
38  * any given node, the left and right subtrees are allowed to differ in height
39  * by at most 1 level.
40  *
41  * This relaxation from a perfectly balanced binary tree allows doing
42  * insertion and deletion relatively efficiently. Searching the tree is
43  * still a fast operation, roughly O(log(N)).
44  *
45  * The key to insertion and deletion is a set of tree manipulations called
46  * rotations, which bring unbalanced subtrees back into the semi-balanced state.
47  *
48  * This implementation of AVL trees has the following peculiarities:
49  *
50  *	- The AVL specific data structures are physically embedded as fields
51  *	  in the "using" data structures.  To maintain generality the code
52  *	  must constantly translate between "avl_node_t *" and containing
53  *	  data structure "void *"s by adding/subtracting the avl_offset.
54  *
55  *	- Since the AVL data is always embedded in other structures, there is
56  *	  no locking or memory allocation in the AVL routines. This must be
57  *	  provided for by the enclosing data structure's semantics. Typically,
58  *	  avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
59  *	  exclusive write lock. Other operations require a read lock.
60  *
61  *      - The implementation uses iteration instead of explicit recursion,
62  *	  since it is intended to run on limited size kernel stacks. Since
63  *	  there is no recursion stack present to move "up" in the tree,
64  *	  there is an explicit "parent" link in the avl_node_t.
65  *
66  *      - The left/right children pointers of a node are in an array.
67  *	  In the code, variables (instead of constants) are used to represent
68  *	  left and right indices.  The implementation is written as if it only
69  *	  dealt with left handed manipulations.  By changing the value assigned
70  *	  to "left", the code also works for right handed trees.  The
71  *	  following variables/terms are frequently used:
72  *
73  *		int left;	// 0 when dealing with left children,
74  *				// 1 for dealing with right children
75  *
76  *		int left_heavy;	// -1 when left subtree is taller at some node,
77  *				// +1 when right subtree is taller
78  *
79  *		int right;	// will be the opposite of left (0 or 1)
80  *		int right_heavy;// will be the opposite of left_heavy (-1 or 1)
81  *
82  *		int direction;  // 0 for "<" (ie. left child); 1 for ">" (right)
83  *
84  *	  Though it is a little more confusing to read the code, the approach
85  *	  allows using half as much code (and hence cache footprint) for tree
86  *	  manipulations and eliminates many conditional branches.
87  *
88  *	- The avl_index_t is an opaque "cookie" used to find nodes at or
89  *	  adjacent to where a new value would be inserted in the tree. The value
90  *	  is a modified "avl_node_t *".  The bottom bit (normally 0 for a
91  *	  pointer) is set to indicate that the new node has a value greater
92  *	  than the value of the indicated "avl_node_t *".
93  *
94  * Note - in addition to userland (e.g. libavl and libutil) and the kernel
95  * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
96  * which each have their own compilation environments and subsequent
97  * requirements. Each of these environments must be considered when adding
98  * dependencies from avl.c.
99  *
100  * Link to Illumos.org for more information on avl function:
101  * [1] https://illumos.org/man/9f/avl
102  */
103 
104 #include <sys/types.h>
105 #include <sys/param.h>
106 #include <sys/debug.h>
107 #include <sys/avl.h>
108 #include <sys/cmn_err.h>
109 #include <sys/mod.h>
110 
/*
 * Lookup tables translating between balance (or comparison "diff") values
 * and child indices.
 *
 * Binary tree code picks the left or right child in an essentially random
 * pattern, and C "if()" statements that evaluate randomly suffer from very
 * poor hardware branch prediction.  Indexing one of these tiny tables
 * replaces such a branch with one additional memory reference; since both
 * tables are very small the data should remain in cache.
 */

/* child index (0 = left, 1 = right) -> balance contribution (-1 or +1) */
static const int  avl_child2balance[2]	= {-1, 1};

/* (diff or balance) + 1, i.e. 0..2 -> child index; -1 and 0 both select 0 */
static const int  avl_balance2child[3]	= {0, 0, 1};
124 
125 
126 /*
127  * Walk from one node to the previous valued node (ie. an infix walk
128  * towards the left). At any given node we do one of 2 things:
129  *
130  * - If there is a left child, go to it, then to it's rightmost descendant.
131  *
132  * - otherwise we return through parent nodes until we've come from a right
133  *   child.
134  *
135  * Return Value:
136  * NULL - if at the end of the nodes
137  * otherwise next node
138  */
139 void *
140 avl_walk(avl_tree_t *tree, void	*oldnode, int left)
141 {
142 	size_t off = tree->avl_offset;
143 	avl_node_t *node = AVL_DATA2NODE(oldnode, off);
144 	int right = 1 - left;
145 	int was_child;
146 
147 
148 	/*
149 	 * nowhere to walk to if tree is empty
150 	 */
151 	if (node == NULL)
152 		return (NULL);
153 
154 	/*
155 	 * Visit the previous valued node. There are two possibilities:
156 	 *
157 	 * If this node has a left child, go down one left, then all
158 	 * the way right.
159 	 */
160 	if (node->avl_child[left] != NULL) {
161 		for (node = node->avl_child[left];
162 		    node->avl_child[right] != NULL;
163 		    node = node->avl_child[right])
164 			;
165 	/*
166 	 * Otherwise, return through left children as far as we can.
167 	 */
168 	} else {
169 		for (;;) {
170 			was_child = AVL_XCHILD(node);
171 			node = AVL_XPARENT(node);
172 			if (node == NULL)
173 				return (NULL);
174 			if (was_child == right)
175 				break;
176 		}
177 	}
178 
179 	return (AVL_NODE2DATA(node, off));
180 }
181 
182 /*
183  * Return the lowest valued node in a tree or NULL.
184  * (leftmost child from root of tree)
185  */
186 void *
187 avl_first(avl_tree_t *tree)
188 {
189 	avl_node_t *node;
190 	avl_node_t *prev = NULL;
191 	size_t off = tree->avl_offset;
192 
193 	for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
194 		prev = node;
195 
196 	if (prev != NULL)
197 		return (AVL_NODE2DATA(prev, off));
198 	return (NULL);
199 }
200 
201 /*
202  * Return the highest valued node in a tree or NULL.
203  * (rightmost child from root of tree)
204  */
205 void *
206 avl_last(avl_tree_t *tree)
207 {
208 	avl_node_t *node;
209 	avl_node_t *prev = NULL;
210 	size_t off = tree->avl_offset;
211 
212 	for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
213 		prev = node;
214 
215 	if (prev != NULL)
216 		return (AVL_NODE2DATA(prev, off));
217 	return (NULL);
218 }
219 
220 /*
221  * Access the node immediately before or after an insertion point.
222  *
223  * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
224  *
225  * Return value:
226  *	NULL: no node in the given direction
227  *	"void *"  of the found tree node
228  */
229 void *
230 avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
231 {
232 	int child = AVL_INDEX2CHILD(where);
233 	avl_node_t *node = AVL_INDEX2NODE(where);
234 	void *data;
235 	size_t off = tree->avl_offset;
236 
237 	if (node == NULL) {
238 		ASSERT(tree->avl_root == NULL);
239 		return (NULL);
240 	}
241 	data = AVL_NODE2DATA(node, off);
242 	if (child != direction)
243 		return (data);
244 
245 	return (avl_walk(tree, data, direction));
246 }
247 
248 
249 /*
250  * Search for the node which contains "value".  The algorithm is a
251  * simple binary tree search.
252  *
253  * return value:
254  *	NULL: the value is not in the AVL tree
255  *		*where (if not NULL)  is set to indicate the insertion point
256  *	"void *"  of the found tree node
257  */
258 void *
259 avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
260 {
261 	avl_node_t *node;
262 	avl_node_t *prev = NULL;
263 	int child = 0;
264 	int diff;
265 	size_t off = tree->avl_offset;
266 
267 	for (node = tree->avl_root; node != NULL;
268 	    node = node->avl_child[child]) {
269 
270 		prev = node;
271 
272 		diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
273 		ASSERT(-1 <= diff && diff <= 1);
274 		if (diff == 0) {
275 #ifdef ZFS_DEBUG
276 			if (where != NULL)
277 				*where = 0;
278 #endif
279 			return (AVL_NODE2DATA(node, off));
280 		}
281 		child = avl_balance2child[1 + diff];
282 
283 	}
284 
285 	if (where != NULL)
286 		*where = AVL_MKINDEX(prev, child);
287 
288 	return (NULL);
289 }
290 
291 
292 /*
293  * Perform a rotation to restore balance at the subtree given by depth.
294  *
295  * This routine is used by both insertion and deletion. The return value
296  * indicates:
297  *	 0 : subtree did not change height
298  *	!0 : subtree was reduced in height
299  *
300  * The code is written as if handling left rotations, right rotations are
301  * symmetric and handled by swapping values of variables right/left[_heavy]
302  *
303  * On input balance is the "new" balance at "node". This value is either
304  * -2 or +2.
305  */
306 static int
307 avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
308 {
309 	int left = !(balance < 0);	/* when balance = -2, left will be 0 */
310 	int right = 1 - left;
311 	int left_heavy = balance >> 1;
312 	int right_heavy = -left_heavy;
313 	avl_node_t *parent = AVL_XPARENT(node);
314 	avl_node_t *child = node->avl_child[left];
315 	avl_node_t *cright;
316 	avl_node_t *gchild;
317 	avl_node_t *gright;
318 	avl_node_t *gleft;
319 	int which_child = AVL_XCHILD(node);
320 	int child_bal = AVL_XBALANCE(child);
321 
322 	/* BEGIN CSTYLED */
323 	/*
324 	 * case 1 : node is overly left heavy, the left child is balanced or
325 	 * also left heavy. This requires the following rotation.
326 	 *
327 	 *                   (node bal:-2)
328 	 *                    /           \
329 	 *                   /             \
330 	 *              (child bal:0 or -1)
331 	 *              /    \
332 	 *             /      \
333 	 *                     cright
334 	 *
335 	 * becomes:
336 	 *
337 	 *              (child bal:1 or 0)
338 	 *              /        \
339 	 *             /          \
340 	 *                        (node bal:-1 or 0)
341 	 *                         /     \
342 	 *                        /       \
343 	 *                     cright
344 	 *
345 	 * we detect this situation by noting that child's balance is not
346 	 * right_heavy.
347 	 */
348 	/* END CSTYLED */
349 	if (child_bal != right_heavy) {
350 
351 		/*
352 		 * compute new balance of nodes
353 		 *
354 		 * If child used to be left heavy (now balanced) we reduced
355 		 * the height of this sub-tree -- used in "return...;" below
356 		 */
357 		child_bal += right_heavy; /* adjust towards right */
358 
359 		/*
360 		 * move "cright" to be node's left child
361 		 */
362 		cright = child->avl_child[right];
363 		node->avl_child[left] = cright;
364 		if (cright != NULL) {
365 			AVL_SETPARENT(cright, node);
366 			AVL_SETCHILD(cright, left);
367 		}
368 
369 		/*
370 		 * move node to be child's right child
371 		 */
372 		child->avl_child[right] = node;
373 		AVL_SETBALANCE(node, -child_bal);
374 		AVL_SETCHILD(node, right);
375 		AVL_SETPARENT(node, child);
376 
377 		/*
378 		 * update the pointer into this subtree
379 		 */
380 		AVL_SETBALANCE(child, child_bal);
381 		AVL_SETCHILD(child, which_child);
382 		AVL_SETPARENT(child, parent);
383 		if (parent != NULL)
384 			parent->avl_child[which_child] = child;
385 		else
386 			tree->avl_root = child;
387 
388 		return (child_bal == 0);
389 	}
390 
391 	/* BEGIN CSTYLED */
392 	/*
393 	 * case 2 : When node is left heavy, but child is right heavy we use
394 	 * a different rotation.
395 	 *
396 	 *                   (node b:-2)
397 	 *                    /   \
398 	 *                   /     \
399 	 *                  /       \
400 	 *             (child b:+1)
401 	 *              /     \
402 	 *             /       \
403 	 *                   (gchild b: != 0)
404 	 *                     /  \
405 	 *                    /    \
406 	 *                 gleft   gright
407 	 *
408 	 * becomes:
409 	 *
410 	 *              (gchild b:0)
411 	 *              /       \
412 	 *             /         \
413 	 *            /           \
414 	 *        (child b:?)   (node b:?)
415 	 *         /  \          /   \
416 	 *        /    \        /     \
417 	 *            gleft   gright
418 	 *
419 	 * computing the new balances is more complicated. As an example:
420 	 *	 if gchild was right_heavy, then child is now left heavy
421 	 *		else it is balanced
422 	 */
423 	/* END CSTYLED */
424 	gchild = child->avl_child[right];
425 	gleft = gchild->avl_child[left];
426 	gright = gchild->avl_child[right];
427 
428 	/*
429 	 * move gright to left child of node and
430 	 *
431 	 * move gleft to right child of node
432 	 */
433 	node->avl_child[left] = gright;
434 	if (gright != NULL) {
435 		AVL_SETPARENT(gright, node);
436 		AVL_SETCHILD(gright, left);
437 	}
438 
439 	child->avl_child[right] = gleft;
440 	if (gleft != NULL) {
441 		AVL_SETPARENT(gleft, child);
442 		AVL_SETCHILD(gleft, right);
443 	}
444 
445 	/*
446 	 * move child to left child of gchild and
447 	 *
448 	 * move node to right child of gchild and
449 	 *
450 	 * fixup parent of all this to point to gchild
451 	 */
452 	balance = AVL_XBALANCE(gchild);
453 	gchild->avl_child[left] = child;
454 	AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
455 	AVL_SETPARENT(child, gchild);
456 	AVL_SETCHILD(child, left);
457 
458 	gchild->avl_child[right] = node;
459 	AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
460 	AVL_SETPARENT(node, gchild);
461 	AVL_SETCHILD(node, right);
462 
463 	AVL_SETBALANCE(gchild, 0);
464 	AVL_SETPARENT(gchild, parent);
465 	AVL_SETCHILD(gchild, which_child);
466 	if (parent != NULL)
467 		parent->avl_child[which_child] = gchild;
468 	else
469 		tree->avl_root = gchild;
470 
471 	return (1);	/* the new tree is always shorter */
472 }
473 
474 
475 /*
476  * Insert a new node into an AVL tree at the specified (from avl_find()) place.
477  *
478  * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
479  * searches out to the leaf positions.  The avl_index_t indicates the node
480  * which will be the parent of the new node.
481  *
482  * After the node is inserted, a single rotation further up the tree may
483  * be necessary to maintain an acceptable AVL balance.
484  */
485 void
486 avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
487 {
488 	avl_node_t *node;
489 	avl_node_t *parent = AVL_INDEX2NODE(where);
490 	int old_balance;
491 	int new_balance;
492 	int which_child = AVL_INDEX2CHILD(where);
493 	size_t off = tree->avl_offset;
494 
495 #ifdef _LP64
496 	ASSERT(((uintptr_t)new_data & 0x7) == 0);
497 #endif
498 
499 	node = AVL_DATA2NODE(new_data, off);
500 
501 	/*
502 	 * First, add the node to the tree at the indicated position.
503 	 */
504 	++tree->avl_numnodes;
505 
506 	node->avl_child[0] = NULL;
507 	node->avl_child[1] = NULL;
508 
509 	AVL_SETCHILD(node, which_child);
510 	AVL_SETBALANCE(node, 0);
511 	AVL_SETPARENT(node, parent);
512 	if (parent != NULL) {
513 		ASSERT(parent->avl_child[which_child] == NULL);
514 		parent->avl_child[which_child] = node;
515 	} else {
516 		ASSERT(tree->avl_root == NULL);
517 		tree->avl_root = node;
518 	}
519 	/*
520 	 * Now, back up the tree modifying the balance of all nodes above the
521 	 * insertion point. If we get to a highly unbalanced ancestor, we
522 	 * need to do a rotation.  If we back out of the tree we are done.
523 	 * If we brought any subtree into perfect balance (0), we are also done.
524 	 */
525 	for (;;) {
526 		node = parent;
527 		if (node == NULL)
528 			return;
529 
530 		/*
531 		 * Compute the new balance
532 		 */
533 		old_balance = AVL_XBALANCE(node);
534 		new_balance = old_balance + avl_child2balance[which_child];
535 
536 		/*
537 		 * If we introduced equal balance, then we are done immediately
538 		 */
539 		if (new_balance == 0) {
540 			AVL_SETBALANCE(node, 0);
541 			return;
542 		}
543 
544 		/*
545 		 * If both old and new are not zero we went
546 		 * from -1 to -2 balance, do a rotation.
547 		 */
548 		if (old_balance != 0)
549 			break;
550 
551 		AVL_SETBALANCE(node, new_balance);
552 		parent = AVL_XPARENT(node);
553 		which_child = AVL_XCHILD(node);
554 	}
555 
556 	/*
557 	 * perform a rotation to fix the tree and return
558 	 */
559 	(void) avl_rotation(tree, node, new_balance);
560 }
561 
562 /*
563  * Insert "new_data" in "tree" in the given "direction" either after or
564  * before (AVL_AFTER, AVL_BEFORE) the data "here".
565  *
566  * Insertions can only be done at empty leaf points in the tree, therefore
567  * if the given child of the node is already present we move to either
568  * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
569  * every other node in the tree is a leaf, this always works.
570  *
571  * To help developers using this interface, we assert that the new node
572  * is correctly ordered at every step of the way in DEBUG kernels.
573  */
574 void
575 avl_insert_here(
576 	avl_tree_t *tree,
577 	void *new_data,
578 	void *here,
579 	int direction)
580 {
581 	avl_node_t *node;
582 	int child = direction;	/* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
583 #ifdef ZFS_DEBUG
584 	int diff;
585 #endif
586 
587 	ASSERT(tree != NULL);
588 	ASSERT(new_data != NULL);
589 	ASSERT(here != NULL);
590 	ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
591 
592 	/*
593 	 * If corresponding child of node is not NULL, go to the neighboring
594 	 * node and reverse the insertion direction.
595 	 */
596 	node = AVL_DATA2NODE(here, tree->avl_offset);
597 
598 #ifdef ZFS_DEBUG
599 	diff = tree->avl_compar(new_data, here);
600 	ASSERT(-1 <= diff && diff <= 1);
601 	ASSERT(diff != 0);
602 	ASSERT(diff > 0 ? child == 1 : child == 0);
603 #endif
604 
605 	if (node->avl_child[child] != NULL) {
606 		node = node->avl_child[child];
607 		child = 1 - child;
608 		while (node->avl_child[child] != NULL) {
609 #ifdef ZFS_DEBUG
610 			diff = tree->avl_compar(new_data,
611 			    AVL_NODE2DATA(node, tree->avl_offset));
612 			ASSERT(-1 <= diff && diff <= 1);
613 			ASSERT(diff != 0);
614 			ASSERT(diff > 0 ? child == 1 : child == 0);
615 #endif
616 			node = node->avl_child[child];
617 		}
618 #ifdef ZFS_DEBUG
619 		diff = tree->avl_compar(new_data,
620 		    AVL_NODE2DATA(node, tree->avl_offset));
621 		ASSERT(-1 <= diff && diff <= 1);
622 		ASSERT(diff != 0);
623 		ASSERT(diff > 0 ? child == 1 : child == 0);
624 #endif
625 	}
626 	ASSERT(node->avl_child[child] == NULL);
627 
628 	avl_insert(tree, new_data, AVL_MKINDEX(node, child));
629 }
630 
631 /*
632  * Add a new node to an AVL tree.  Strictly enforce that no duplicates can
633  * be added to the tree with a VERIFY which is enabled for non-DEBUG builds.
634  */
635 void
636 avl_add(avl_tree_t *tree, void *new_node)
637 {
638 	avl_index_t where = 0;
639 
640 	VERIFY(avl_find(tree, new_node, &where) == NULL);
641 
642 	avl_insert(tree, new_node, where);
643 }
644 
645 /*
646  * Delete a node from the AVL tree.  Deletion is similar to insertion, but
647  * with 2 complications.
648  *
649  * First, we may be deleting an interior node. Consider the following subtree:
650  *
651  *     d           c            c
652  *    / \         / \          / \
653  *   b   e       b   e        b   e
654  *  / \	        / \          /
655  * a   c       a            a
656  *
657  * When we are deleting node (d), we find and bring up an adjacent valued leaf
658  * node, say (c), to take the interior node's place. In the code this is
659  * handled by temporarily swapping (d) and (c) in the tree and then using
660  * common code to delete (d) from the leaf position.
661  *
662  * Secondly, an interior deletion from a deep tree may require more than one
663  * rotation to fix the balance. This is handled by moving up the tree through
664  * parents and applying rotations as needed. The return value from
665  * avl_rotation() is used to detect when a subtree did not change overall
666  * height due to a rotation.
667  */
668 void
669 avl_remove(avl_tree_t *tree, void *data)
670 {
671 	avl_node_t *delete;
672 	avl_node_t *parent;
673 	avl_node_t *node;
674 	avl_node_t tmp;
675 	int old_balance;
676 	int new_balance;
677 	int left;
678 	int right;
679 	int which_child;
680 	size_t off = tree->avl_offset;
681 
682 	delete = AVL_DATA2NODE(data, off);
683 
684 	/*
685 	 * Deletion is easiest with a node that has at most 1 child.
686 	 * We swap a node with 2 children with a sequentially valued
687 	 * neighbor node. That node will have at most 1 child. Note this
688 	 * has no effect on the ordering of the remaining nodes.
689 	 *
690 	 * As an optimization, we choose the greater neighbor if the tree
691 	 * is right heavy, otherwise the left neighbor. This reduces the
692 	 * number of rotations needed.
693 	 */
694 	if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
695 
696 		/*
697 		 * choose node to swap from whichever side is taller
698 		 */
699 		old_balance = AVL_XBALANCE(delete);
700 		left = avl_balance2child[old_balance + 1];
701 		right = 1 - left;
702 
703 		/*
704 		 * get to the previous value'd node
705 		 * (down 1 left, as far as possible right)
706 		 */
707 		for (node = delete->avl_child[left];
708 		    node->avl_child[right] != NULL;
709 		    node = node->avl_child[right])
710 			;
711 
712 		/*
713 		 * create a temp placeholder for 'node'
714 		 * move 'node' to delete's spot in the tree
715 		 */
716 		tmp = *node;
717 
718 		*node = *delete;
719 		if (node->avl_child[left] == node)
720 			node->avl_child[left] = &tmp;
721 
722 		parent = AVL_XPARENT(node);
723 		if (parent != NULL)
724 			parent->avl_child[AVL_XCHILD(node)] = node;
725 		else
726 			tree->avl_root = node;
727 		AVL_SETPARENT(node->avl_child[left], node);
728 		AVL_SETPARENT(node->avl_child[right], node);
729 
730 		/*
731 		 * Put tmp where node used to be (just temporary).
732 		 * It always has a parent and at most 1 child.
733 		 */
734 		delete = &tmp;
735 		parent = AVL_XPARENT(delete);
736 		parent->avl_child[AVL_XCHILD(delete)] = delete;
737 		which_child = (delete->avl_child[1] != 0);
738 		if (delete->avl_child[which_child] != NULL)
739 			AVL_SETPARENT(delete->avl_child[which_child], delete);
740 	}
741 
742 
743 	/*
744 	 * Here we know "delete" is at least partially a leaf node. It can
745 	 * be easily removed from the tree.
746 	 */
747 	ASSERT(tree->avl_numnodes > 0);
748 	--tree->avl_numnodes;
749 	parent = AVL_XPARENT(delete);
750 	which_child = AVL_XCHILD(delete);
751 	if (delete->avl_child[0] != NULL)
752 		node = delete->avl_child[0];
753 	else
754 		node = delete->avl_child[1];
755 
756 	/*
757 	 * Connect parent directly to node (leaving out delete).
758 	 */
759 	if (node != NULL) {
760 		AVL_SETPARENT(node, parent);
761 		AVL_SETCHILD(node, which_child);
762 	}
763 	if (parent == NULL) {
764 		tree->avl_root = node;
765 		return;
766 	}
767 	parent->avl_child[which_child] = node;
768 
769 
770 	/*
771 	 * Since the subtree is now shorter, begin adjusting parent balances
772 	 * and performing any needed rotations.
773 	 */
774 	do {
775 
776 		/*
777 		 * Move up the tree and adjust the balance
778 		 *
779 		 * Capture the parent and which_child values for the next
780 		 * iteration before any rotations occur.
781 		 */
782 		node = parent;
783 		old_balance = AVL_XBALANCE(node);
784 		new_balance = old_balance - avl_child2balance[which_child];
785 		parent = AVL_XPARENT(node);
786 		which_child = AVL_XCHILD(node);
787 
788 		/*
789 		 * If a node was in perfect balance but isn't anymore then
790 		 * we can stop, since the height didn't change above this point
791 		 * due to a deletion.
792 		 */
793 		if (old_balance == 0) {
794 			AVL_SETBALANCE(node, new_balance);
795 			break;
796 		}
797 
798 		/*
799 		 * If the new balance is zero, we don't need to rotate
800 		 * else
801 		 * need a rotation to fix the balance.
802 		 * If the rotation doesn't change the height
803 		 * of the sub-tree we have finished adjusting.
804 		 */
805 		if (new_balance == 0)
806 			AVL_SETBALANCE(node, new_balance);
807 		else if (!avl_rotation(tree, node, new_balance))
808 			break;
809 	} while (parent != NULL);
810 }
811 
/*
 * Take "obj" out of "tree" and add it back, so that it ends up at the
 * position matching its current key.
 *
 * The two calls are wrapped in do { } while (0) so the macro expands to a
 * single statement and stays correct under an unbraced if/else or loop.
 */
#define	AVL_REINSERT(tree, obj)			\
	do {					\
		avl_remove((tree), (obj));	\
		avl_add((tree), (obj));		\
	} while (0)
815 
816 boolean_t
817 avl_update_lt(avl_tree_t *t, void *obj)
818 {
819 	void *neighbor;
820 
821 	ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
822 	    (t->avl_compar(obj, neighbor) <= 0));
823 
824 	neighbor = AVL_PREV(t, obj);
825 	if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
826 		AVL_REINSERT(t, obj);
827 		return (B_TRUE);
828 	}
829 
830 	return (B_FALSE);
831 }
832 
833 boolean_t
834 avl_update_gt(avl_tree_t *t, void *obj)
835 {
836 	void *neighbor;
837 
838 	ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
839 	    (t->avl_compar(obj, neighbor) >= 0));
840 
841 	neighbor = AVL_NEXT(t, obj);
842 	if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
843 		AVL_REINSERT(t, obj);
844 		return (B_TRUE);
845 	}
846 
847 	return (B_FALSE);
848 }
849 
850 boolean_t
851 avl_update(avl_tree_t *t, void *obj)
852 {
853 	void *neighbor;
854 
855 	neighbor = AVL_PREV(t, obj);
856 	if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
857 		AVL_REINSERT(t, obj);
858 		return (B_TRUE);
859 	}
860 
861 	neighbor = AVL_NEXT(t, obj);
862 	if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
863 		AVL_REINSERT(t, obj);
864 		return (B_TRUE);
865 	}
866 
867 	return (B_FALSE);
868 }
869 
870 void
871 avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
872 {
873 	avl_node_t *temp_node;
874 	ulong_t temp_numnodes;
875 
876 	ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
877 	ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
878 
879 	temp_node = tree1->avl_root;
880 	temp_numnodes = tree1->avl_numnodes;
881 	tree1->avl_root = tree2->avl_root;
882 	tree1->avl_numnodes = tree2->avl_numnodes;
883 	tree2->avl_root = temp_node;
884 	tree2->avl_numnodes = temp_numnodes;
885 }
886 
887 /*
888  * initialize a new AVL tree
889  */
890 void
891 avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
892     size_t size, size_t offset)
893 {
894 	ASSERT(tree);
895 	ASSERT(compar);
896 	ASSERT(size > 0);
897 	ASSERT(size >= offset + sizeof (avl_node_t));
898 #ifdef _LP64
899 	ASSERT((offset & 0x7) == 0);
900 #endif
901 
902 	tree->avl_compar = compar;
903 	tree->avl_root = NULL;
904 	tree->avl_numnodes = 0;
905 	tree->avl_offset = offset;
906 }
907 
/*
 * Delete a tree.
 *
 * Nothing is released here; the routine only asserts that the tree has
 * already been emptied (no root and a node count of zero).
 */
void
avl_destroy(avl_tree_t *tree)
{
	ASSERT(tree);
	ASSERT(tree->avl_numnodes == 0);
	ASSERT(tree->avl_root == NULL);
}
918 
919 
920 /*
921  * Return the number of nodes in an AVL tree.
922  */
923 ulong_t
924 avl_numnodes(avl_tree_t *tree)
925 {
926 	ASSERT(tree);
927 	return (tree->avl_numnodes);
928 }
929 
930 boolean_t
931 avl_is_empty(avl_tree_t *tree)
932 {
933 	ASSERT(tree);
934 	return (tree->avl_numnodes == 0);
935 }
936 
937 #define	CHILDBIT	(1L)
938 
939 /*
940  * Post-order tree walk used to visit all tree nodes and destroy the tree
941  * in post order. This is used for removing all the nodes from a tree without
942  * paying any cost for rebalancing it.
943  *
944  * example:
945  *
946  *	void *cookie = NULL;
947  *	my_data_t *node;
948  *
949  *	while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
950  *		free(node);
951  *	avl_destroy(tree);
952  *
953  * The cookie is really an avl_node_t to the current node's parent and
954  * an indication of which child you looked at last.
955  *
956  * On input, a cookie value of CHILDBIT indicates the tree is done.
957  */
958 void *
959 avl_destroy_nodes(avl_tree_t *tree, void **cookie)
960 {
961 	avl_node_t	*node;
962 	avl_node_t	*parent;
963 	int		child;
964 	void		*first;
965 	size_t		off = tree->avl_offset;
966 
967 	/*
968 	 * Initial calls go to the first node or it's right descendant.
969 	 */
970 	if (*cookie == NULL) {
971 		first = avl_first(tree);
972 
973 		/*
974 		 * deal with an empty tree
975 		 */
976 		if (first == NULL) {
977 			*cookie = (void *)CHILDBIT;
978 			return (NULL);
979 		}
980 
981 		node = AVL_DATA2NODE(first, off);
982 		parent = AVL_XPARENT(node);
983 		goto check_right_side;
984 	}
985 
986 	/*
987 	 * If there is no parent to return to we are done.
988 	 */
989 	parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
990 	if (parent == NULL) {
991 		if (tree->avl_root != NULL) {
992 			ASSERT(tree->avl_numnodes == 1);
993 			tree->avl_root = NULL;
994 			tree->avl_numnodes = 0;
995 		}
996 		return (NULL);
997 	}
998 
999 	/*
1000 	 * Remove the child pointer we just visited from the parent and tree.
1001 	 */
1002 	child = (uintptr_t)(*cookie) & CHILDBIT;
1003 	parent->avl_child[child] = NULL;
1004 	ASSERT(tree->avl_numnodes > 1);
1005 	--tree->avl_numnodes;
1006 
1007 	/*
1008 	 * If we just removed a right child or there isn't one, go up to parent.
1009 	 */
1010 	if (child == 1 || parent->avl_child[1] == NULL) {
1011 		node = parent;
1012 		parent = AVL_XPARENT(parent);
1013 		goto done;
1014 	}
1015 
1016 	/*
1017 	 * Do parent's right child, then leftmost descendent.
1018 	 */
1019 	node = parent->avl_child[1];
1020 	while (node->avl_child[0] != NULL) {
1021 		parent = node;
1022 		node = node->avl_child[0];
1023 	}
1024 
1025 	/*
1026 	 * If here, we moved to a left child. It may have one
1027 	 * child on the right (when balance == +1).
1028 	 */
1029 check_right_side:
1030 	if (node->avl_child[1] != NULL) {
1031 		ASSERT(AVL_XBALANCE(node) == 1);
1032 		parent = node;
1033 		node = node->avl_child[1];
1034 		ASSERT(node->avl_child[0] == NULL &&
1035 		    node->avl_child[1] == NULL);
1036 	} else {
1037 		ASSERT(AVL_XBALANCE(node) <= 0);
1038 	}
1039 
1040 done:
1041 	if (parent == NULL) {
1042 		*cookie = (void *)CHILDBIT;
1043 		ASSERT(node == tree->avl_root);
1044 	} else {
1045 		*cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
1046 	}
1047 
1048 	return (AVL_NODE2DATA(node, off));
1049 }
1050 
#if defined(_KERNEL)

/*
 * Module load hook (kernel builds only).  There is nothing to set up, so
 * it always returns 0.
 */
static int __init
avl_init(void)
{
	return (0);
}

/*
 * Module unload hook (kernel builds only).  There is nothing to tear down.
 */
static void __exit
avl_fini(void)
{
}

/* register the load/unload hooks defined above */
module_init(avl_init);
module_exit(avl_fini);
#endif
1067 
/*
 * Module metadata.  The ZFS_META_* values are defined outside this file
 * (presumably by the build's generated metadata -- confirm).
 */
ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation");
ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);

/*
 * Public AVL entry points defined in this file, exported for use outside
 * this module.
 */
EXPORT_SYMBOL(avl_create);
EXPORT_SYMBOL(avl_find);
EXPORT_SYMBOL(avl_insert);
EXPORT_SYMBOL(avl_insert_here);
EXPORT_SYMBOL(avl_walk);
EXPORT_SYMBOL(avl_first);
EXPORT_SYMBOL(avl_last);
EXPORT_SYMBOL(avl_nearest);
EXPORT_SYMBOL(avl_add);
EXPORT_SYMBOL(avl_swap);
EXPORT_SYMBOL(avl_is_empty);
EXPORT_SYMBOL(avl_remove);
EXPORT_SYMBOL(avl_numnodes);
EXPORT_SYMBOL(avl_destroy_nodes);
EXPORT_SYMBOL(avl_destroy);
EXPORT_SYMBOL(avl_update_lt);
EXPORT_SYMBOL(avl_update_gt);
EXPORT_SYMBOL(avl_update);
1091