xref: /titanic_44/usr/src/common/net/patricia/radix.c (revision 8c97a06b88a7f040b7d72941ca32323bb75cfec7)
1 /*
2  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  *
5  * Copyright (c) 1988, 1989, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)radix.c	8.5 (Berkeley) 5/19/95
33  * $FreeBSD: /repoman/r/ncvs/src/sys/net/radix.c,v 1.36.2.1 2005/01/31 23:26:23
34  * imp Exp $
35  */
36 
37 
38 /*
39  * Routines to build and maintain radix trees for routing lookups.
40  */
41 #include <sys/types.h>
42 
43 #ifndef _RADIX_H_
44 #include <sys/param.h>
45 #ifdef	_KERNEL
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/systm.h>
49 #include <sys/cmn_err.h>
50 #else
51 #include <assert.h>
52 #define	ASSERT assert
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <syslog.h>
56 #include <strings.h>
57 #endif	/* _KERNEL */
58 #include <net/radix.h>
59 #endif
60 
61 #ifndef	_KERNEL
62 void
63 panic(const char *str)
64 {
65 	fprintf(stderr, "Panic - %s\n", str);
66 	abort();
67 }
68 #endif	/* _KERNEL */
69 
70 static int	rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
71 static int	rn_walktree_mt(struct radix_node_head *, walktree_f_t *,
72     void *, lockf_t, lockf_t);
73 static struct radix_node
74 	*rn_insert(void *, struct radix_node_head *, int *,
75 	    struct radix_node [2]),
76 	*rn_newpair(void *, int, struct radix_node[2]),
77 	*rn_search(void *, struct radix_node *),
78 	*rn_search_m(void *, struct radix_node *, void *),
79 	*rn_lookup(void *, void *, struct radix_node_head *),
80 	*rn_match(void *, struct radix_node_head *),
81 	*rn_match_args(void *, struct radix_node_head *, match_leaf_t *,
82 	    void *),
83 	*rn_addmask(void *, int, int),
84 	*rn_addroute(void *, void *, struct radix_node_head *,
85 	    struct radix_node [2]),
86 	*rn_delete(void *, void *, struct radix_node_head *);
87 static	boolean_t rn_refines(void *, void *);
88 
89 /*
90  * IPF also uses PATRICIA tree to manage ippools. IPF stores its own structure
91  * addrfamily_t. sizeof (addrfamily_t) == 24.
92  */
93 #define	MAX_KEYLEN	24
94 static int	max_keylen = MAX_KEYLEN;
95 
96 #ifdef	_KERNEL
97 static struct kmem_cache *radix_mask_cache; /* for rn_mkfreelist */
98 static struct kmem_cache *radix_node_cache;
99 #else
100 static char *radix_mask_cache, *radix_node_cache; /* dummy vars. never inited */
101 #endif	/* _KERNEL */
102 
103 static struct radix_mask *rn_mkfreelist;
104 static struct radix_node_head *mask_rnhead;
105 /*
106  * Work area -- the following point to 2 buffers of size max_keylen,
107  * allocated in this order in a block of memory malloc'ed by rn_init.
108  * A third buffer of size MAX_KEYLEN is allocated from the stack.
109  */
110 static char *rn_zeros, *rn_ones;
111 
112 #define	MKGet(m)  R_Malloc(m, radix_mask_cache, sizeof (struct radix_mask))
113 #define	MKFree(m) Free(m, radix_mask_cache)
114 #define	rn_masktop (mask_rnhead->rnh_treetop)
115 
116 static boolean_t	rn_lexobetter(void *m_arg, void *n_arg);
117 static struct radix_mask *
118 		rn_new_radix_mask(struct radix_node *tt,
119 		    struct radix_mask *next);
120 static boolean_t
121 		rn_satisfies_leaf(char *trial, struct radix_node *leaf,
122 		    int skip, match_leaf_t *rn_leaf_fn, void *rn_leaf_arg);
123 
124 #define	RN_MATCHF(rn, f, arg)	(f == NULL || (*f)((rn), arg))
125 
126 /*
127  * The data structure for the keys is a radix tree with one way
128  * branching removed.  The index rn_bit at an internal node n represents a bit
129  * position to be tested.  The tree is arranged so that all descendants
130  * of a node n have keys whose bits all agree up to position rn_bit - 1.
131  * (We say the index of n is rn_bit.)
132  *
133  * There is at least one descendant which has a one bit at position rn_bit,
134  * and at least one with a zero there.
135  *
136  * A route is determined by a pair of key and mask.  We require that the
137  * bit-wise logical and of the key and mask to be the key.
138  * We define the index of a route associated with the mask to be
139  * the first bit number in the mask where 0 occurs (with bit number 0
140  * representing the highest order bit).
141  *
142  * We say a mask is normal if every bit is 0, past the index of the mask.
143  * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
144  * and m is a normal mask, then the route applies to every descendant of n.
145  * If the index(m) < rn_bit, this implies the trailing last few bits of k
146  * before bit b are all 0, (and hence consequently true of every descendant
147  * of n), so the route applies to all descendants of the node as well.
148  *
149  * Similar logic shows that a non-normal mask m such that
150  * index(m) <= index(n) could potentially apply to many children of n.
151  * Thus, for each non-host route, we attach its mask to a list at an internal
152  * node as high in the tree as we can go.
153  *
154  * The present version of the code makes use of normal routes in short-
155  * circuiting an explict mask and compare operation when testing whether
156  * a key satisfies a normal route, and also in remembering the unique leaf
157  * that governs a subtree.
158  */
159 
160 /*
161  * Most of the functions in this code assume that the key/mask arguments
162  * are sockaddr-like structures, where the first byte is an uchar_t
163  * indicating the size of the entire structure.
164  *
165  * To make the assumption more explicit, we use the LEN() macro to access
166  * this field. It is safe to pass an expression with side effects
167  * to LEN() as the argument is evaluated only once.
168  */
169 #define	LEN(x) (*(const uchar_t *)(x))
170 
171 
172 /*
173  * Search a node in the tree matching the key.
174  */
175 static struct radix_node *
176 rn_search(v_arg, head)
177 	void *v_arg;
178 	struct radix_node *head;
179 {
180 	struct radix_node *x;
181 	caddr_t v;
182 
183 	for (x = head, v = v_arg; x->rn_bit >= 0; ) {
184 		if (x->rn_bmask & v[x->rn_offset])
185 			x = x->rn_right;
186 		else
187 			x = x->rn_left;
188 	}
189 	return (x);
190 }
191 
192 /*
193  * Same as above, but with an additional mask.
194  */
195 static struct radix_node *
196 rn_search_m(v_arg, head, m_arg)
197 	struct radix_node *head;
198 	void *v_arg, *m_arg;
199 {
200 	struct radix_node *x;
201 	caddr_t v = v_arg, m = m_arg;
202 
203 	for (x = head; x->rn_bit >= 0; ) {
204 		if ((x->rn_bmask & m[x->rn_offset]) &&
205 		    (x->rn_bmask & v[x->rn_offset]))
206 			x = x->rn_right;
207 		else
208 			x = x->rn_left;
209 	}
210 	return (x);
211 }
212 
213 /*
214  * Returns true if there are no bits set in n_arg that are zero in
215  * m_arg and the masks aren't equal.  In other words, it returns true
216  * when m_arg is a finer-granularity netmask -- it represents a subset
217  * of the destinations implied by n_arg.
218  */
219 static boolean_t
220 rn_refines(m_arg, n_arg)
221 	void *m_arg, *n_arg;
222 {
223 	caddr_t m = m_arg, n = n_arg;
224 	caddr_t lim = n + LEN(n), lim2 = lim;
225 	int longer = LEN(n++) - (int)LEN(m++);
226 	boolean_t masks_are_equal = B_TRUE;
227 
228 	if (longer > 0)
229 		lim -= longer;
230 	while (n < lim) {
231 		if (*n & ~(*m))
232 			return (0);
233 		if (*n++ != *m++)
234 			masks_are_equal = B_FALSE;
235 	}
236 	while (n < lim2)
237 		if (*n++)
238 			return (B_FALSE);
239 	if (masks_are_equal && (longer < 0))
240 		for (lim2 = m - longer; m < lim2; )
241 			if (*m++)
242 				return (B_TRUE);
243 	return (!masks_are_equal);
244 }
245 
246 static struct radix_node *
247 rn_lookup(v_arg, m_arg, head)
248 	void *v_arg, *m_arg;
249 	struct radix_node_head *head;
250 {
251 	struct radix_node *x;
252 	caddr_t netmask = NULL;
253 
254 	if (m_arg) {
255 		x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
256 		if (x == NULL)
257 			return (NULL);
258 		netmask = x->rn_key;
259 	}
260 	x = rn_match(v_arg, head);
261 	if (x && netmask) {
262 		while (x && x->rn_mask != netmask)
263 			x = x->rn_dupedkey;
264 	}
265 	return (x);
266 }
267 
268 /*
269  * Returns true if address 'trial' has no bits differing from the
270  * leaf's key when compared under the leaf's mask.  In other words,
271  * returns true when 'trial' matches leaf.
272  * In addition, if a rn_leaf_fn is passed in, that is used to find
273  * a match on conditions defined by the caller of rn_match.  This is
274  * used by the kernel ftable to match on IRE_MATCH_* conditions.
275  */
276 static boolean_t
277 rn_satisfies_leaf(trial, leaf, skip, rn_leaf_fn, rn_leaf_arg)
278 	caddr_t trial;
279 	struct radix_node *leaf;
280 	int skip;
281 	match_leaf_t *rn_leaf_fn;
282 	void *rn_leaf_arg;
283 {
284 	char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
285 	char *cplim;
286 	int length = min(LEN(cp), LEN(cp2));
287 
288 	if (cp3 == 0)
289 		cp3 = rn_ones;
290 	else
291 		length = min(length, LEN(cp3));
292 	cplim = cp + length;
293 	cp3 += skip;
294 	cp2 += skip;
295 
296 	for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
297 		if ((*cp ^ *cp2) & *cp3)
298 			return (B_FALSE);
299 
300 	return (RN_MATCHF(leaf, rn_leaf_fn, rn_leaf_arg));
301 }
302 
303 static struct radix_node *
304 rn_match(v_arg, head)
305 	void *v_arg;
306 	struct radix_node_head *head;
307 {
308 	return (rn_match_args(v_arg, head, NULL, NULL));
309 }
310 
311 static struct radix_node *
312 rn_match_args(v_arg, head, rn_leaf_fn, rn_leaf_arg)
313 	void *v_arg;
314 	struct radix_node_head *head;
315 	match_leaf_t *rn_leaf_fn;
316 	void *rn_leaf_arg;
317 {
318 	caddr_t v = v_arg;
319 	struct radix_node *t = head->rnh_treetop, *x;
320 	caddr_t cp = v, cp2;
321 	caddr_t cplim;
322 	struct radix_node *saved_t, *top = t;
323 	int off = t->rn_offset, vlen = LEN(cp), matched_off;
324 	int test, b, rn_bit;
325 
326 	/*
327 	 * Open code rn_search(v, top) to avoid overhead of extra
328 	 * subroutine call.
329 	 */
330 	for (; t->rn_bit >= 0; ) {
331 		if (t->rn_bmask & cp[t->rn_offset])
332 			t = t->rn_right;
333 		else
334 			t = t->rn_left;
335 	}
336 	/*
337 	 * See if we match exactly as a host destination
338 	 * or at least learn how many bits match, for normal mask finesse.
339 	 *
340 	 * It doesn't hurt us to limit how many bytes to check
341 	 * to the length of the mask, since if it matches we had a genuine
342 	 * match and the leaf we have is the most specific one anyway;
343 	 * if it didn't match with a shorter length it would fail
344 	 * with a long one.  This wins big for class B&C netmasks which
345 	 * are probably the most common case...
346 	 */
347 	if (t->rn_mask)
348 		vlen = LEN(t->rn_mask);
349 	cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
350 	for (; cp < cplim; cp++, cp2++)
351 		if (*cp != *cp2)
352 			goto keydiff;
353 	/*
354 	 * This extra grot is in case we are explicitly asked
355 	 * to look up the default.  Ugh!
356 	 *
357 	 * Never return the root node itself, it seems to cause a
358 	 * lot of confusion.
359 	 */
360 	if (t->rn_flags & RNF_ROOT)
361 		t = t->rn_dupedkey;
362 	if (t == NULL || RN_MATCHF(t, rn_leaf_fn, rn_leaf_arg)) {
363 		return (t);
364 	} else {
365 		/*
366 		 * Although we found an exact match on the key, rn_leaf_fn
367 		 * is looking for some other criteria as well. Continue
368 		 * looking as if the exact match failed.
369 		 */
370 		if (t->rn_parent->rn_flags & RNF_ROOT) {
371 			/* hit the top. have to give up */
372 			return (NULL);
373 		}
374 		b = 0;
375 		goto keeplooking;
376 
377 	}
378 keydiff:
379 	test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
380 	for (b = 7; (test >>= 1) > 0; )
381 		b--;
382 keeplooking:
383 	matched_off = cp - v;
384 	b += matched_off << 3;
385 	rn_bit = -1 - b;
386 
387 	/*
388 	 * If there is a host route in a duped-key chain, it will be first.
389 	 */
390 	if ((saved_t = t)->rn_mask == 0)
391 		t = t->rn_dupedkey;
392 	for (; t != NULL; t = t->rn_dupedkey) {
393 		/*
394 		 * Even if we don't match exactly as a host,
395 		 * we may match if the leaf we wound up at is
396 		 * a route to a net.
397 		 */
398 
399 		if (t->rn_flags & RNF_NORMAL) {
400 			if ((rn_bit <= t->rn_bit) &&
401 			    RN_MATCHF(t, rn_leaf_fn, rn_leaf_arg)) {
402 				return (t);
403 			}
404 		} else if (rn_satisfies_leaf(v, t, matched_off, rn_leaf_fn,
405 		    rn_leaf_arg)) {
406 			return (t);
407 		}
408 	}
409 	t = saved_t;
410 	/* start searching up the tree */
411 	do {
412 		struct radix_mask *m;
413 
414 		t = t->rn_parent;
415 		m = t->rn_mklist;
416 		/*
417 		 * If non-contiguous masks ever become important
418 		 * we can restore the masking and open coding of
419 		 * the search and satisfaction test and put the
420 		 * calculation of "off" back before the "do".
421 		 */
422 		while (m) {
423 			if (m->rm_flags & RNF_NORMAL) {
424 				if ((rn_bit <= m->rm_bit) &&
425 				    RN_MATCHF(m->rm_leaf, rn_leaf_fn,
426 				    rn_leaf_arg)) {
427 					return (m->rm_leaf);
428 				}
429 			} else {
430 				off = min(t->rn_offset, matched_off);
431 				x = rn_search_m(v, t, m->rm_mask);
432 				while (x != NULL && x->rn_mask != m->rm_mask)
433 					x = x->rn_dupedkey;
434 				if (x && rn_satisfies_leaf(v, x, off,
435 				    rn_leaf_fn, rn_leaf_arg)) {
436 					return (x);
437 				}
438 			}
439 			m = m->rm_mklist;
440 		}
441 	} while (t != top);
442 	return (0);
443 }
444 
445 /*
446  * Whenever we add a new leaf to the tree, we also add a parent node,
447  * so we allocate them as an array of two elements: the first one must be
448  * the leaf (see RNTORT() in route.c), the second one is the parent.
449  * This routine initializes the relevant fields of the nodes, so that
450  * the leaf is the left child of the parent node, and both nodes have
451  * (almost) all all fields filled as appropriate.
452  * The function returns a pointer to the parent node.
453  */
454 
455 static struct radix_node *
456 rn_newpair(v, b, nodes)
457 	void *v;
458 	int b;
459 	struct radix_node nodes[2];
460 {
461 	struct radix_node *tt = nodes, *t = tt + 1;
462 
463 	t->rn_bit = b;
464 	t->rn_bmask = 0x80 >> (b & 7);
465 	t->rn_left = tt;
466 	t->rn_offset = b >> 3;
467 
468 	/*
469 	 * t->rn_parent, r->rn_right, tt->rn_mask, tt->rn_dupedkey
470 	 * and tt->rn_bmask must have been zeroed by caller.
471 	 */
472 	tt->rn_bit = -1;
473 	tt->rn_key = v;
474 	tt->rn_parent = t;
475 	tt->rn_flags = t->rn_flags = RNF_ACTIVE;
476 	tt->rn_mklist = t->rn_mklist = 0;
477 	return (t);
478 }
479 
480 static struct radix_node *
481 rn_insert(v_arg, head, dupentry, nodes)
482 	void *v_arg;
483 	struct radix_node_head *head;
484 	int *dupentry;
485 	struct radix_node nodes[2];
486 {
487 	caddr_t v = v_arg;
488 	struct radix_node *top = head->rnh_treetop;
489 	int head_off = top->rn_offset, vlen = (int)LEN(v);
490 	struct radix_node *t = rn_search(v_arg, top);
491 	caddr_t cp = v + head_off;
492 	int b;
493 	struct radix_node *tt;
494 
495 	/*
496 	 * Find first bit at which v and t->rn_key differ
497 	 */
498 	{
499 		caddr_t cp2 = t->rn_key + head_off;
500 		int cmp_res;
501 		caddr_t cplim = v + vlen;
502 
503 		while (cp < cplim)
504 			if (*cp2++ != *cp++)
505 				goto on1;
506 		*dupentry = 1;
507 		return (t);
508 on1:
509 		*dupentry = 0;
510 		cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
511 		for (b = (cp - v) << 3; cmp_res; b--)
512 			cmp_res >>= 1;
513 	}
514 	{
515 		struct radix_node *p, *x = top;
516 		cp = v;
517 		do {
518 			p = x;
519 			if (cp[x->rn_offset] & x->rn_bmask)
520 				x = x->rn_right;
521 			else
522 				x = x->rn_left;
523 		} while (b > (unsigned)x->rn_bit);
524 				/* x->rn_bit < b && x->rn_bit >= 0 */
525 		t = rn_newpair(v_arg, b, nodes);
526 		tt = t->rn_left;
527 		if ((cp[p->rn_offset] & p->rn_bmask) == 0)
528 			p->rn_left = t;
529 		else
530 			p->rn_right = t;
531 		x->rn_parent = t;
532 		t->rn_parent = p;
533 		if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
534 			t->rn_right = x;
535 		} else {
536 			t->rn_right = tt;
537 			t->rn_left = x;
538 		}
539 	}
540 	return (tt);
541 }
542 
543 static struct radix_node *
544 rn_addmask(n_arg, search, skip)
545 	int search, skip;
546 	void *n_arg;
547 {
548 	caddr_t netmask = (caddr_t)n_arg;
549 	struct radix_node *x;
550 	caddr_t cp, cplim;
551 	int b = 0, mlen, j;
552 	int maskduplicated, m0, isnormal;
553 	struct radix_node *saved_x;
554 	int last_zeroed = 0;
555 	char addmask_key[MAX_KEYLEN];
556 
557 	if ((mlen = LEN(netmask)) > max_keylen)
558 		mlen = max_keylen;
559 	if (skip == 0)
560 		skip = 1;
561 	if (mlen <= skip)
562 		return (mask_rnhead->rnh_nodes);
563 	if (skip > 1)
564 		bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
565 	if ((m0 = mlen) > skip)
566 		bcopy(netmask + skip, addmask_key + skip, mlen - skip);
567 	/*
568 	 * Trim trailing zeroes.
569 	 */
570 	for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0; )
571 		cp--;
572 	mlen = cp - addmask_key;
573 	if (mlen <= skip) {
574 		if (m0 >= last_zeroed)
575 			last_zeroed = mlen;
576 		return (mask_rnhead->rnh_nodes);
577 	}
578 	if (m0 < last_zeroed)
579 		bzero(addmask_key + m0, last_zeroed - m0);
580 	*addmask_key = last_zeroed = mlen;
581 	x = rn_search(addmask_key, rn_masktop);
582 	if (bcmp(addmask_key, x->rn_key, mlen) != 0)
583 		x = 0;
584 	if (x || search)
585 		return (x);
586 	R_Zalloc(x, radix_node_cache, max_keylen + 2 * sizeof (*x));
587 
588 	if ((saved_x = x) == 0)
589 		return (0);
590 	netmask = cp = (caddr_t)(x + 2);
591 	bcopy(addmask_key, cp, mlen);
592 	x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
593 	if (maskduplicated) {
594 #ifdef	_KERNEL
595 		cmn_err(CE_WARN, "rn_addmask: mask impossibly already in tree");
596 #else
597 		syslog(LOG_ERR, "rn_addmask: mask impossibly already in tree");
598 #endif	/* _KERNEL */
599 		Free(saved_x, radix_node_cache);
600 		return (x);
601 	}
602 	/*
603 	 * Calculate index of mask, and check for normalcy.
604 	 * First find the first byte with a 0 bit, then if there are
605 	 * more bits left (remember we already trimmed the trailing 0's),
606 	 * the pattern must be one of those in normal_chars[], or we have
607 	 * a non-contiguous mask.
608 	 */
609 	cplim = netmask + mlen;
610 	isnormal = 1;
611 	for (cp = netmask + skip; (cp < cplim) && *(uchar_t *)cp == 0xff; )
612 		cp++;
613 	if (cp != cplim) {
614 		static uint8_t normal_chars[] = {
615 			0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
616 
617 		for (j = 0x80; (j & *cp) != 0; j >>= 1)
618 			b++;
619 		if (*cp != normal_chars[b] || cp != (cplim - 1))
620 			isnormal = 0;
621 	}
622 	b += (cp - netmask) << 3;
623 	x->rn_bit = -1 - b;
624 	if (isnormal)
625 		x->rn_flags |= RNF_NORMAL;
626 	return (x);
627 }
628 
629 /* arbitrary ordering for non-contiguous masks */
630 static boolean_t
631 rn_lexobetter(m_arg, n_arg)
632 	void *m_arg, *n_arg;
633 {
634 	uchar_t *mp = m_arg, *np = n_arg, *lim;
635 
636 	if (LEN(mp) > LEN(np))
637 		/* not really, but need to check longer one first */
638 		return (B_TRUE);
639 	if (LEN(mp) == LEN(np))
640 		for (lim = mp + LEN(mp); mp < lim; )
641 			if (*mp++ > *np++)
642 				return (B_TRUE);
643 	return (B_FALSE);
644 }
645 
646 static struct radix_mask *
647 rn_new_radix_mask(tt, next)
648 	struct radix_node *tt;
649 	struct radix_mask *next;
650 {
651 	struct radix_mask *m;
652 
653 	MKGet(m);
654 	if (m == 0) {
655 #ifndef	_KERNEL
656 		syslog(LOG_ERR, "Mask for route not entered\n");
657 #endif	/* _KERNEL */
658 		return (0);
659 	}
660 	bzero(m, sizeof (*m));
661 	m->rm_bit = tt->rn_bit;
662 	m->rm_flags = tt->rn_flags;
663 	if (tt->rn_flags & RNF_NORMAL)
664 		m->rm_leaf = tt;
665 	else
666 		m->rm_mask = tt->rn_mask;
667 	m->rm_mklist = next;
668 	tt->rn_mklist = m;
669 	return (m);
670 }
671 
672 static struct radix_node *
673 rn_addroute(v_arg, n_arg, head, treenodes)
674 	void *v_arg, *n_arg;
675 	struct radix_node_head *head;
676 	struct radix_node treenodes[2];
677 {
678 	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
679 	struct radix_node *t, *x = 0, *tt;
680 	struct radix_node *saved_tt, *top = head->rnh_treetop;
681 	short b = 0, b_leaf = 0;
682 	int keyduplicated;
683 	caddr_t mmask;
684 	struct radix_mask *m, **mp;
685 
686 	/*
687 	 * In dealing with non-contiguous masks, there may be
688 	 * many different routes which have the same mask.
689 	 * We will find it useful to have a unique pointer to
690 	 * the mask to speed avoiding duplicate references at
691 	 * nodes and possibly save time in calculating indices.
692 	 */
693 	if (netmask)  {
694 		if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
695 			return (0);
696 		b_leaf = x->rn_bit;
697 		b = -1 - x->rn_bit;
698 		netmask = x->rn_key;
699 	}
700 	/*
701 	 * Deal with duplicated keys: attach node to previous instance
702 	 */
703 	saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
704 	if (keyduplicated) {
705 		for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
706 			if (tt->rn_mask == netmask)
707 				return (0);
708 			if (netmask == 0 ||
709 			    (tt->rn_mask &&
710 			    /* index (netmask) > node */
711 			    ((b_leaf < tt->rn_bit) ||
712 			    rn_refines(netmask, tt->rn_mask) ||
713 			    rn_lexobetter(netmask, tt->rn_mask))))
714 				break;
715 		}
716 		/*
717 		 * If the mask is not duplicated, we wouldn't
718 		 * find it among possible duplicate key entries
719 		 * anyway, so the above test doesn't hurt.
720 		 *
721 		 * We sort the masks for a duplicated key the same way as
722 		 * in a masklist -- most specific to least specific.
723 		 * This may require the unfortunate nuisance of relocating
724 		 * the head of the list.
725 		 *
726 		 * We also reverse, or doubly link the list through the
727 		 * parent pointer.
728 		 */
729 		if (tt == saved_tt) {
730 			struct	radix_node *xx = x;
731 			/* link in at head of list */
732 			(tt = treenodes)->rn_dupedkey = t;
733 			tt->rn_flags = t->rn_flags;
734 			tt->rn_parent = x = t->rn_parent;
735 			t->rn_parent = tt; /* parent */
736 			if (x->rn_left == t)
737 				x->rn_left = tt;
738 			else
739 				x->rn_right = tt;
740 			saved_tt = tt; x = xx;
741 		} else {
742 			(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
743 			t->rn_dupedkey = tt;
744 			/* Set rn_parent value for tt and tt->rn_dupedkey */
745 			tt->rn_parent = t;
746 			if (tt->rn_dupedkey)
747 				tt->rn_dupedkey->rn_parent = tt;
748 		}
749 		tt->rn_key = v;
750 		tt->rn_bit = -1;
751 		tt->rn_flags = RNF_ACTIVE;
752 	}
753 	/*
754 	 * Put mask in tree.
755 	 */
756 	if (netmask) {
757 		tt->rn_mask = netmask;
758 		tt->rn_bit = x->rn_bit;
759 		tt->rn_flags |= x->rn_flags & RNF_NORMAL;
760 	}
761 	t = saved_tt->rn_parent;
762 	if (keyduplicated)
763 		goto key_exists;
764 	b_leaf = -1 - t->rn_bit;
765 	if (t->rn_right == saved_tt)
766 		x = t->rn_left;
767 	else
768 		x = t->rn_right;
769 	/* Promote general routes from below */
770 	if (x->rn_bit < 0) {
771 	    for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
772 		if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
773 			*mp = m = rn_new_radix_mask(x, 0);
774 			if (m)
775 				mp = &m->rm_mklist;
776 		}
777 	} else if (x->rn_mklist) {
778 		/*
779 		 * Skip over masks whose index is > that of new node
780 		 */
781 		for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
782 			if (m->rm_bit >= b_leaf)
783 				break;
784 		t->rn_mklist = m; *mp = 0;
785 	}
786 key_exists:
787 	/* Add new route to highest possible ancestor's list */
788 	if ((netmask == 0) || (b > t->rn_bit))
789 		return (tt); /* can't lift at all */
790 	b_leaf = tt->rn_bit;
791 	do {
792 		x = t;
793 		t = t->rn_parent;
794 	} while (b <= t->rn_bit && x != top);
795 	/*
796 	 * Search through routes associated with node to
797 	 * insert new route according to index.
798 	 * Need same criteria as when sorting dupedkeys to avoid
799 	 * double loop on deletion.
800 	 */
801 	for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
802 		if (m->rm_bit < b_leaf)
803 			continue;
804 		if (m->rm_bit > b_leaf)
805 			break;
806 		if (m->rm_flags & RNF_NORMAL) {
807 			mmask = m->rm_leaf->rn_mask;
808 			if (tt->rn_flags & RNF_NORMAL) {
809 #ifdef	_KERNEL
810 				cmn_err(CE_WARN, "Non-unique normal route, "
811 				    "mask not entered\n");
812 #else
813 				syslog(LOG_ERR, "Non-unique normal route, "
814 				    "mask not entered\n");
815 #endif	/* _KERNEL */
816 				return (tt);
817 			}
818 		} else
819 			mmask = m->rm_mask;
820 		if (mmask == netmask) {
821 			m->rm_refs++;
822 			tt->rn_mklist = m;
823 			return (tt);
824 		}
825 		if (rn_refines(netmask, mmask) ||
826 		    rn_lexobetter(netmask, mmask))
827 			break;
828 	}
829 	*mp = rn_new_radix_mask(tt, *mp);
830 	return (tt);
831 }
832 
833 static struct radix_node *
834 rn_delete(v_arg, netmask_arg, head)
835 	void *v_arg, *netmask_arg;
836 	struct radix_node_head *head;
837 {
838 	struct radix_node *t, *p, *x, *tt;
839 	struct radix_mask *m, *saved_m, **mp;
840 	struct radix_node *dupedkey, *saved_tt, *top;
841 	caddr_t v, netmask;
842 	int b, head_off, vlen;
843 
844 	v = v_arg;
845 	netmask = netmask_arg;
846 	x = head->rnh_treetop;
847 	tt = rn_search(v, x);
848 	head_off = x->rn_offset;
849 	vlen =  LEN(v);
850 	saved_tt = tt;
851 	top = x;
852 	if (tt == 0 ||
853 	    bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
854 		return (0);
855 	/*
856 	 * Delete our route from mask lists.
857 	 */
858 	if (netmask) {
859 		if ((x = rn_addmask(netmask, 1, head_off)) == 0)
860 			return (0);
861 		netmask = x->rn_key;
862 		while (tt->rn_mask != netmask)
863 			if ((tt = tt->rn_dupedkey) == 0)
864 				return (0);
865 	}
866 	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
867 		goto on1;
868 	if (tt->rn_flags & RNF_NORMAL) {
869 		if (m->rm_leaf != tt || m->rm_refs > 0) {
870 #ifdef	_KERNEL
871 			cmn_err(CE_WARN,
872 			    "rn_delete: inconsistent annotation\n");
873 #else
874 			syslog(LOG_ERR, "rn_delete: inconsistent annotation\n");
875 #endif	/* _KERNEL */
876 			return (0);  /* dangling ref could cause disaster */
877 		}
878 	} else {
879 		if (m->rm_mask != tt->rn_mask) {
880 #ifdef	_KERNEL
881 			cmn_err(CE_WARN,
882 			    "rn_delete: inconsistent annotation 2\n");
883 #else
884 			syslog(LOG_ERR,
885 			    "rn_delete: inconsistent annotation 2\n");
886 #endif	/* _KERNEL */
887 			goto on1;
888 		}
889 		if (--m->rm_refs >= 0)
890 			goto on1;
891 	}
892 	b = -1 - tt->rn_bit;
893 	t = saved_tt->rn_parent;
894 	if (b > t->rn_bit)
895 		goto on1; /* Wasn't lifted at all */
896 	do {
897 		x = t;
898 		t = t->rn_parent;
899 	} while (b <= t->rn_bit && x != top);
900 	for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
901 		if (m == saved_m) {
902 			*mp = m->rm_mklist;
903 			MKFree(m);
904 			break;
905 		}
906 	if (m == 0) {
907 #ifdef	_KERNEL
908 		cmn_err(CE_WARN, "rn_delete: couldn't find our annotation\n");
909 #else
910 		syslog(LOG_ERR, "rn_delete: couldn't find our annotation\n");
911 #endif	/* _KERNEL */
912 		if (tt->rn_flags & RNF_NORMAL)
913 			return (0); /* Dangling ref to us */
914 	}
915 on1:
916 	/*
917 	 * Eliminate us from tree
918 	 */
919 	if (tt->rn_flags & RNF_ROOT)
920 		return (0);
921 	t = tt->rn_parent;
922 	dupedkey = saved_tt->rn_dupedkey;
923 	if (dupedkey) {
924 		/*
925 		 * Here, tt is the deletion target and
926 		 * saved_tt is the head of the dupekey chain.
927 		 */
928 		if (tt == saved_tt) {
929 			/* remove from head of chain */
930 			x = dupedkey; x->rn_parent = t;
931 			if (t->rn_left == tt)
932 				t->rn_left = x;
933 			else
934 				t->rn_right = x;
935 		} else {
936 			/* find node in front of tt on the chain */
937 			for (x = p = saved_tt; p && p->rn_dupedkey != tt; )
938 				p = p->rn_dupedkey;
939 			if (p) {
940 				p->rn_dupedkey = tt->rn_dupedkey;
941 				if (tt->rn_dupedkey)		/* parent */
942 					tt->rn_dupedkey->rn_parent = p;
943 								/* parent */
944 			} else
945 #ifdef	_KERNEL
946 				cmn_err(CE_WARN,
947 				    "rn_delete: couldn't find us\n");
948 #else
949 				syslog(LOG_ERR,
950 				    "rn_delete: couldn't find us\n");
951 #endif	/* _KERNEL */
952 		}
953 		t = tt + 1;
954 		if (t->rn_flags & RNF_ACTIVE) {
955 			*++x = *t;
956 			p = t->rn_parent;
957 			if (p->rn_left == t)
958 				p->rn_left = x;
959 			else
960 				p->rn_right = x;
961 			x->rn_left->rn_parent = x;
962 			x->rn_right->rn_parent = x;
963 		}
964 		goto out;
965 	}
966 	if (t->rn_left == tt)
967 		x = t->rn_right;
968 	else
969 		x = t->rn_left;
970 	p = t->rn_parent;
971 	if (p->rn_right == t)
972 		p->rn_right = x;
973 	else
974 		p->rn_left = x;
975 	x->rn_parent = p;
976 	/*
977 	 * Demote routes attached to us.
978 	 */
979 	if (t->rn_mklist) {
980 		if (x->rn_bit >= 0) {
981 			for (mp = &x->rn_mklist; (m = *mp) != NULL; )
982 				mp = &m->rm_mklist;
983 			*mp = t->rn_mklist;
984 		} else {
985 			/*
986 			 * If there are any key,mask pairs in a sibling
987 			 * duped-key chain, some subset will appear sorted
988 			 * in the same order attached to our mklist
989 			 */
990 			for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
991 				if (m == x->rn_mklist) {
992 					struct radix_mask *mm = m->rm_mklist;
993 					x->rn_mklist = 0;
994 					if (--(m->rm_refs) < 0)
995 						MKFree(m);
996 					m = mm;
997 				}
998 			if (m)
999 #ifdef	_KERNEL
1000 				cmn_err(CE_WARN,
1001 				    "rn_delete: Orphaned Mask %p at %p\n",
1002 				    (void *)m, (void *)x);
1003 #else
1004 				syslog(LOG_ERR,
1005 				    "rn_delete: Orphaned Mask %p at %p\n",
1006 				    (void *)m, (void *)x);
1007 #endif	/* _KERNEL */
1008 		}
1009 	}
1010 	/*
1011 	 * We may be holding an active internal node in the tree.
1012 	 */
1013 	x = tt + 1;
1014 	if (t != x) {
1015 		*t = *x;
1016 		t->rn_left->rn_parent = t;
1017 		t->rn_right->rn_parent = t;
1018 		p = x->rn_parent;
1019 		if (p->rn_left == x)
1020 			p->rn_left = t;
1021 		else
1022 			p->rn_right = t;
1023 	}
1024 out:
1025 	tt->rn_flags &= ~RNF_ACTIVE;
1026 	tt[1].rn_flags &= ~RNF_ACTIVE;
1027 	return (tt);
1028 }
1029 
1030 /*
1031  * Walk the radix tree; For the kernel routing table, we hold additional
1032  * refs on the ire_bucket to ensure that the walk function f() does not
1033  * run into trashed memory. The kernel routing table is identified by
1034  * a rnh_treetop that has RNF_SUNW_FT set in the rn_flags.
1035  * Note that all refs takein in rn_walktree are released before it returns,
1036  * so that f() will need to take any additional references on memory
1037  * to be passed back to the caller of rn_walktree.
1038  */
1039 static int
1040 rn_walktree(h, f, w)
1041 	struct radix_node_head *h;
1042 	walktree_f_t *f;
1043 	void *w;
1044 {
1045 	return (rn_walktree_mt(h, f, w, NULL, NULL));
1046 }
1047 static int
1048 rn_walktree_mt(h, f, w, lockf, unlockf)
1049 	struct radix_node_head *h;
1050 	walktree_f_t *f;
1051 	void *w;
1052 	lockf_t lockf, unlockf;
1053 {
1054 	int error;
1055 	struct radix_node *base, *next;
1056 	struct radix_node *rn = h->rnh_treetop;
1057 	boolean_t is_mt = B_FALSE;
1058 
1059 	if (lockf != NULL) {
1060 		ASSERT(unlockf != NULL);
1061 		is_mt = B_TRUE;
1062 	}
1063 	/*
1064 	 * This gets complicated because we may delete the node
1065 	 * while applying the function f to it, so we need to calculate
1066 	 * the successor node in advance.
1067 	 */
1068 	RADIX_NODE_HEAD_RLOCK(h);
1069 	/* First time through node, go left */
1070 	while (rn->rn_bit >= 0) {
1071 		rn = rn->rn_left;
1072 	}
1073 
1074 	if (is_mt)
1075 		(*lockf)(rn);
1076 
1077 	for (;;) {
1078 		base = rn;
1079 		/* If at right child go back up, otherwise, go right */
1080 		while (rn->rn_parent->rn_right == rn &&
1081 		    (rn->rn_flags & RNF_ROOT) == 0) {
1082 			rn = rn->rn_parent;
1083 		}
1084 		/* Find the next *leaf* since next node might vanish, too */
1085 		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0; ) {
1086 			rn = rn->rn_left;
1087 		}
1088 		next = rn;
1089 
1090 		if (is_mt && next != NULL)
1091 			(*lockf)(next);
1092 
1093 		/* Process leaves */
1094 		while ((rn = base) != NULL) {
1095 			base = rn->rn_dupedkey;
1096 
1097 			if (is_mt && base != NULL)
1098 				(*lockf)(base);
1099 
1100 			RADIX_NODE_HEAD_UNLOCK(h);
1101 			if (!(rn->rn_flags & RNF_ROOT) &&
1102 			    (error = (*f)(rn, w))) {
1103 				if (is_mt) {
1104 					(*unlockf)(rn);
1105 					if (base != NULL)
1106 						(*unlockf)(base);
1107 					if (next != NULL)
1108 						(*unlockf)(next);
1109 				}
1110 				return (error);
1111 			}
1112 			if (is_mt)
1113 				(*unlockf)(rn);
1114 			RADIX_NODE_HEAD_RLOCK(h);
1115 		}
1116 		rn = next;
1117 		if (rn->rn_flags & RNF_ROOT) {
1118 			RADIX_NODE_HEAD_UNLOCK(h);
1119 			/*
1120 			 * no ref to release, since we never take a ref
1121 			 * on the root node- it can't be deleted.
1122 			 */
1123 			return (0);
1124 		}
1125 	}
1126 	/* NOTREACHED */
1127 }
1128 
1129 /*
1130  * Allocate and initialize an empty tree. This has 3 nodes, which are
1131  * part of the radix_node_head (in the order <left,root,right>) and are
1132  * marked RNF_ROOT so they cannot be freed.
1133  * The leaves have all-zero and all-one keys, with significant
1134  * bits starting at 'off'.
1135  * Return 1 on success, 0 on error.
1136  */
1137 int
1138 rn_inithead(head, off)
1139 	void **head;
1140 	int off;
1141 {
1142 	struct radix_node_head *rnh;
1143 	struct radix_node *t, *tt, *ttt;
1144 	if (*head)
1145 		return (1);
1146 	R_ZallocSleep(rnh, struct radix_node_head *, sizeof (*rnh));
1147 	if (rnh == 0)
1148 		return (0);
1149 #ifdef _KERNEL
1150 	RADIX_NODE_HEAD_LOCK_INIT(rnh);
1151 #endif
1152 	*head = rnh;
1153 	t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
1154 	ttt = rnh->rnh_nodes + 2;
1155 	t->rn_right = ttt;
1156 	t->rn_parent = t;
1157 	tt = t->rn_left;	/* ... which in turn is rnh->rnh_nodes */
1158 	tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
1159 	tt->rn_bit = -1 - off;
1160 	*ttt = *tt;
1161 	ttt->rn_key = rn_ones;
1162 	rnh->rnh_addaddr = rn_addroute;
1163 	rnh->rnh_deladdr = rn_delete;
1164 	rnh->rnh_matchaddr = rn_match;
1165 	rnh->rnh_matchaddr_args = rn_match_args;
1166 	rnh->rnh_lookup = rn_lookup;
1167 	rnh->rnh_walktree = rn_walktree;
1168 	rnh->rnh_walktree_mt = rn_walktree_mt;
1169 	rnh->rnh_walktree_from = NULL;  /* not implemented */
1170 	rnh->rnh_treetop = t;
1171 	return (1);
1172 }
1173 
1174 void
1175 rn_init()
1176 {
1177 	char *cp, *cplim;
1178 
1179 #ifdef	_KERNEL
1180 	radix_mask_cache = kmem_cache_create("radix_mask",
1181 	    sizeof (struct radix_mask), 0, NULL, NULL, NULL, NULL, NULL, 0);
1182 	radix_node_cache = kmem_cache_create("radix_node",
1183 	    max_keylen + 2 * sizeof (struct radix_node),
1184 	    0, NULL, NULL, NULL, NULL, NULL, 0);
1185 #endif /* _KERNEL */
1186 	R_ZallocSleep(rn_zeros, char *, 2 * max_keylen);
1187 
1188 	ASSERT(rn_zeros != NULL);
1189 	bzero(rn_zeros, 2 * max_keylen);
1190 	rn_ones = cp = rn_zeros + max_keylen;
1191 	cplim = rn_ones + max_keylen;
1192 	while (cp < cplim)
1193 		*cp++ = -1;
1194 	if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
1195 		panic("rn_init: could not init mask_rnhead ");
1196 }
1197 
1198 int
1199 rn_freenode(n, p)
1200 	struct radix_node *n;
1201 	void *p;
1202 {
1203 	struct	radix_node_head *rnh = p;
1204 	struct	radix_node *d;
1205 
1206 	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
1207 	if (d != NULL) {
1208 		Free(d, radix_node_cache);
1209 	}
1210 	return (0);
1211 }
1212 
1213 
1214 void
1215 rn_freehead(rnh)
1216 	struct radix_node_head *rnh;
1217 {
1218 	(void) rn_walktree(rnh, rn_freenode, rnh);
1219 
1220 	rnh->rnh_addaddr = NULL;
1221 	rnh->rnh_deladdr = NULL;
1222 	rnh->rnh_matchaddr = NULL;
1223 	rnh->rnh_lookup = NULL;
1224 	rnh->rnh_walktree = NULL;
1225 
1226 #ifdef	_KERNEL
1227 	RADIX_NODE_HEAD_DESTROY(rnh);
1228 	FreeHead(rnh, sizeof (*rnh));
1229 #else
1230 	Free(rnh, NULL);
1231 #endif	/* _KERNEL */
1232 }
1233 
1234 void
1235 rn_fini()
1236 {
1237 	struct radix_mask *m;
1238 
1239 	if (rn_zeros != NULL) {
1240 #ifdef _KERNEL
1241 		FreeHead(rn_zeros, 2 * max_keylen);
1242 #else
1243 		Free(rn_zeros, NULL);
1244 #endif
1245 		rn_zeros = NULL;
1246 	}
1247 
1248 
1249 	if (mask_rnhead != NULL) {
1250 		rn_freehead(mask_rnhead);
1251 		mask_rnhead = NULL;
1252 	}
1253 
1254 	while ((m = rn_mkfreelist) != NULL) {
1255 		rn_mkfreelist = m->rm_mklist;
1256 		Free(m, NULL);
1257 	}
1258 }
1259