xref: /linux/net/ipv4/fib_trie.c (revision 60b2737de1b1ddfdb90f3ba622634eb49d6f3603)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  */
45 
46 #define VERSION "0.323"
47 
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/sock.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
76 
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
79 
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84 
85 static DEFINE_RWLOCK(fib_lock);
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96                      ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
101 
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
104 
105 struct node {
106         t_key key;
107 	unsigned long _parent;
108 };
109 
110 struct leaf {
111         t_key key;
112 	unsigned long _parent;
113 	struct hlist_head list;
114 };
115 
116 struct leaf_info {
117 	struct hlist_node hlist;
118 	int plen;
119 	struct list_head falh;
120 };
121 
122 struct tnode {
123         t_key key;
124 	unsigned long _parent;
125         unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
126         unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
127         unsigned short full_children;  /* KEYLENGTH bits needed */
128         unsigned short empty_children; /* KEYLENGTH bits needed */
129         struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 };
140 #endif
141 
142 struct trie_stat {
143 	unsigned int totdepth;
144 	unsigned int maxdepth;
145 	unsigned int tnodes;
146 	unsigned int leaves;
147 	unsigned int nullpointers;
148 	unsigned int nodesizes[MAX_CHILDS];
149 };
150 
151 struct trie {
152         struct node *trie;
153 #ifdef CONFIG_IP_FIB_TRIE_STATS
154 	struct trie_use_stats stats;
155 #endif
156         int size;
157 	unsigned int revision;
158 };
159 
160 static int trie_debug = 0;
161 
162 static int tnode_full(struct tnode *tn, struct node *n);
163 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165 static int tnode_child_length(struct tnode *tn);
166 static struct node *resize(struct trie *t, struct tnode *tn);
167 static struct tnode *inflate(struct trie *t, struct tnode *tn);
168 static struct tnode *halve(struct trie *t, struct tnode *tn);
169 static void tnode_free(struct tnode *tn);
170 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172 extern int fib_detect_death(struct fib_info *fi, int order,
173                             struct fib_info **last_resort, int *last_idx, int *dflt);
174 
175 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
176                struct nlmsghdr *n, struct netlink_skb_parms *req);
177 
178 static kmem_cache_t *fn_alias_kmem;
179 static struct trie *trie_local = NULL, *trie_main = NULL;
180 
181 static void trie_bug(char *err)
182 {
183 	printk("Trie Bug: %s\n", err);
184 	BUG();
185 }
186 
187 static inline struct node *tnode_get_child(struct tnode *tn, int i)
188 {
189         if (i >=  1<<tn->bits)
190                 trie_bug("tnode_get_child");
191 
192         return tn->child[i];
193 }
194 
195 static inline int tnode_child_length(struct tnode *tn)
196 {
197         return 1<<tn->bits;
198 }
199 
200 /*
201   _________________________________________________________________
202   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
203   ----------------------------------------------------------------
204     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
205 
206   _________________________________________________________________
207   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
208   -----------------------------------------------------------------
209    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
210 
211   tp->pos = 7
212   tp->bits = 3
213   n->pos = 15
214   n->bits=4
215   KEYLENGTH=32
216 */
217 
218 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
219 {
220         if (offset < KEYLENGTH)
221 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
222         else
223 		return 0;
224 }
225 
226 static inline int tkey_equals(t_key a, t_key b)
227 {
228   return a == b;
229 }
230 
231 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232 {
233      if (bits == 0 || offset >= KEYLENGTH)
234             return 1;
235         bits = bits > KEYLENGTH ? KEYLENGTH : bits;
236         return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
237 }
238 
239 static inline int tkey_mismatch(t_key a, int offset, t_key b)
240 {
241 	t_key diff = a ^ b;
242 	int i = offset;
243 
244 	if(!diff)
245 	  return 0;
246 	while((diff << i) >> (KEYLENGTH-1) == 0)
247 		i++;
248 	return i;
249 }
250 
251 /* Candiate for fib_semantics */
252 
253 static void fn_free_alias(struct fib_alias *fa)
254 {
255 	fib_release_info(fa->fa_info);
256 	kmem_cache_free(fn_alias_kmem, fa);
257 }
258 
259 /*
260   To understand this stuff, an understanding of keys and all their bits is
261   necessary. Every node in the trie has a key associated with it, but not
262   all of the bits in that key are significant.
263 
264   Consider a node 'n' and its parent 'tp'.
265 
266   If n is a leaf, every bit in its key is significant. Its presence is
267   necessitaded by path compression, since during a tree traversal (when
268   searching for a leaf - unless we are doing an insertion) we will completely
269   ignore all skipped bits we encounter. Thus we need to verify, at the end of
270   a potentially successful search, that we have indeed been walking the
271   correct key path.
272 
273   Note that we can never "miss" the correct key in the tree if present by
274   following the wrong path. Path compression ensures that segments of the key
275   that are the same for all keys with a given prefix are skipped, but the
276   skipped part *is* identical for each node in the subtrie below the skipped
277   bit! trie_insert() in this implementation takes care of that - note the
278   call to tkey_sub_equals() in trie_insert().
279 
280   if n is an internal node - a 'tnode' here, the various parts of its key
281   have many different meanings.
282 
283   Example:
284   _________________________________________________________________
285   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
286   -----------------------------------------------------------------
287     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
288 
289   _________________________________________________________________
290   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
291   -----------------------------------------------------------------
292    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
293 
294   tp->pos = 7
295   tp->bits = 3
296   n->pos = 15
297   n->bits=4
298 
299   First, let's just ignore the bits that come before the parent tp, that is
300   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
301   not use them for anything.
302 
303   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
304   index into the parent's child array. That is, they will be used to find
305   'n' among tp's children.
306 
307   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308   for the node n.
309 
310   All the bits we have seen so far are significant to the node n. The rest
311   of the bits are really not needed or indeed known in n->key.
312 
313   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
314   n's child array, and will of course be different for each child.
315 
316   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
317   at this point.
318 
319 */
320 
321 static void check_tnode(struct tnode *tn)
322 {
323 	if(tn && tn->pos+tn->bits > 32) {
324 		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
325 	}
326 }
327 
328 static int halve_threshold = 25;
329 static int inflate_threshold = 50;
330 
331 static struct leaf *leaf_new(void)
332 {
333 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
334 	if(l) {
335 		NODE_INIT_PARENT(l, T_LEAF);
336 		INIT_HLIST_HEAD(&l->list);
337 	}
338 	return l;
339 }
340 
341 static struct leaf_info *leaf_info_new(int plen)
342 {
343 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
344 	li->plen = plen;
345 	INIT_LIST_HEAD(&li->falh);
346 	return li;
347 }
348 
349 static inline void free_leaf(struct leaf *l)
350 {
351 	kfree(l);
352 }
353 
354 static inline void free_leaf_info(struct leaf_info *li)
355 {
356 	kfree(li);
357 }
358 
359 static struct tnode* tnode_new(t_key key, int pos, int bits)
360 {
361 	int nchildren = 1<<bits;
362 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
363 	struct tnode *tn = kmalloc(sz,  GFP_KERNEL);
364 
365 	if(tn)  {
366 		memset(tn, 0, sz);
367 		NODE_INIT_PARENT(tn, T_TNODE);
368 		tn->pos = pos;
369 		tn->bits = bits;
370 		tn->key = key;
371 		tn->full_children = 0;
372 		tn->empty_children = 1<<bits;
373 	}
374 	if(trie_debug > 0)
375 		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
376 		       (unsigned int) (sizeof(struct node) * 1<<bits));
377 	return tn;
378 }
379 
380 static void tnode_free(struct tnode *tn)
381 {
382 	if(!tn) {
383 		trie_bug("tnode_free\n");
384 	}
385 	if(IS_LEAF(tn)) {
386 		free_leaf((struct leaf *)tn);
387 		if(trie_debug > 0 )
388 			printk("FL %p \n", tn);
389 	}
390 	else if(IS_TNODE(tn)) {
391 		kfree(tn);
392 		if(trie_debug > 0 )
393 			printk("FT %p \n", tn);
394 	}
395 	else {
396 		trie_bug("tnode_free\n");
397 	}
398 }
399 
400 /*
401  * Check whether a tnode 'n' is "full", i.e. it is an internal node
402  * and no bits are skipped. See discussion in dyntree paper p. 6
403  */
404 
405 static inline int tnode_full(struct tnode *tn, struct node *n)
406 {
407 	if(n == NULL || IS_LEAF(n))
408 		return 0;
409 
410 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
411 }
412 
413 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
414 {
415 	tnode_put_child_reorg(tn, i, n, -1);
416 }
417 
418  /*
419   * Add a child at position i overwriting the old value.
420   * Update the value of full_children and empty_children.
421   */
422 
423 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
424 {
425 	struct node *chi;
426 	int isfull;
427 
428 	if(i >=  1<<tn->bits) {
429 		printk("bits=%d, i=%d\n", tn->bits, i);
430 		trie_bug("tnode_put_child_reorg bits");
431 	}
432 	write_lock_bh(&fib_lock);
433 	chi = tn->child[i];
434 
435 	/* update emptyChildren */
436 	if (n == NULL && chi != NULL)
437 		tn->empty_children++;
438 	else if (n != NULL && chi == NULL)
439 		tn->empty_children--;
440 
441 	/* update fullChildren */
442         if (wasfull == -1)
443 		wasfull = tnode_full(tn, chi);
444 
445 	isfull = tnode_full(tn, n);
446 	if (wasfull && !isfull)
447 		tn->full_children--;
448 
449 	else if (!wasfull && isfull)
450 		tn->full_children++;
451 	if(n)
452 		NODE_SET_PARENT(n, tn);
453 
454 	tn->child[i] = n;
455 	write_unlock_bh(&fib_lock);
456 }
457 
458 static struct node *resize(struct trie *t, struct tnode *tn)
459 {
460 	int i;
461 
462  	if (!tn)
463 		return NULL;
464 
465 	if(trie_debug)
466 		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 		      tn, inflate_threshold, halve_threshold);
468 
469 	/* No children */
470 	if (tn->empty_children == tnode_child_length(tn)) {
471 		tnode_free(tn);
472 		return NULL;
473 	}
474 	/* One child */
475 	if (tn->empty_children == tnode_child_length(tn) - 1)
476 		for (i = 0; i < tnode_child_length(tn); i++) {
477 
478 			write_lock_bh(&fib_lock);
479 			if (tn->child[i] != NULL) {
480 
481 				/* compress one level */
482 				struct node *n = tn->child[i];
483 				if(n)
484 					NODE_INIT_PARENT(n, NODE_TYPE(n));
485 
486 				write_unlock_bh(&fib_lock);
487 				tnode_free(tn);
488 				return n;
489 			}
490 			write_unlock_bh(&fib_lock);
491 		}
492 	/*
493 	 * Double as long as the resulting node has a number of
494 	 * nonempty nodes that are above the threshold.
495 	 */
496 
497 	/*
498 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 	 * Telecommunications, page 6:
501 	 * "A node is doubled if the ratio of non-empty children to all
502 	 * children in the *doubled* node is at least 'high'."
503 	 *
504 	 * 'high' in this instance is the variable 'inflate_threshold'. It
505 	 * is expressed as a percentage, so we multiply it with
506 	 * tnode_child_length() and instead of multiplying by 2 (since the
507 	 * child array will be doubled by inflate()) and multiplying
508 	 * the left-hand side by 100 (to handle the percentage thing) we
509 	 * multiply the left-hand side by 50.
510 	 *
511 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 	 * - tn->empty_children is of course the number of non-null children
513 	 * in the current node. tn->full_children is the number of "full"
514 	 * children, that is non-null tnodes with a skip value of 0.
515 	 * All of those will be doubled in the resulting inflated tnode, so
516 	 * we just count them one extra time here.
517 	 *
518 	 * A clearer way to write this would be:
519 	 *
520 	 * to_be_doubled = tn->full_children;
521 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 	 *     tn->full_children;
523 	 *
524 	 * new_child_length = tnode_child_length(tn) * 2;
525 	 *
526 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 	 *      new_child_length;
528 	 * if (new_fill_factor >= inflate_threshold)
529 	 *
530 	 * ...and so on, tho it would mess up the while() loop.
531 	 *
532 	 * anyway,
533 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 	 *      inflate_threshold
535 	 *
536 	 * avoid a division:
537 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 	 *      inflate_threshold * new_child_length
539 	 *
540 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 	 *    tn->full_children ) >= inflate_threshold * new_child_length
543 	 *
544 	 * expand new_child_length:
545 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 	 *    tn->full_children ) >=
547 	 *      inflate_threshold * tnode_child_length(tn) * 2
548 	 *
549 	 * shorten again:
550 	 * 50 * (tn->full_children + tnode_child_length(tn) -
551 	 *    tn->empty_children ) >= inflate_threshold *
552 	 *    tnode_child_length(tn)
553 	 *
554 	 */
555 
556 	check_tnode(tn);
557 
558 	while ((tn->full_children > 0 &&
559 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 				inflate_threshold * tnode_child_length(tn))) {
561 
562 		tn = inflate(t, tn);
563 	}
564 
565 	check_tnode(tn);
566 
567 	/*
568 	 * Halve as long as the number of empty children in this
569 	 * node is above threshold.
570 	 */
571 	while (tn->bits > 1 &&
572 	       100 * (tnode_child_length(tn) - tn->empty_children) <
573 	       halve_threshold * tnode_child_length(tn))
574 
575 		tn = halve(t, tn);
576 
577 	/* Only one child remains */
578 
579 	if (tn->empty_children == tnode_child_length(tn) - 1)
580 		for (i = 0; i < tnode_child_length(tn); i++) {
581 
582 			write_lock_bh(&fib_lock);
583 			if (tn->child[i] != NULL) {
584 				/* compress one level */
585 				struct node *n = tn->child[i];
586 
587 				if(n)
588 					NODE_INIT_PARENT(n, NODE_TYPE(n));
589 
590 				write_unlock_bh(&fib_lock);
591 				tnode_free(tn);
592 				return n;
593 			}
594 			write_unlock_bh(&fib_lock);
595 		}
596 
597 	return (struct node *) tn;
598 }
599 
600 static struct tnode *inflate(struct trie *t, struct tnode *tn)
601 {
602 	struct tnode *inode;
603 	struct tnode *oldtnode = tn;
604 	int olen = tnode_child_length(tn);
605 	int i;
606 
607   	if(trie_debug)
608 		printk("In inflate\n");
609 
610 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
611 
612 	if (!tn)
613 		trie_bug("tnode_new failed");
614 
615 	for(i = 0; i < olen; i++) {
616 		struct node *node = tnode_get_child(oldtnode, i);
617 
618 		/* An empty child */
619 		if (node == NULL)
620 			continue;
621 
622 		/* A leaf or an internal node with skipped bits */
623 
624 		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
625 		   tn->pos + tn->bits - 1) {
626 			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
627 					     1) == 0)
628 				put_child(t, tn, 2*i, node);
629 			else
630 				put_child(t, tn, 2*i+1, node);
631 			continue;
632 		}
633 
634 		/* An internal node with two children */
635 		inode = (struct tnode *) node;
636 
637 		if (inode->bits == 1) {
638 			put_child(t, tn, 2*i, inode->child[0]);
639 			put_child(t, tn, 2*i+1, inode->child[1]);
640 
641 			tnode_free(inode);
642 		}
643 
644 			/* An internal node with more than two children */
645 		else {
646 			struct tnode *left, *right;
647 			int size, j;
648 
649 			/* We will replace this node 'inode' with two new
650 			 * ones, 'left' and 'right', each with half of the
651 			 * original children. The two new nodes will have
652 			 * a position one bit further down the key and this
653 			 * means that the "significant" part of their keys
654 			 * (see the discussion near the top of this file)
655 			 * will differ by one bit, which will be "0" in
656 			 * left's key and "1" in right's key. Since we are
657 			 * moving the key position by one step, the bit that
658 			 * we are moving away from - the bit at position
659 			 * (inode->pos) - is the one that will differ between
660 			 * left and right. So... we synthesize that bit in the
661 			 * two  new keys.
662 			 * The mask 'm' below will be a single "one" bit at
663 			 * the position (inode->pos)
664 			 */
665 
666 			t_key m = TKEY_GET_MASK(inode->pos, 1);
667 
668 			/* Use the old key, but set the new significant
669 			 *   bit to zero.
670 			 */
671 			left = tnode_new(inode->key&(~m), inode->pos + 1,
672 					 inode->bits - 1);
673 
674 			if(!left)
675 				trie_bug("tnode_new failed");
676 
677 
678 			/* Use the old key, but set the new significant
679 			 * bit to one.
680 			 */
681 			right = tnode_new(inode->key|m, inode->pos + 1,
682 					  inode->bits - 1);
683 
684 			if(!right)
685 				trie_bug("tnode_new failed");
686 
687 			size = tnode_child_length(left);
688 			for(j = 0; j < size; j++) {
689 				put_child(t, left, j, inode->child[j]);
690 				put_child(t, right, j, inode->child[j + size]);
691 			}
692 			put_child(t, tn, 2*i, resize(t, left));
693 			put_child(t, tn, 2*i+1, resize(t, right));
694 
695 			tnode_free(inode);
696 		}
697 	}
698 	tnode_free(oldtnode);
699 	return tn;
700 }
701 
702 static struct tnode *halve(struct trie *t, struct tnode *tn)
703 {
704 	struct tnode *oldtnode = tn;
705 	struct node *left, *right;
706 	int i;
707 	int olen = tnode_child_length(tn);
708 
709 	if(trie_debug) printk("In halve\n");
710 
711 	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
712 
713 	if(!tn)
714 		trie_bug("tnode_new failed");
715 
716 	for(i = 0; i < olen; i += 2) {
717 		left = tnode_get_child(oldtnode, i);
718 		right = tnode_get_child(oldtnode, i+1);
719 
720 		/* At least one of the children is empty */
721 		if (left == NULL) {
722 			if (right == NULL)    /* Both are empty */
723 				continue;
724 			put_child(t, tn, i/2, right);
725 		} else if (right == NULL)
726 			put_child(t, tn, i/2, left);
727 
728 		/* Two nonempty children */
729 		else {
730 			struct tnode *newBinNode =
731 				tnode_new(left->key, tn->pos + tn->bits, 1);
732 
733 			if(!newBinNode)
734 				trie_bug("tnode_new failed");
735 
736 			put_child(t, newBinNode, 0, left);
737 			put_child(t, newBinNode, 1, right);
738 			put_child(t, tn, i/2, resize(t, newBinNode));
739 		}
740 	}
741 	tnode_free(oldtnode);
742 	return tn;
743 }
744 
745 static void *trie_init(struct trie *t)
746 {
747 	if(t) {
748 		t->size = 0;
749 		t->trie = NULL;
750 		t->revision = 0;
751 #ifdef CONFIG_IP_FIB_TRIE_STATS
752        		memset(&t->stats, 0, sizeof(struct trie_use_stats));
753 #endif
754 	}
755 	return t;
756 }
757 
758 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
759 {
760 	struct hlist_node *node;
761 	struct leaf_info *li;
762 
763 	hlist_for_each_entry(li, node, head, hlist) {
764 
765 		if ( li->plen == plen )
766 			return li;
767 	}
768 	return NULL;
769 }
770 
771 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
772 {
773 	struct list_head *fa_head=NULL;
774 	struct leaf_info *li = find_leaf_info(&l->list, plen);
775 
776 	if(li)
777 		fa_head = &li->falh;
778 
779 	return fa_head;
780 }
781 
782 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
783 {
784 	struct leaf_info *li=NULL, *last=NULL;
785 	struct hlist_node *node, *tmp;
786 
787 	write_lock_bh(&fib_lock);
788 
789 	if(hlist_empty(head))
790 		hlist_add_head(&new->hlist, head);
791 	else {
792 		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
793 
794 			if (new->plen > li->plen)
795 				break;
796 
797 			last = li;
798 		}
799 		if(last)
800 			hlist_add_after(&last->hlist, &new->hlist);
801 		else
802 			hlist_add_before(&new->hlist, &li->hlist);
803 	}
804 	write_unlock_bh(&fib_lock);
805 }
806 
807 static struct leaf *
808 fib_find_node(struct trie *t, u32 key)
809 {
810 	int pos;
811 	struct tnode *tn;
812 	struct node *n;
813 
814 	pos = 0;
815 	n=t->trie;
816 
817 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
818 		tn = (struct tnode *) n;
819 
820 		check_tnode(tn);
821 
822 		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
823 			pos=tn->pos + tn->bits;
824 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
825 		}
826 		else
827 			break;
828 	}
829 	/* Case we have found a leaf. Compare prefixes */
830 
831 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
832 		struct leaf *l = (struct leaf *) n;
833 		return l;
834 	}
835 	return NULL;
836 }
837 
838 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
839 {
840 	int i = 0;
841 	int wasfull;
842 	t_key cindex, key;
843 	struct tnode *tp = NULL;
844 
845 	if(!tn)
846 		BUG();
847 
848 	key = tn->key;
849 	i = 0;
850 
851 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
852 
853 		if( i > 10 ) {
854 			printk("Rebalance tn=%p \n", tn);
855 			if(tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
856 
857 			printk("Rebalance tp=%p \n", tp);
858 			if(tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
859 		}
860 
861 		if( i > 12 ) BUG();
862 		i++;
863 
864 		tp = NODE_PARENT(tn);
865 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
866 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
867 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
868 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
869 
870 		if(!NODE_PARENT(tn))
871 			break;
872 
873 		tn = NODE_PARENT(tn);
874 	}
875 	/* Handle last (top) tnode */
876 	if (IS_TNODE(tn))
877 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
878 
879 	return (struct node*) tn;
880 }
881 
882 static struct list_head *
883 fib_insert_node(struct trie *t, u32 key, int plen)
884 {
885 	int pos, newpos;
886 	struct tnode *tp = NULL, *tn = NULL;
887 	struct node *n;
888 	struct leaf *l;
889 	int missbit;
890 	struct list_head *fa_head=NULL;
891 	struct leaf_info *li;
892 	t_key cindex;
893 
894 	pos = 0;
895 	n=t->trie;
896 
897 	/* If we point to NULL, stop. Either the tree is empty and we should
898 	 * just put a new leaf in if, or we have reached an empty child slot,
899 	 * and we should just put our new leaf in that.
900 	 * If we point to a T_TNODE, check if it matches our key. Note that
901 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
902 	 * not be the parent's 'pos'+'bits'!
903 	 *
904 	 * If it does match the current key, get pos/bits from it, extract
905 	 * the index from our key, push the T_TNODE and walk the tree.
906 	 *
907 	 * If it doesn't, we have to replace it with a new T_TNODE.
908 	 *
909 	 * If we point to a T_LEAF, it might or might not have the same key
910 	 * as we do. If it does, just change the value, update the T_LEAF's
911 	 * value, and return it.
912 	 * If it doesn't, we need to replace it with a T_TNODE.
913 	 */
914 
915 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
916 		tn = (struct tnode *) n;
917 
918 		check_tnode(tn);
919 
920 		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
921 			tp = tn;
922 			pos=tn->pos + tn->bits;
923 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
924 
925 			if(n && NODE_PARENT(n) != tn) {
926 				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
927 				BUG();
928 			}
929 		}
930 		else
931 			break;
932 	}
933 
934 	/*
935 	 * n  ----> NULL, LEAF or TNODE
936 	 *
937 	 * tp is n's (parent) ----> NULL or TNODE
938 	 */
939 
940 	if(tp && IS_LEAF(tp))
941 		BUG();
942 
943 	t->revision++;
944 
945 	/* Case 1: n is a leaf. Compare prefixes */
946 
947 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
948 		struct leaf *l = ( struct leaf *)  n;
949 
950 		li = leaf_info_new(plen);
951 
952 		if(! li)
953 			BUG();
954 
955 		fa_head = &li->falh;
956 		insert_leaf_info(&l->list, li);
957 		goto done;
958 	}
959 	t->size++;
960 	l = leaf_new();
961 
962 	if(! l)
963 		BUG();
964 
965 	l->key = key;
966 	li = leaf_info_new(plen);
967 
968 	if(! li)
969 		BUG();
970 
971 	fa_head = &li->falh;
972 	insert_leaf_info(&l->list, li);
973 
974 	/* Case 2: n is NULL, and will just insert a new leaf */
975 	if (t->trie && n == NULL) {
976 
977 		NODE_SET_PARENT(l, tp);
978 
979 		if (!tp)
980 			BUG();
981 
982 		else {
983 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
984 			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
985 		}
986 	}
987 	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
988 	else {
989 		/*
990 		 *  Add a new tnode here
991 		 *  first tnode need some special handling
992 		 */
993 
994 		if (tp)
995 			pos=tp->pos+tp->bits;
996 		else
997 			pos=0;
998 		if(n) {
999 			newpos = tkey_mismatch(key, pos, n->key);
1000 			tn = tnode_new(n->key, newpos, 1);
1001 		}
1002 		else {
1003 			newpos = 0;
1004 			tn = tnode_new(key, newpos, 1); /* First tnode */
1005 		}
1006 		if(!tn)
1007 			trie_bug("tnode_pfx_new failed");
1008 
1009 		NODE_SET_PARENT(tn, tp);
1010 
1011 		missbit=tkey_extract_bits(key, newpos, 1);
1012 		put_child(t, tn, missbit, (struct node *)l);
1013 		put_child(t, tn, 1-missbit, n);
1014 
1015 		if(tp) {
1016 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1018 		}
1019 		else {
1020 			t->trie = (struct node*) tn; /* First tnode */
1021 			tp = tn;
1022 		}
1023 	}
1024 	if(tp && tp->pos+tp->bits > 32) {
1025 		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1026 		       tp, tp->pos, tp->bits, key, plen);
1027 	}
1028 	/* Rebalance the trie */
1029 	t->trie = trie_rebalance(t, tp);
1030 done:;
1031 	return fa_head;
1032 }
1033 
1034 static int
1035 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1036 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1037 {
1038 	struct trie *t = (struct trie *) tb->tb_data;
1039 	struct fib_alias *fa, *new_fa;
1040 	struct list_head *fa_head=NULL;
1041 	struct fib_info *fi;
1042 	int plen = r->rtm_dst_len;
1043 	int type = r->rtm_type;
1044 	u8 tos = r->rtm_tos;
1045 	u32 key, mask;
1046 	int err;
1047 	struct leaf *l;
1048 
1049 	if (plen > 32)
1050 		return -EINVAL;
1051 
1052 	key = 0;
1053 	if (rta->rta_dst)
1054 		memcpy(&key, rta->rta_dst, 4);
1055 
1056 	key = ntohl(key);
1057 
1058 	if(trie_debug)
1059 		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1060 
1061 	mask =  ntohl( inet_make_mask(plen) );
1062 
1063 	if(key & ~mask)
1064 		return -EINVAL;
1065 
1066 	key = key & mask;
1067 
1068 	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1069 		goto err;
1070 
1071 	l = fib_find_node(t, key);
1072 	fa = NULL;
1073 
1074 	if(l) {
1075 		fa_head = get_fa_head(l, plen);
1076 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1077 	}
1078 
1079 	/* Now fa, if non-NULL, points to the first fib alias
1080 	 * with the same keys [prefix,tos,priority], if such key already
1081 	 * exists or to the node before which we will insert new one.
1082 	 *
1083 	 * If fa is NULL, we will need to allocate a new one and
1084 	 * insert to the head of f.
1085 	 *
1086 	 * If f is NULL, no fib node matched the destination key
1087 	 * and we need to allocate a new one of those as well.
1088 	 */
1089 
1090 	if (fa &&
1091 	    fa->fa_info->fib_priority == fi->fib_priority) {
1092 		struct fib_alias *fa_orig;
1093 
1094 		err = -EEXIST;
1095 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1096 			goto out;
1097 
1098 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1099 			struct fib_info *fi_drop;
1100 			u8 state;
1101 
1102 			write_lock_bh(&fib_lock);
1103 
1104 			fi_drop = fa->fa_info;
1105 			fa->fa_info = fi;
1106 			fa->fa_type = type;
1107 			fa->fa_scope = r->rtm_scope;
1108 			state = fa->fa_state;
1109 			fa->fa_state &= ~FA_S_ACCESSED;
1110 
1111 			write_unlock_bh(&fib_lock);
1112 
1113 			fib_release_info(fi_drop);
1114 			if (state & FA_S_ACCESSED)
1115 			  rt_cache_flush(-1);
1116 
1117 			    goto succeeded;
1118 		}
1119 		/* Error if we find a perfect match which
1120 		 * uses the same scope, type, and nexthop
1121 		 * information.
1122 		 */
1123 		fa_orig = fa;
1124 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1125 			if (fa->fa_tos != tos)
1126 				break;
1127 			if (fa->fa_info->fib_priority != fi->fib_priority)
1128 				break;
1129 			if (fa->fa_type == type &&
1130 			    fa->fa_scope == r->rtm_scope &&
1131 			    fa->fa_info == fi) {
1132 				goto out;
1133 			}
1134 		}
1135 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1136 			fa = fa_orig;
1137 	}
1138 	err = -ENOENT;
1139 	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1140 		goto out;
1141 
1142 	err = -ENOBUFS;
1143 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1144 	if (new_fa == NULL)
1145 		goto out;
1146 
1147 	new_fa->fa_info = fi;
1148 	new_fa->fa_tos = tos;
1149 	new_fa->fa_type = type;
1150 	new_fa->fa_scope = r->rtm_scope;
1151 	new_fa->fa_state = 0;
1152 #if 0
1153 	new_fa->dst  = NULL;
1154 #endif
1155 	/*
1156 	 * Insert new entry to the list.
1157 	 */
1158 
1159 	if(!fa_head)
1160 		fa_head = fib_insert_node(t, key, plen);
1161 
1162 	write_lock_bh(&fib_lock);
1163 
1164 	list_add_tail(&new_fa->fa_list,
1165 		 (fa ? &fa->fa_list : fa_head));
1166 
1167 	write_unlock_bh(&fib_lock);
1168 
1169 	rt_cache_flush(-1);
1170 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171 succeeded:
1172 	return 0;
1173 out:
1174 	fib_release_info(fi);
1175 err:;
1176 	return err;
1177 }
1178 
1179 static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1180 			     struct fib_result *res, int *err)
1181 {
1182 	int i;
1183 	t_key mask;
1184 	struct leaf_info *li;
1185 	struct hlist_head *hhead = &l->list;
1186 	struct hlist_node *node;
1187 
1188 	hlist_for_each_entry(li, node, hhead, hlist) {
1189 
1190 		i = li->plen;
1191 		mask = ntohl(inet_make_mask(i));
1192 		if (l->key != (key & mask))
1193 			continue;
1194 
1195 		if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1196 			*plen = i;
1197 #ifdef CONFIG_IP_FIB_TRIE_STATS
1198 			t->stats.semantic_match_passed++;
1199 #endif
1200 			return 1;
1201 		}
1202 #ifdef CONFIG_IP_FIB_TRIE_STATS
1203 		t->stats.semantic_match_miss++;
1204 #endif
1205 	}
1206 	return 0;
1207 }
1208 
1209 static int
1210 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1211 {
1212 	struct trie *t = (struct trie *) tb->tb_data;
1213 	int plen, ret = 0;
1214 	struct node *n;
1215 	struct tnode *pn;
1216 	int pos, bits;
1217 	t_key key=ntohl(flp->fl4_dst);
1218 	int chopped_off;
1219 	t_key cindex = 0;
1220 	int current_prefix_length = KEYLENGTH;
1221 	n = t->trie;
1222 
1223 	read_lock(&fib_lock);
1224 	if(!n)
1225 		goto failed;
1226 
1227 #ifdef CONFIG_IP_FIB_TRIE_STATS
1228 	t->stats.gets++;
1229 #endif
1230 
1231 	/* Just a leaf? */
1232 	if (IS_LEAF(n)) {
1233 		if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1234 			goto found;
1235 		goto failed;
1236 	}
1237 	pn = (struct tnode *) n;
1238 	chopped_off = 0;
1239 
1240         while (pn) {
1241 
1242 		pos = pn->pos;
1243 		bits = pn->bits;
1244 
1245 		if(!chopped_off)
1246 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1247 
1248 		n = tnode_get_child(pn, cindex);
1249 
1250 		if (n == NULL) {
1251 #ifdef CONFIG_IP_FIB_TRIE_STATS
1252 			t->stats.null_node_hit++;
1253 #endif
1254 			goto backtrace;
1255 		}
1256 
1257 		if (IS_TNODE(n)) {
1258 #define HL_OPTIMIZE
1259 #ifdef HL_OPTIMIZE
1260 			struct tnode *cn = (struct tnode *)n;
1261 			t_key node_prefix, key_prefix, pref_mismatch;
1262 			int mp;
1263 
1264 			/*
1265 			 * It's a tnode, and we can do some extra checks here if we
1266 			 * like, to avoid descending into a dead-end branch.
1267 			 * This tnode is in the parent's child array at index
1268 			 * key[p_pos..p_pos+p_bits] but potentially with some bits
1269 			 * chopped off, so in reality the index may be just a
1270 			 * subprefix, padded with zero at the end.
1271 			 * We can also take a look at any skipped bits in this
1272 			 * tnode - everything up to p_pos is supposed to be ok,
1273 			 * and the non-chopped bits of the index (se previous
1274 			 * paragraph) are also guaranteed ok, but the rest is
1275 			 * considered unknown.
1276 			 *
1277 			 * The skipped bits are key[pos+bits..cn->pos].
1278 			 */
1279 
1280 			/* If current_prefix_length < pos+bits, we are already doing
1281 			 * actual prefix  matching, which means everything from
1282 			 * pos+(bits-chopped_off) onward must be zero along some
1283 			 * branch of this subtree - otherwise there is *no* valid
1284 			 * prefix present. Here we can only check the skipped
1285 			 * bits. Remember, since we have already indexed into the
1286 			 * parent's child array, we know that the bits we chopped of
1287 			 * *are* zero.
1288 			 */
1289 
1290 			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1291 
1292 			if (current_prefix_length < pos+bits) {
1293 				if (tkey_extract_bits(cn->key, current_prefix_length,
1294 						      cn->pos - current_prefix_length) != 0 ||
1295 				    !(cn->child[0]))
1296 					goto backtrace;
1297 			}
1298 
1299 			/*
1300 			 * If chopped_off=0, the index is fully validated and we
1301 			 * only need to look at the skipped bits for this, the new,
1302 			 * tnode. What we actually want to do is to find out if
1303 			 * these skipped bits match our key perfectly, or if we will
1304 			 * have to count on finding a matching prefix further down,
1305 			 * because if we do, we would like to have some way of
1306 			 * verifying the existence of such a prefix at this point.
1307 			 */
1308 
1309 			/* The only thing we can do at this point is to verify that
1310 			 * any such matching prefix can indeed be a prefix to our
1311 			 * key, and if the bits in the node we are inspecting that
1312 			 * do not match our key are not ZERO, this cannot be true.
1313 			 * Thus, find out where there is a mismatch (before cn->pos)
1314 			 * and verify that all the mismatching bits are zero in the
1315 			 * new tnode's key.
1316 			 */
1317 
1318 			/* Note: We aren't very concerned about the piece of the key
1319 			 * that precede pn->pos+pn->bits, since these have already been
1320 			 * checked. The bits after cn->pos aren't checked since these are
1321 			 * by definition "unknown" at this point. Thus, what we want to
1322 			 * see is if we are about to enter the "prefix matching" state,
1323 			 * and in that case verify that the skipped bits that will prevail
1324 			 * throughout this subtree are zero, as they have to be if we are
1325 			 * to find a matching prefix.
1326 			 */
1327 
1328 			node_prefix = MASK_PFX(cn->key, cn->pos);
1329 			key_prefix =  MASK_PFX(key, cn->pos);
1330 			pref_mismatch = key_prefix^node_prefix;
1331 			mp = 0;
1332 
1333 			/* In short: If skipped bits in this node do not match the search
1334 			 * key, enter the "prefix matching" state.directly.
1335 			 */
1336 			if (pref_mismatch) {
1337 				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1338 					mp++;
1339 					pref_mismatch = pref_mismatch <<1;
1340 				}
1341 				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1342 
1343 				if (key_prefix != 0)
1344 					goto backtrace;
1345 
1346 				if (current_prefix_length >= cn->pos)
1347 					current_prefix_length=mp;
1348 		       }
1349 #endif
1350 		       pn = (struct tnode *)n; /* Descend */
1351 		       chopped_off = 0;
1352 		       continue;
1353 		}
1354 		if (IS_LEAF(n)) {
1355 			if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1356 				goto found;
1357 	       }
1358 backtrace:
1359 		chopped_off++;
1360 
1361 		/* As zero don't change the child key (cindex) */
1362 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1363 			chopped_off++;
1364 		}
1365 
1366 		/* Decrease current_... with bits chopped off */
1367 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1368 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1369 
1370 		/*
1371 		 * Either we do the actual chop off according or if we have
1372 		 * chopped off all bits in this tnode walk up to our parent.
1373 		 */
1374 
1375 		if(chopped_off <= pn->bits)
1376 			cindex &= ~(1 << (chopped_off-1));
1377 		else {
1378 			if( NODE_PARENT(pn) == NULL)
1379 				goto failed;
1380 
1381 			/* Get Child's index */
1382 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1383 			pn = NODE_PARENT(pn);
1384 			chopped_off = 0;
1385 
1386 #ifdef CONFIG_IP_FIB_TRIE_STATS
1387 			t->stats.backtrack++;
1388 #endif
1389 			goto backtrace;
1390 		}
1391 	}
1392 failed:
1393 	ret =  1;
1394 found:
1395 	read_unlock(&fib_lock);
1396 	return ret;
1397 }
1398 
1399 static int trie_leaf_remove(struct trie *t, t_key key)
1400 {
1401 	t_key cindex;
1402 	struct tnode *tp = NULL;
1403 	struct node *n = t->trie;
1404 	struct leaf *l;
1405 
1406 	if(trie_debug)
1407 		printk("entering trie_leaf_remove(%p)\n", n);
1408 
1409 	/* Note that in the case skipped bits, those bits are *not* checked!
1410 	 * When we finish this, we will have NULL or a T_LEAF, and the
1411 	 * T_LEAF may or may not match our key.
1412 	 */
1413 
1414         while (n != NULL && IS_TNODE(n)) {
1415 		struct tnode *tn = (struct tnode *) n;
1416 		check_tnode(tn);
1417 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1418 
1419 			if(n && NODE_PARENT(n) != tn) {
1420 				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1421 				BUG();
1422 			}
1423         }
1424 	l = (struct leaf *) n;
1425 
1426 	if(!n || !tkey_equals(l->key, key))
1427 		return 0;
1428 
1429 	/*
1430 	 * Key found.
1431 	 * Remove the leaf and rebalance the tree
1432 	 */
1433 
1434 	t->revision++;
1435 	t->size--;
1436 
1437 	tp = NODE_PARENT(n);
1438 	tnode_free((struct tnode *) n);
1439 
1440 	if(tp) {
1441 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1442 		put_child(t, (struct tnode *)tp, cindex, NULL);
1443 		t->trie = trie_rebalance(t, tp);
1444 	}
1445 	else
1446 		t->trie = NULL;
1447 
1448 	return 1;
1449 }
1450 
1451 static int
1452 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1453 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1454 {
1455 	struct trie *t = (struct trie *) tb->tb_data;
1456 	u32 key, mask;
1457 	int plen = r->rtm_dst_len;
1458 	u8 tos = r->rtm_tos;
1459 	struct fib_alias *fa, *fa_to_delete;
1460 	struct list_head *fa_head;
1461 	struct leaf *l;
1462 
1463 	if (plen > 32)
1464 		return -EINVAL;
1465 
1466 	key = 0;
1467 	if (rta->rta_dst)
1468 		memcpy(&key, rta->rta_dst, 4);
1469 
1470 	key = ntohl(key);
1471 	mask =  ntohl( inet_make_mask(plen) );
1472 
1473 	if(key & ~mask)
1474 		return -EINVAL;
1475 
1476 	key = key & mask;
1477 	l = fib_find_node(t, key);
1478 
1479 	if(!l)
1480 		return -ESRCH;
1481 
1482 	fa_head = get_fa_head(l, plen);
1483 	fa = fib_find_alias(fa_head, tos, 0);
1484 
1485 	if (!fa)
1486 		return -ESRCH;
1487 
1488 	if (trie_debug)
1489 		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1490 
1491 	fa_to_delete = NULL;
1492 	fa_head = fa->fa_list.prev;
1493 	list_for_each_entry(fa, fa_head, fa_list) {
1494 		struct fib_info *fi = fa->fa_info;
1495 
1496 		if (fa->fa_tos != tos)
1497 			break;
1498 
1499 		if ((!r->rtm_type ||
1500 		     fa->fa_type == r->rtm_type) &&
1501 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1502 		     fa->fa_scope == r->rtm_scope) &&
1503 		    (!r->rtm_protocol ||
1504 		     fi->fib_protocol == r->rtm_protocol) &&
1505 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1506 			fa_to_delete = fa;
1507 			break;
1508 		}
1509 	}
1510 
1511 	if (fa_to_delete) {
1512 		int kill_li = 0;
1513 		struct leaf_info *li;
1514 
1515 		fa = fa_to_delete;
1516 		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1517 
1518 		l = fib_find_node(t, key);
1519 		li = find_leaf_info(&l->list, plen);
1520 
1521 		write_lock_bh(&fib_lock);
1522 
1523 		list_del(&fa->fa_list);
1524 
1525 		if(list_empty(fa_head)) {
1526 			hlist_del(&li->hlist);
1527 			kill_li = 1;
1528 		}
1529 		write_unlock_bh(&fib_lock);
1530 
1531 		if(kill_li)
1532 			free_leaf_info(li);
1533 
1534 		if(hlist_empty(&l->list))
1535 			trie_leaf_remove(t, key);
1536 
1537 		if (fa->fa_state & FA_S_ACCESSED)
1538 			rt_cache_flush(-1);
1539 
1540 		fn_free_alias(fa);
1541 		return 0;
1542 	}
1543 	return -ESRCH;
1544 }
1545 
1546 static int trie_flush_list(struct trie *t, struct list_head *head)
1547 {
1548 	struct fib_alias *fa, *fa_node;
1549 	int found = 0;
1550 
1551 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1552 		struct fib_info *fi = fa->fa_info;
1553 
1554 		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1555 
1556  			write_lock_bh(&fib_lock);
1557 			list_del(&fa->fa_list);
1558 			write_unlock_bh(&fib_lock);
1559 
1560 			fn_free_alias(fa);
1561 			found++;
1562 		}
1563 	}
1564 	return found;
1565 }
1566 
1567 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1568 {
1569 	int found = 0;
1570 	struct hlist_head *lih = &l->list;
1571 	struct hlist_node *node, *tmp;
1572 	struct leaf_info *li = NULL;
1573 
1574 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1575 
1576 		found += trie_flush_list(t, &li->falh);
1577 
1578 		if (list_empty(&li->falh)) {
1579 
1580  			write_lock_bh(&fib_lock);
1581 			hlist_del(&li->hlist);
1582 			write_unlock_bh(&fib_lock);
1583 
1584 			free_leaf_info(li);
1585 		}
1586 	}
1587 	return found;
1588 }
1589 
1590 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1591 {
1592 	struct node *c = (struct node *) thisleaf;
1593 	struct tnode *p;
1594 	int idx;
1595 
1596 	if(c == NULL) {
1597 		if(t->trie == NULL)
1598 			return NULL;
1599 
1600 		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1601 			return (struct leaf *) t->trie;
1602 
1603 		p = (struct tnode*) t->trie;  /* Start */
1604 	}
1605 	else
1606 		p = (struct tnode *) NODE_PARENT(c);
1607 	while (p) {
1608 		int pos, last;
1609 
1610 		/*  Find the next child of the parent */
1611 		if(c)
1612 			pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1613 		else
1614 			pos = 0;
1615 
1616 		last = 1 << p->bits;
1617 		for(idx = pos; idx < last ; idx++) {
1618 			if( p->child[idx]) {
1619 
1620 				/* Decend if tnode */
1621 
1622 				while (IS_TNODE(p->child[idx])) {
1623 					p = (struct tnode*) p->child[idx];
1624 					idx = 0;
1625 
1626 					/* Rightmost non-NULL branch */
1627 					if( p && IS_TNODE(p) )
1628 						while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1629 
1630 					/* Done with this tnode? */
1631 					if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1632 						goto up;
1633 				}
1634 				return (struct leaf*) p->child[idx];
1635 			}
1636 		}
1637 up:
1638 		/* No more children go up one step  */
1639 		c = (struct node*) p;
1640 		p = (struct tnode *) NODE_PARENT(p);
1641 	}
1642 	return NULL; /* Ready. Root of trie */
1643 }
1644 
1645 static int fn_trie_flush(struct fib_table *tb)
1646 {
1647 	struct trie *t = (struct trie *) tb->tb_data;
1648 	struct leaf *ll = NULL, *l = NULL;
1649 	int found = 0, h;
1650 
1651 	t->revision++;
1652 
1653 	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1654 		found += trie_flush_leaf(t, l);
1655 
1656 		if (ll && hlist_empty(&ll->list))
1657 			trie_leaf_remove(t, ll->key);
1658 		ll = l;
1659 	}
1660 
1661 	if (ll && hlist_empty(&ll->list))
1662 		trie_leaf_remove(t, ll->key);
1663 
1664 	if(trie_debug)
1665 		printk("trie_flush found=%d\n", found);
1666 	return found;
1667 }
1668 
1669 static int trie_last_dflt=-1;
1670 
1671 static void
1672 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1673 {
1674 	struct trie *t = (struct trie *) tb->tb_data;
1675 	int order, last_idx;
1676 	struct fib_info *fi = NULL;
1677 	struct fib_info *last_resort;
1678 	struct fib_alias *fa = NULL;
1679 	struct list_head *fa_head;
1680 	struct leaf *l;
1681 
1682 	last_idx = -1;
1683 	last_resort = NULL;
1684 	order = -1;
1685 
1686 	read_lock(&fib_lock);
1687 
1688 	l = fib_find_node(t, 0);
1689 	if(!l)
1690 		goto out;
1691 
1692 	fa_head = get_fa_head(l, 0);
1693 	if(!fa_head)
1694 		goto out;
1695 
1696 	if (list_empty(fa_head))
1697 		goto out;
1698 
1699 	list_for_each_entry(fa, fa_head, fa_list) {
1700 		struct fib_info *next_fi = fa->fa_info;
1701 
1702 		if (fa->fa_scope != res->scope ||
1703 		    fa->fa_type != RTN_UNICAST)
1704 			continue;
1705 
1706 		if (next_fi->fib_priority > res->fi->fib_priority)
1707 			break;
1708 		if (!next_fi->fib_nh[0].nh_gw ||
1709 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1710 			continue;
1711 		fa->fa_state |= FA_S_ACCESSED;
1712 
1713 		if (fi == NULL) {
1714 			if (next_fi != res->fi)
1715 				break;
1716 		} else if (!fib_detect_death(fi, order, &last_resort,
1717 					     &last_idx, &trie_last_dflt)) {
1718 			if (res->fi)
1719 				fib_info_put(res->fi);
1720 			res->fi = fi;
1721 			atomic_inc(&fi->fib_clntref);
1722 			trie_last_dflt = order;
1723 			goto out;
1724 		}
1725 		fi = next_fi;
1726 		order++;
1727 	}
1728 	if (order <= 0 || fi == NULL) {
1729 		trie_last_dflt = -1;
1730 		goto out;
1731 	}
1732 
1733 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1734 		if (res->fi)
1735 			fib_info_put(res->fi);
1736 		res->fi = fi;
1737 		atomic_inc(&fi->fib_clntref);
1738 		trie_last_dflt = order;
1739 		goto out;
1740 	}
1741 	if (last_idx >= 0) {
1742 		if (res->fi)
1743 			fib_info_put(res->fi);
1744 		res->fi = last_resort;
1745 		if (last_resort)
1746 			atomic_inc(&last_resort->fib_clntref);
1747 	}
1748 	trie_last_dflt = last_idx;
1749  out:;
1750 	read_unlock(&fib_lock);
1751 }
1752 
1753 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1754 			   struct sk_buff *skb, struct netlink_callback *cb)
1755 {
1756 	int i, s_i;
1757 	struct fib_alias *fa;
1758 
1759 	u32 xkey=htonl(key);
1760 
1761 	s_i=cb->args[3];
1762 	i = 0;
1763 
1764 	list_for_each_entry(fa, fah, fa_list) {
1765 		if (i < s_i) {
1766 			i++;
1767 			continue;
1768 		}
1769 		if (fa->fa_info->fib_nh == NULL) {
1770 			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1771 			i++;
1772 			continue;
1773 		}
1774 		if (fa->fa_info == NULL) {
1775 			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1776 			i++;
1777 			continue;
1778 		}
1779 
1780 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1781 				  cb->nlh->nlmsg_seq,
1782 				  RTM_NEWROUTE,
1783 				  tb->tb_id,
1784 				  fa->fa_type,
1785 				  fa->fa_scope,
1786 				  &xkey,
1787 				  plen,
1788 				  fa->fa_tos,
1789 				  fa->fa_info, 0) < 0) {
1790 			cb->args[3] = i;
1791 			return -1;
1792 			}
1793 		i++;
1794 	}
1795 	cb->args[3]=i;
1796 	return skb->len;
1797 }
1798 
1799 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1800 			     struct netlink_callback *cb)
1801 {
1802 	int h, s_h;
1803 	struct list_head *fa_head;
1804 	struct leaf *l = NULL;
1805 	s_h=cb->args[2];
1806 
1807 	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1808 
1809 		if (h < s_h)
1810 			continue;
1811 		if (h > s_h)
1812 			memset(&cb->args[3], 0,
1813 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1814 
1815 		fa_head = get_fa_head(l, plen);
1816 
1817 		if(!fa_head)
1818 			continue;
1819 
1820 		if(list_empty(fa_head))
1821 			continue;
1822 
1823 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1824 			cb->args[2]=h;
1825 			return -1;
1826 		}
1827 	}
1828 	cb->args[2]=h;
1829 	return skb->len;
1830 }
1831 
1832 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1833 {
1834 	int m, s_m;
1835 	struct trie *t = (struct trie *) tb->tb_data;
1836 
1837 	s_m = cb->args[1];
1838 
1839 	read_lock(&fib_lock);
1840 	for (m=0; m<=32; m++) {
1841 
1842 		if (m < s_m)
1843 			continue;
1844 		if (m > s_m)
1845 			memset(&cb->args[2], 0,
1846 			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
1847 
1848 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1849 			cb->args[1] = m;
1850 			goto out;
1851 		}
1852 	}
1853 	read_unlock(&fib_lock);
1854 	cb->args[1] = m;
1855 	return skb->len;
1856  out:
1857 	read_unlock(&fib_lock);
1858 	return -1;
1859 }
1860 
1861 /* Fix more generic FIB names for init later */
1862 
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864 struct fib_table * fib_hash_init(int id)
1865 #else
1866 struct fib_table * __init fib_hash_init(int id)
1867 #endif
1868 {
1869 	struct fib_table *tb;
1870 	struct trie *t;
1871 
1872 	if (fn_alias_kmem == NULL)
1873 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1874 						  sizeof(struct fib_alias),
1875 						  0, SLAB_HWCACHE_ALIGN,
1876 						  NULL, NULL);
1877 
1878 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1879 		     GFP_KERNEL);
1880 	if (tb == NULL)
1881 		return NULL;
1882 
1883 	tb->tb_id = id;
1884 	tb->tb_lookup = fn_trie_lookup;
1885 	tb->tb_insert = fn_trie_insert;
1886 	tb->tb_delete = fn_trie_delete;
1887 	tb->tb_flush = fn_trie_flush;
1888 	tb->tb_select_default = fn_trie_select_default;
1889 	tb->tb_dump = fn_trie_dump;
1890 	memset(tb->tb_data, 0, sizeof(struct trie));
1891 
1892 	t = (struct trie *) tb->tb_data;
1893 
1894 	trie_init(t);
1895 
1896 	if (id == RT_TABLE_LOCAL)
1897                 trie_local=t;
1898 	  else if (id == RT_TABLE_MAIN)
1899                 trie_main=t;
1900 
1901 	if (id == RT_TABLE_LOCAL)
1902 		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1903 
1904 	return tb;
1905 }
1906 
1907 /* Trie dump functions */
1908 
1909 static void putspace_seq(struct seq_file *seq, int n)
1910 {
1911 	while (n--) seq_printf(seq, " ");
1912 }
1913 
1914 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1915 {
1916 	while (bits--)
1917 		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1918 }
1919 
1920 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1921 		   int pend, int cindex, int bits)
1922 {
1923 	putspace_seq(seq, indent);
1924 	if (IS_LEAF(n))
1925 		seq_printf(seq, "|");
1926 	else
1927 		seq_printf(seq, "+");
1928 	if (bits) {
1929 		seq_printf(seq, "%d/", cindex);
1930 		printbin_seq(seq, cindex, bits);
1931 		seq_printf(seq, ": ");
1932 	}
1933 	else
1934 		seq_printf(seq, "<root>: ");
1935 	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1936 
1937 	if (IS_LEAF(n))
1938 		seq_printf(seq, "key=%d.%d.%d.%d\n",
1939 			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
1940 	else {
1941 		int plen=((struct tnode *)n)->pos;
1942 		t_key prf=MASK_PFX(n->key, plen);
1943 		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
1944 			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
1945 	}
1946 	if (IS_LEAF(n)) {
1947 		struct leaf *l=(struct leaf *)n;
1948 		struct fib_alias *fa;
1949 		int i;
1950 		for (i=32; i>=0; i--)
1951 		  if(find_leaf_info(&l->list, i)) {
1952 
1953 				struct list_head *fa_head = get_fa_head(l, i);
1954 
1955 				if(!fa_head)
1956 					continue;
1957 
1958 				if(list_empty(fa_head))
1959 					continue;
1960 
1961 				putspace_seq(seq, indent+2);
1962 				seq_printf(seq, "{/%d...dumping}\n", i);
1963 
1964 
1965 				list_for_each_entry(fa, fa_head, fa_list) {
1966 					putspace_seq(seq, indent+2);
1967 					if (fa->fa_info->fib_nh == NULL) {
1968 						seq_printf(seq, "Error _fib_nh=NULL\n");
1969 						continue;
1970 					}
1971 					if (fa->fa_info == NULL) {
1972 						seq_printf(seq, "Error fa_info=NULL\n");
1973 						continue;
1974 					}
1975 
1976 					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
1977 					      fa->fa_type,
1978 					      fa->fa_scope,
1979 					      fa->fa_tos);
1980 				}
1981 			}
1982 	}
1983 	else if (IS_TNODE(n)) {
1984 		struct tnode *tn=(struct tnode *)n;
1985 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
1986 		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
1987 		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
1988 		seq_printf(seq, "}\n");
1989 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
1990 		seq_printf(seq, "{pos=%d", tn->pos);
1991 		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
1992 		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
1993 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
1994 		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
1995 	}
1996 }
1997 
1998 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
1999 {
2000 	struct node *n=t->trie;
2001 	int cindex=0;
2002 	int indent=1;
2003 	int pend=0;
2004 	int depth = 0;
2005 
2006   	read_lock(&fib_lock);
2007 
2008 	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2009 	if (n) {
2010 		printnode_seq(seq, indent, n, pend, cindex, 0);
2011 		if (IS_TNODE(n)) {
2012 			struct tnode *tn=(struct tnode *)n;
2013 			pend = tn->pos+tn->bits;
2014 			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2015 			indent += 3;
2016 			depth++;
2017 
2018 			while (tn && cindex < (1 << tn->bits)) {
2019 				if (tn->child[cindex]) {
2020 
2021 					/* Got a child */
2022 
2023 					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2024 					if (IS_LEAF(tn->child[cindex])) {
2025 						cindex++;
2026 
2027 					}
2028 					else {
2029 						/*
2030 						 * New tnode. Decend one level
2031 						 */
2032 
2033 						depth++;
2034 						n=tn->child[cindex];
2035 						tn=(struct tnode *)n;
2036 						pend=tn->pos+tn->bits;
2037 						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2038 						indent+=3;
2039 						cindex=0;
2040 					}
2041 				}
2042 				else
2043 					cindex++;
2044 
2045 				/*
2046 				 * Test if we are done
2047 				 */
2048 
2049 				while (cindex >= (1 << tn->bits)) {
2050 
2051 					/*
2052 					 * Move upwards and test for root
2053 					 * pop off all traversed  nodes
2054 					 */
2055 
2056 					if (NODE_PARENT(tn) == NULL) {
2057 						tn = NULL;
2058 						n = NULL;
2059 						break;
2060 					}
2061 					else {
2062 						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2063 						tn = NODE_PARENT(tn);
2064 						cindex++;
2065 						n=(struct node *)tn;
2066 						pend=tn->pos+tn->bits;
2067 						indent-=3;
2068 						depth--;
2069 					}
2070 				}
2071 			}
2072 		}
2073 		else n = NULL;
2074 	}
2075 	else seq_printf(seq, "------ trie is empty\n");
2076 
2077   	read_unlock(&fib_lock);
2078 }
2079 
2080 static struct trie_stat *trie_stat_new(void)
2081 {
2082 	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2083 	int i;
2084 
2085 	if(s) {
2086 		s->totdepth = 0;
2087 		s->maxdepth = 0;
2088 		s->tnodes = 0;
2089 		s->leaves = 0;
2090 		s->nullpointers = 0;
2091 
2092 		for(i=0; i< MAX_CHILDS; i++)
2093 			s->nodesizes[i] = 0;
2094 	}
2095 	return s;
2096 }
2097 
2098 static struct trie_stat *trie_collect_stats(struct trie *t)
2099 {
2100 	struct node *n=t->trie;
2101 	struct trie_stat *s = trie_stat_new();
2102 	int cindex = 0;
2103 	int indent = 1;
2104 	int pend = 0;
2105 	int depth = 0;
2106 
2107 	read_lock(&fib_lock);
2108 
2109 	if (s) {
2110 		if (n) {
2111 			if (IS_TNODE(n)) {
2112 				struct tnode *tn = (struct tnode *)n;
2113 				pend=tn->pos+tn->bits;
2114 				indent += 3;
2115 				s->nodesizes[tn->bits]++;
2116 				depth++;
2117 
2118 				while (tn && cindex < (1 << tn->bits)) {
2119 					if (tn->child[cindex]) {
2120 						/* Got a child */
2121 
2122 						if (IS_LEAF(tn->child[cindex])) {
2123 							cindex++;
2124 
2125 							/* stats */
2126 							if (depth > s->maxdepth)
2127 								s->maxdepth = depth;
2128 							s->totdepth += depth;
2129 							s->leaves++;
2130 						}
2131 
2132 						else {
2133 							/*
2134 							 * New tnode. Decend one level
2135 							 */
2136 
2137 							s->tnodes++;
2138 							s->nodesizes[tn->bits]++;
2139 							depth++;
2140 
2141 							n = tn->child[cindex];
2142 							tn = (struct tnode *)n;
2143 							pend = tn->pos+tn->bits;
2144 
2145 							indent += 3;
2146 							cindex = 0;
2147 						}
2148 					}
2149 					else {
2150 						cindex++;
2151 						s->nullpointers++;
2152 					}
2153 
2154 					/*
2155 					 * Test if we are done
2156 					 */
2157 
2158 					while (cindex >= (1 << tn->bits)) {
2159 
2160 						/*
2161 						 * Move upwards and test for root
2162 						 * pop off all traversed  nodes
2163 						 */
2164 
2165 
2166 						if (NODE_PARENT(tn) == NULL) {
2167 							tn = NULL;
2168 							n = NULL;
2169 							break;
2170 						}
2171 						else {
2172 							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2173 							tn = NODE_PARENT(tn);
2174 							cindex++;
2175 							n = (struct node *)tn;
2176 							pend=tn->pos+tn->bits;
2177 							indent -= 3;
2178 							depth--;
2179 						}
2180  					}
2181 				}
2182 			}
2183 			else n = NULL;
2184 		}
2185 	}
2186 
2187 	read_unlock(&fib_lock);
2188 	return s;
2189 }
2190 
2191 #ifdef CONFIG_PROC_FS
2192 
2193 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2194 {
2195 	return NULL;
2196 }
2197 
2198 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2199 {
2200 	return NULL;
2201 }
2202 
2203 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2204 {
2205 	void *v = NULL;
2206 
2207 	if (ip_fib_main_table)
2208 		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2209 	return v;
2210 }
2211 
2212 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213 {
2214 	++*pos;
2215 	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2216 }
2217 
2218 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2219 {
2220 
2221 }
2222 
2223 /*
2224  *	This outputs /proc/net/fib_triestats
2225  *
2226  *	It always works in backward compatibility mode.
2227  *	The format of the file is not supposed to be changed.
2228  */
2229 
2230 static void collect_and_show(struct trie *t, struct seq_file *seq)
2231 {
2232 	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2233 	int i, max, pointers;
2234         struct trie_stat *stat;
2235 	int avdepth;
2236 
2237 	stat = trie_collect_stats(t);
2238 
2239 	bytes=0;
2240 	seq_printf(seq, "trie=%p\n", t);
2241 
2242 	if (stat) {
2243 		if (stat->leaves)
2244 			avdepth=stat->totdepth*100 / stat->leaves;
2245 		else
2246 			avdepth=0;
2247 		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2248 		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2249 
2250 		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2251 		bytes += sizeof(struct leaf) * stat->leaves;
2252 		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2253 		bytes += sizeof(struct tnode) * stat->tnodes;
2254 
2255 		max = MAX_CHILDS-1;
2256 
2257 		while (max >= 0 && stat->nodesizes[max] == 0)
2258 			max--;
2259 		pointers = 0;
2260 
2261 		for (i = 1; i <= max; i++)
2262 			if (stat->nodesizes[i] != 0) {
2263 				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2264 				pointers += (1<<i) * stat->nodesizes[i];
2265 			}
2266 		seq_printf(seq, "\n");
2267 		seq_printf(seq, "Pointers: %d\n", pointers);
2268 		bytes += sizeof(struct node *) * pointers;
2269 		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2270 		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2271 
2272 		kfree(stat);
2273 	}
2274 
2275 #ifdef CONFIG_IP_FIB_TRIE_STATS
2276 	seq_printf(seq, "Counters:\n---------\n");
2277 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2278 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2279 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2282 #ifdef CLEAR_STATS
2283 	memset(&(t->stats), 0, sizeof(t->stats));
2284 #endif
2285 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2286 }
2287 
2288 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2289 {
2290 	char bf[128];
2291 
2292 	if (v == SEQ_START_TOKEN) {
2293 		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2294 			   sizeof(struct leaf), sizeof(struct tnode));
2295 		if (trie_local)
2296 			collect_and_show(trie_local, seq);
2297 
2298 		if (trie_main)
2299 			collect_and_show(trie_main, seq);
2300 	}
2301 	else {
2302 		snprintf(bf, sizeof(bf),
2303 			 "*\t%08X\t%08X", 200, 400);
2304 
2305 		seq_printf(seq, "%-127s\n", bf);
2306 	}
2307 	return 0;
2308 }
2309 
2310 static struct seq_operations fib_triestat_seq_ops = {
2311 	.start  = fib_triestat_seq_start,
2312 	.next   = fib_triestat_seq_next,
2313 	.stop   = fib_triestat_seq_stop,
2314 	.show   = fib_triestat_seq_show,
2315 };
2316 
2317 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2318 {
2319 	struct seq_file *seq;
2320 	int rc = -ENOMEM;
2321 
2322 	rc = seq_open(file, &fib_triestat_seq_ops);
2323 	if (rc)
2324 		goto out_kfree;
2325 
2326 	seq	     = file->private_data;
2327 out:
2328 	return rc;
2329 out_kfree:
2330 	goto out;
2331 }
2332 
2333 static struct file_operations fib_triestat_seq_fops = {
2334 	.owner		= THIS_MODULE,
2335 	.open           = fib_triestat_seq_open,
2336 	.read           = seq_read,
2337 	.llseek         = seq_lseek,
2338 	.release	= seq_release_private,
2339 };
2340 
2341 int __init fib_stat_proc_init(void)
2342 {
2343 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2344 		return -ENOMEM;
2345 	return 0;
2346 }
2347 
2348 void __init fib_stat_proc_exit(void)
2349 {
2350 	proc_net_remove("fib_triestat");
2351 }
2352 
2353 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2354 {
2355 	return NULL;
2356 }
2357 
2358 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2359 {
2360 	return NULL;
2361 }
2362 
2363 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2364 {
2365 	void *v = NULL;
2366 
2367 	if (ip_fib_main_table)
2368 		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2369 	return v;
2370 }
2371 
2372 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2373 {
2374 	++*pos;
2375 	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2376 }
2377 
2378 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2379 {
2380 
2381 }
2382 
2383 /*
2384  *	This outputs /proc/net/fib_trie.
2385  *
2386  *	It always works in backward compatibility mode.
2387  *	The format of the file is not supposed to be changed.
2388  */
2389 
2390 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391 {
2392 	char bf[128];
2393 
2394 	if (v == SEQ_START_TOKEN) {
2395 		if (trie_local)
2396 			trie_dump_seq(seq, trie_local);
2397 
2398 		if (trie_main)
2399 			trie_dump_seq(seq, trie_main);
2400 	}
2401 
2402 	else {
2403 		snprintf(bf, sizeof(bf),
2404 			 "*\t%08X\t%08X", 200, 400);
2405 		seq_printf(seq, "%-127s\n", bf);
2406 	}
2407 
2408 	return 0;
2409 }
2410 
2411 static struct seq_operations fib_trie_seq_ops = {
2412 	.start  = fib_trie_seq_start,
2413 	.next   = fib_trie_seq_next,
2414 	.stop   = fib_trie_seq_stop,
2415 	.show   = fib_trie_seq_show,
2416 };
2417 
2418 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2419 {
2420 	struct seq_file *seq;
2421 	int rc = -ENOMEM;
2422 
2423 	rc = seq_open(file, &fib_trie_seq_ops);
2424 	if (rc)
2425 		goto out_kfree;
2426 
2427 	seq	     = file->private_data;
2428 out:
2429 	return rc;
2430 out_kfree:
2431 	goto out;
2432 }
2433 
2434 static struct file_operations fib_trie_seq_fops = {
2435 	.owner		= THIS_MODULE,
2436 	.open           = fib_trie_seq_open,
2437 	.read           = seq_read,
2438 	.llseek         = seq_lseek,
2439 	.release	= seq_release_private,
2440 };
2441 
2442 int __init fib_proc_init(void)
2443 {
2444 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2445 		return -ENOMEM;
2446 	return 0;
2447 }
2448 
2449 void __init fib_proc_exit(void)
2450 {
2451 	proc_net_remove("fib_trie");
2452 }
2453 
2454 #endif /* CONFIG_PROC_FS */
2455