xref: /linux/net/ipv4/fib_trie.c (revision d67b569f5f620c0fb95d5212642746b7ba9d29e4)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  */
45 
46 #define VERSION "0.325"
47 
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/sock.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
76 
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
79 
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84 
85 static DEFINE_RWLOCK(fib_lock);
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_PARENT(_node) \
93 ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(_node, _ptr) \
95 ((_node)->_parent = (((unsigned long)(_ptr)) | \
96                      ((_node)->_parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(_node, _type) \
98 ((_node)->_parent = (_type))
99 #define NODE_TYPE(_node) \
100 ((_node)->_parent & NODE_TYPE_MASK)
101 
102 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
103 #define IS_LEAF(n) (n->_parent & T_LEAF)
104 
105 struct node {
106         t_key key;
107 	unsigned long _parent;
108 };
109 
110 struct leaf {
111         t_key key;
112 	unsigned long _parent;
113 	struct hlist_head list;
114 };
115 
116 struct leaf_info {
117 	struct hlist_node hlist;
118 	int plen;
119 	struct list_head falh;
120 };
121 
122 struct tnode {
123         t_key key;
124 	unsigned long _parent;
125         unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
126         unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
127         unsigned short full_children;  /* KEYLENGTH bits needed */
128         unsigned short empty_children; /* KEYLENGTH bits needed */
129         struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int nodesizes[MAX_CHILDS];
150 };
151 
152 struct trie {
153         struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 	struct trie_use_stats stats;
156 #endif
157         int size;
158 	unsigned int revision;
159 };
160 
161 static int trie_debug = 0;
162 
163 static int tnode_full(struct tnode *tn, struct node *n);
164 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
166 static int tnode_child_length(struct tnode *tn);
167 static struct node *resize(struct trie *t, struct tnode *tn);
168 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
169 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
170 static void tnode_free(struct tnode *tn);
171 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
172 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
173 extern int fib_detect_death(struct fib_info *fi, int order,
174                             struct fib_info **last_resort, int *last_idx, int *dflt);
175 
176 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
177                struct nlmsghdr *n, struct netlink_skb_parms *req);
178 
179 static kmem_cache_t *fn_alias_kmem;
180 static struct trie *trie_local = NULL, *trie_main = NULL;
181 
182 static void trie_bug(char *err)
183 {
184 	printk("Trie Bug: %s\n", err);
185 	BUG();
186 }
187 
188 static inline struct node *tnode_get_child(struct tnode *tn, int i)
189 {
190         if (i >=  1<<tn->bits)
191                 trie_bug("tnode_get_child");
192 
193         return tn->child[i];
194 }
195 
196 static inline int tnode_child_length(struct tnode *tn)
197 {
198         return 1<<tn->bits;
199 }
200 
201 /*
202   _________________________________________________________________
203   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
204   ----------------------------------------------------------------
205     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
206 
207   _________________________________________________________________
208   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
209   -----------------------------------------------------------------
210    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
211 
212   tp->pos = 7
213   tp->bits = 3
214   n->pos = 15
215   n->bits=4
216   KEYLENGTH=32
217 */
218 
219 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
220 {
221         if (offset < KEYLENGTH)
222 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
223         else
224 		return 0;
225 }
226 
227 static inline int tkey_equals(t_key a, t_key b)
228 {
229   return a == b;
230 }
231 
232 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
233 {
234      if (bits == 0 || offset >= KEYLENGTH)
235             return 1;
236         bits = bits > KEYLENGTH ? KEYLENGTH : bits;
237         return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
238 }
239 
240 static inline int tkey_mismatch(t_key a, int offset, t_key b)
241 {
242 	t_key diff = a ^ b;
243 	int i = offset;
244 
245 	if(!diff)
246 	  return 0;
247 	while((diff << i) >> (KEYLENGTH-1) == 0)
248 		i++;
249 	return i;
250 }
251 
252 /* Candiate for fib_semantics */
253 
254 static void fn_free_alias(struct fib_alias *fa)
255 {
256 	fib_release_info(fa->fa_info);
257 	kmem_cache_free(fn_alias_kmem, fa);
258 }
259 
260 /*
261   To understand this stuff, an understanding of keys and all their bits is
262   necessary. Every node in the trie has a key associated with it, but not
263   all of the bits in that key are significant.
264 
265   Consider a node 'n' and its parent 'tp'.
266 
267   If n is a leaf, every bit in its key is significant. Its presence is
268   necessitaded by path compression, since during a tree traversal (when
269   searching for a leaf - unless we are doing an insertion) we will completely
270   ignore all skipped bits we encounter. Thus we need to verify, at the end of
271   a potentially successful search, that we have indeed been walking the
272   correct key path.
273 
274   Note that we can never "miss" the correct key in the tree if present by
275   following the wrong path. Path compression ensures that segments of the key
276   that are the same for all keys with a given prefix are skipped, but the
277   skipped part *is* identical for each node in the subtrie below the skipped
278   bit! trie_insert() in this implementation takes care of that - note the
279   call to tkey_sub_equals() in trie_insert().
280 
281   if n is an internal node - a 'tnode' here, the various parts of its key
282   have many different meanings.
283 
284   Example:
285   _________________________________________________________________
286   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287   -----------------------------------------------------------------
288     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
289 
290   _________________________________________________________________
291   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292   -----------------------------------------------------------------
293    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
294 
295   tp->pos = 7
296   tp->bits = 3
297   n->pos = 15
298   n->bits=4
299 
300   First, let's just ignore the bits that come before the parent tp, that is
301   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302   not use them for anything.
303 
304   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305   index into the parent's child array. That is, they will be used to find
306   'n' among tp's children.
307 
308   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309   for the node n.
310 
311   All the bits we have seen so far are significant to the node n. The rest
312   of the bits are really not needed or indeed known in n->key.
313 
314   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315   n's child array, and will of course be different for each child.
316 
317   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
318   at this point.
319 
320 */
321 
322 static void check_tnode(struct tnode *tn)
323 {
324 	if(tn && tn->pos+tn->bits > 32) {
325 		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
326 	}
327 }
328 
329 static int halve_threshold = 25;
330 static int inflate_threshold = 50;
331 
332 static struct leaf *leaf_new(void)
333 {
334 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
335 	if(l) {
336 		NODE_INIT_PARENT(l, T_LEAF);
337 		INIT_HLIST_HEAD(&l->list);
338 	}
339 	return l;
340 }
341 
342 static struct leaf_info *leaf_info_new(int plen)
343 {
344 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
345 	if(li) {
346 		li->plen = plen;
347 		INIT_LIST_HEAD(&li->falh);
348 	}
349 	return li;
350 }
351 
352 static inline void free_leaf(struct leaf *l)
353 {
354 	kfree(l);
355 }
356 
357 static inline void free_leaf_info(struct leaf_info *li)
358 {
359 	kfree(li);
360 }
361 
362 static struct tnode *tnode_alloc(unsigned int size)
363 {
364 	if (size <= PAGE_SIZE) {
365 		return kmalloc(size, GFP_KERNEL);
366 	} else {
367 		return (struct tnode *)
368 		       __get_free_pages(GFP_KERNEL, get_order(size));
369 	}
370 }
371 
372 static void __tnode_free(struct tnode *tn)
373 {
374 	unsigned int size = sizeof(struct tnode) +
375 	                    (1<<tn->bits) * sizeof(struct node *);
376 
377 	if (size <= PAGE_SIZE)
378 		kfree(tn);
379 	else
380 		free_pages((unsigned long)tn, get_order(size));
381 }
382 
383 static struct tnode* tnode_new(t_key key, int pos, int bits)
384 {
385 	int nchildren = 1<<bits;
386 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
387 	struct tnode *tn = tnode_alloc(sz);
388 
389 	if(tn)  {
390 		memset(tn, 0, sz);
391 		NODE_INIT_PARENT(tn, T_TNODE);
392 		tn->pos = pos;
393 		tn->bits = bits;
394 		tn->key = key;
395 		tn->full_children = 0;
396 		tn->empty_children = 1<<bits;
397 	}
398 	if(trie_debug > 0)
399 		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400 		       (unsigned int) (sizeof(struct node) * 1<<bits));
401 	return tn;
402 }
403 
404 static void tnode_free(struct tnode *tn)
405 {
406 	if(!tn) {
407 		trie_bug("tnode_free\n");
408 	}
409 	if(IS_LEAF(tn)) {
410 		free_leaf((struct leaf *)tn);
411 		if(trie_debug > 0 )
412 			printk("FL %p \n", tn);
413 	}
414 	else if(IS_TNODE(tn)) {
415 		__tnode_free(tn);
416 		if(trie_debug > 0 )
417 			printk("FT %p \n", tn);
418 	}
419 	else {
420 		trie_bug("tnode_free\n");
421 	}
422 }
423 
424 /*
425  * Check whether a tnode 'n' is "full", i.e. it is an internal node
426  * and no bits are skipped. See discussion in dyntree paper p. 6
427  */
428 
429 static inline int tnode_full(struct tnode *tn, struct node *n)
430 {
431 	if(n == NULL || IS_LEAF(n))
432 		return 0;
433 
434 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
435 }
436 
437 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
438 {
439 	tnode_put_child_reorg(tn, i, n, -1);
440 }
441 
442  /*
443   * Add a child at position i overwriting the old value.
444   * Update the value of full_children and empty_children.
445   */
446 
447 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
448 {
449 	struct node *chi;
450 	int isfull;
451 
452 	if(i >=  1<<tn->bits) {
453 		printk("bits=%d, i=%d\n", tn->bits, i);
454 		trie_bug("tnode_put_child_reorg bits");
455 	}
456 	write_lock_bh(&fib_lock);
457 	chi = tn->child[i];
458 
459 	/* update emptyChildren */
460 	if (n == NULL && chi != NULL)
461 		tn->empty_children++;
462 	else if (n != NULL && chi == NULL)
463 		tn->empty_children--;
464 
465 	/* update fullChildren */
466         if (wasfull == -1)
467 		wasfull = tnode_full(tn, chi);
468 
469 	isfull = tnode_full(tn, n);
470 	if (wasfull && !isfull)
471 		tn->full_children--;
472 
473 	else if (!wasfull && isfull)
474 		tn->full_children++;
475 	if(n)
476 		NODE_SET_PARENT(n, tn);
477 
478 	tn->child[i] = n;
479 	write_unlock_bh(&fib_lock);
480 }
481 
482 static struct node *resize(struct trie *t, struct tnode *tn)
483 {
484 	int i;
485 	int err = 0;
486 
487  	if (!tn)
488 		return NULL;
489 
490 	if(trie_debug)
491 		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
492 		      tn, inflate_threshold, halve_threshold);
493 
494 	/* No children */
495 	if (tn->empty_children == tnode_child_length(tn)) {
496 		tnode_free(tn);
497 		return NULL;
498 	}
499 	/* One child */
500 	if (tn->empty_children == tnode_child_length(tn) - 1)
501 		for (i = 0; i < tnode_child_length(tn); i++) {
502 
503 			write_lock_bh(&fib_lock);
504 			if (tn->child[i] != NULL) {
505 
506 				/* compress one level */
507 				struct node *n = tn->child[i];
508 				if(n)
509 					NODE_INIT_PARENT(n, NODE_TYPE(n));
510 
511 				write_unlock_bh(&fib_lock);
512 				tnode_free(tn);
513 				return n;
514 			}
515 			write_unlock_bh(&fib_lock);
516 		}
517 	/*
518 	 * Double as long as the resulting node has a number of
519 	 * nonempty nodes that are above the threshold.
520 	 */
521 
522 	/*
523 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525 	 * Telecommunications, page 6:
526 	 * "A node is doubled if the ratio of non-empty children to all
527 	 * children in the *doubled* node is at least 'high'."
528 	 *
529 	 * 'high' in this instance is the variable 'inflate_threshold'. It
530 	 * is expressed as a percentage, so we multiply it with
531 	 * tnode_child_length() and instead of multiplying by 2 (since the
532 	 * child array will be doubled by inflate()) and multiplying
533 	 * the left-hand side by 100 (to handle the percentage thing) we
534 	 * multiply the left-hand side by 50.
535 	 *
536 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
537 	 * - tn->empty_children is of course the number of non-null children
538 	 * in the current node. tn->full_children is the number of "full"
539 	 * children, that is non-null tnodes with a skip value of 0.
540 	 * All of those will be doubled in the resulting inflated tnode, so
541 	 * we just count them one extra time here.
542 	 *
543 	 * A clearer way to write this would be:
544 	 *
545 	 * to_be_doubled = tn->full_children;
546 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
547 	 *     tn->full_children;
548 	 *
549 	 * new_child_length = tnode_child_length(tn) * 2;
550 	 *
551 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
552 	 *      new_child_length;
553 	 * if (new_fill_factor >= inflate_threshold)
554 	 *
555 	 * ...and so on, tho it would mess up the while() loop.
556 	 *
557 	 * anyway,
558 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
559 	 *      inflate_threshold
560 	 *
561 	 * avoid a division:
562 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563 	 *      inflate_threshold * new_child_length
564 	 *
565 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
566 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
567 	 *    tn->full_children ) >= inflate_threshold * new_child_length
568 	 *
569 	 * expand new_child_length:
570 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
571 	 *    tn->full_children ) >=
572 	 *      inflate_threshold * tnode_child_length(tn) * 2
573 	 *
574 	 * shorten again:
575 	 * 50 * (tn->full_children + tnode_child_length(tn) -
576 	 *    tn->empty_children ) >= inflate_threshold *
577 	 *    tnode_child_length(tn)
578 	 *
579 	 */
580 
581 	check_tnode(tn);
582 
583 	err = 0;
584 	while ((tn->full_children > 0 &&
585 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
586 				inflate_threshold * tnode_child_length(tn))) {
587 
588 		tn = inflate(t, tn, &err);
589 
590 		if(err) {
591 #ifdef CONFIG_IP_FIB_TRIE_STATS
592 			t->stats.resize_node_skipped++;
593 #endif
594 			break;
595 		}
596 	}
597 
598 	check_tnode(tn);
599 
600 	/*
601 	 * Halve as long as the number of empty children in this
602 	 * node is above threshold.
603 	 */
604 
605 	err = 0;
606 	while (tn->bits > 1 &&
607 	       100 * (tnode_child_length(tn) - tn->empty_children) <
608 	       halve_threshold * tnode_child_length(tn)) {
609 
610 		tn = halve(t, tn, &err);
611 
612 		if(err) {
613 #ifdef CONFIG_IP_FIB_TRIE_STATS
614 			t->stats.resize_node_skipped++;
615 #endif
616 			break;
617 		}
618 	}
619 
620 
621 	/* Only one child remains */
622 
623 	if (tn->empty_children == tnode_child_length(tn) - 1)
624 		for (i = 0; i < tnode_child_length(tn); i++) {
625 
626 			write_lock_bh(&fib_lock);
627 			if (tn->child[i] != NULL) {
628 				/* compress one level */
629 				struct node *n = tn->child[i];
630 
631 				if(n)
632 					NODE_INIT_PARENT(n, NODE_TYPE(n));
633 
634 				write_unlock_bh(&fib_lock);
635 				tnode_free(tn);
636 				return n;
637 			}
638 			write_unlock_bh(&fib_lock);
639 		}
640 
641 	return (struct node *) tn;
642 }
643 
644 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
645 {
646 	struct tnode *inode;
647 	struct tnode *oldtnode = tn;
648 	int olen = tnode_child_length(tn);
649 	int i;
650 
651   	if(trie_debug)
652 		printk("In inflate\n");
653 
654 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
655 
656 	if (!tn) {
657 		*err = -ENOMEM;
658 		return oldtnode;
659 	}
660 
661 	/*
662 	 * Preallocate and store tnodes before the actual work so we
663 	 * don't get into an inconsistent state if memory allocation
664 	 * fails. In case of failure we return the oldnode and  inflate
665 	 * of tnode is ignored.
666 	 */
667 
668 	for(i = 0; i < olen; i++) {
669 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670 
671 		if (inode &&
672 		    IS_TNODE(inode) &&
673 		    inode->pos == oldtnode->pos + oldtnode->bits &&
674 		    inode->bits > 1) {
675 			struct tnode *left, *right;
676 
677 			t_key m = TKEY_GET_MASK(inode->pos, 1);
678 
679 			left = tnode_new(inode->key&(~m), inode->pos + 1,
680 					 inode->bits - 1);
681 
682 			if(!left) {
683 				*err = -ENOMEM;
684 				break;
685 			}
686 
687 			right = tnode_new(inode->key|m, inode->pos + 1,
688 					  inode->bits - 1);
689 
690 			if(!right) {
691 				*err = -ENOMEM;
692 				break;
693 			}
694 
695 			put_child(t, tn, 2*i, (struct node *) left);
696 			put_child(t, tn, 2*i+1, (struct node *) right);
697 		}
698 	}
699 
700 	if(*err) {
701 		int size = tnode_child_length(tn);
702 		int j;
703 
704 		for(j = 0; j < size; j++)
705 			if( tn->child[j])
706 				tnode_free((struct tnode *)tn->child[j]);
707 
708 		tnode_free(tn);
709 
710 		*err = -ENOMEM;
711 		return oldtnode;
712 	}
713 
714 	for(i = 0; i < olen; i++) {
715 		struct node *node = tnode_get_child(oldtnode, i);
716 
717 		/* An empty child */
718 		if (node == NULL)
719 			continue;
720 
721 		/* A leaf or an internal node with skipped bits */
722 
723 		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
724 		   tn->pos + tn->bits - 1) {
725 			if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
726 					     1) == 0)
727 				put_child(t, tn, 2*i, node);
728 			else
729 				put_child(t, tn, 2*i+1, node);
730 			continue;
731 		}
732 
733 		/* An internal node with two children */
734 		inode = (struct tnode *) node;
735 
736 		if (inode->bits == 1) {
737 			put_child(t, tn, 2*i, inode->child[0]);
738 			put_child(t, tn, 2*i+1, inode->child[1]);
739 
740 			tnode_free(inode);
741 		}
742 
743 			/* An internal node with more than two children */
744 		else {
745 			struct tnode *left, *right;
746 			int size, j;
747 
748 			/* We will replace this node 'inode' with two new
749 			 * ones, 'left' and 'right', each with half of the
750 			 * original children. The two new nodes will have
751 			 * a position one bit further down the key and this
752 			 * means that the "significant" part of their keys
753 			 * (see the discussion near the top of this file)
754 			 * will differ by one bit, which will be "0" in
755 			 * left's key and "1" in right's key. Since we are
756 			 * moving the key position by one step, the bit that
757 			 * we are moving away from - the bit at position
758 			 * (inode->pos) - is the one that will differ between
759 			 * left and right. So... we synthesize that bit in the
760 			 * two  new keys.
761 			 * The mask 'm' below will be a single "one" bit at
762 			 * the position (inode->pos)
763 			 */
764 
765 			/* Use the old key, but set the new significant
766 			 *   bit to zero.
767 			 */
768 
769 			left = (struct tnode *) tnode_get_child(tn, 2*i);
770 			put_child(t, tn, 2*i, NULL);
771 
772 			if(!left)
773 				BUG();
774 
775 			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
776 			put_child(t, tn, 2*i+1, NULL);
777 
778 			if(!right)
779 				BUG();
780 
781 			size = tnode_child_length(left);
782 			for(j = 0; j < size; j++) {
783 				put_child(t, left, j, inode->child[j]);
784 				put_child(t, right, j, inode->child[j + size]);
785 			}
786 			put_child(t, tn, 2*i, resize(t, left));
787 			put_child(t, tn, 2*i+1, resize(t, right));
788 
789 			tnode_free(inode);
790 		}
791 	}
792 	tnode_free(oldtnode);
793 	return tn;
794 }
795 
796 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
797 {
798 	struct tnode *oldtnode = tn;
799 	struct node *left, *right;
800 	int i;
801 	int olen = tnode_child_length(tn);
802 
803 	if(trie_debug) printk("In halve\n");
804 
805 	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806 
807 	if (!tn) {
808 		*err = -ENOMEM;
809 		return oldtnode;
810 	}
811 
812 	/*
813 	 * Preallocate and store tnodes before the actual work so we
814 	 * don't get into an inconsistent state if memory allocation
815 	 * fails. In case of failure we return the oldnode and halve
816 	 * of tnode is ignored.
817 	 */
818 
819 	for(i = 0; i < olen; i += 2) {
820 		left = tnode_get_child(oldtnode, i);
821 		right = tnode_get_child(oldtnode, i+1);
822 
823 		/* Two nonempty children */
824 		if( left && right)  {
825 			struct tnode *newBinNode =
826 				tnode_new(left->key, tn->pos + tn->bits, 1);
827 
828 			if(!newBinNode) {
829 				*err = -ENOMEM;
830 				break;
831 			}
832 			put_child(t, tn, i/2, (struct node *)newBinNode);
833 		}
834 	}
835 
836 	if(*err) {
837 		int size = tnode_child_length(tn);
838 		int j;
839 
840 		for(j = 0; j < size; j++)
841 			if( tn->child[j])
842 				tnode_free((struct tnode *)tn->child[j]);
843 
844 		tnode_free(tn);
845 
846 		*err = -ENOMEM;
847 		return oldtnode;
848 	}
849 
850 	for(i = 0; i < olen; i += 2) {
851 		left = tnode_get_child(oldtnode, i);
852 		right = tnode_get_child(oldtnode, i+1);
853 
854 		/* At least one of the children is empty */
855 		if (left == NULL) {
856 			if (right == NULL)    /* Both are empty */
857 				continue;
858 			put_child(t, tn, i/2, right);
859 		} else if (right == NULL)
860 			put_child(t, tn, i/2, left);
861 
862 		/* Two nonempty children */
863 		else {
864 			struct tnode *newBinNode =
865 				(struct tnode *) tnode_get_child(tn, i/2);
866 			put_child(t, tn, i/2, NULL);
867 
868 			if(!newBinNode)
869 				BUG();
870 
871 			put_child(t, newBinNode, 0, left);
872 			put_child(t, newBinNode, 1, right);
873 			put_child(t, tn, i/2, resize(t, newBinNode));
874 		}
875 	}
876 	tnode_free(oldtnode);
877 	return tn;
878 }
879 
880 static void *trie_init(struct trie *t)
881 {
882 	if(t) {
883 		t->size = 0;
884 		t->trie = NULL;
885 		t->revision = 0;
886 #ifdef CONFIG_IP_FIB_TRIE_STATS
887        		memset(&t->stats, 0, sizeof(struct trie_use_stats));
888 #endif
889 	}
890 	return t;
891 }
892 
893 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
894 {
895 	struct hlist_node *node;
896 	struct leaf_info *li;
897 
898 	hlist_for_each_entry(li, node, head, hlist) {
899 
900 		if ( li->plen == plen )
901 			return li;
902 	}
903 	return NULL;
904 }
905 
906 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
907 {
908 	struct list_head *fa_head=NULL;
909 	struct leaf_info *li = find_leaf_info(&l->list, plen);
910 
911 	if(li)
912 		fa_head = &li->falh;
913 
914 	return fa_head;
915 }
916 
917 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
918 {
919 	struct leaf_info *li=NULL, *last=NULL;
920 	struct hlist_node *node, *tmp;
921 
922 	write_lock_bh(&fib_lock);
923 
924 	if(hlist_empty(head))
925 		hlist_add_head(&new->hlist, head);
926 	else {
927 		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
928 
929 			if (new->plen > li->plen)
930 				break;
931 
932 			last = li;
933 		}
934 		if(last)
935 			hlist_add_after(&last->hlist, &new->hlist);
936 		else
937 			hlist_add_before(&new->hlist, &li->hlist);
938 	}
939 	write_unlock_bh(&fib_lock);
940 }
941 
942 static struct leaf *
943 fib_find_node(struct trie *t, u32 key)
944 {
945 	int pos;
946 	struct tnode *tn;
947 	struct node *n;
948 
949 	pos = 0;
950 	n=t->trie;
951 
952 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
953 		tn = (struct tnode *) n;
954 
955 		check_tnode(tn);
956 
957 		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958 			pos=tn->pos + tn->bits;
959 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960 		}
961 		else
962 			break;
963 	}
964 	/* Case we have found a leaf. Compare prefixes */
965 
966 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
967 		struct leaf *l = (struct leaf *) n;
968 		return l;
969 	}
970 	return NULL;
971 }
972 
973 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
974 {
975 	int i = 0;
976 	int wasfull;
977 	t_key cindex, key;
978 	struct tnode *tp = NULL;
979 
980 	if(!tn)
981 		BUG();
982 
983 	key = tn->key;
984 	i = 0;
985 
986 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
987 
988 		if( i > 10 ) {
989 			printk("Rebalance tn=%p \n", tn);
990 			if(tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
991 
992 			printk("Rebalance tp=%p \n", tp);
993 			if(tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
994 		}
995 
996 		if( i > 12 ) BUG();
997 		i++;
998 
999 		tp = NODE_PARENT(tn);
1000 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1001 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1002 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
1003 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
1004 
1005 		if(!NODE_PARENT(tn))
1006 			break;
1007 
1008 		tn = NODE_PARENT(tn);
1009 	}
1010 	/* Handle last (top) tnode */
1011 	if (IS_TNODE(tn))
1012 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
1013 
1014 	return (struct node*) tn;
1015 }
1016 
1017 static  struct list_head *
1018 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1019 {
1020 	int pos, newpos;
1021 	struct tnode *tp = NULL, *tn = NULL;
1022 	struct node *n;
1023 	struct leaf *l;
1024 	int missbit;
1025 	struct list_head *fa_head=NULL;
1026 	struct leaf_info *li;
1027 	t_key cindex;
1028 
1029 	pos = 0;
1030 	n=t->trie;
1031 
1032 	/* If we point to NULL, stop. Either the tree is empty and we should
1033 	 * just put a new leaf in if, or we have reached an empty child slot,
1034 	 * and we should just put our new leaf in that.
1035 	 * If we point to a T_TNODE, check if it matches our key. Note that
1036 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1037 	 * not be the parent's 'pos'+'bits'!
1038 	 *
1039 	 * If it does match the current key, get pos/bits from it, extract
1040 	 * the index from our key, push the T_TNODE and walk the tree.
1041 	 *
1042 	 * If it doesn't, we have to replace it with a new T_TNODE.
1043 	 *
1044 	 * If we point to a T_LEAF, it might or might not have the same key
1045 	 * as we do. If it does, just change the value, update the T_LEAF's
1046 	 * value, and return it.
1047 	 * If it doesn't, we need to replace it with a T_TNODE.
1048 	 */
1049 
1050 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1051 		tn = (struct tnode *) n;
1052 
1053 		check_tnode(tn);
1054 
1055 		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1056 			tp = tn;
1057 			pos=tn->pos + tn->bits;
1058 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1059 
1060 			if(n && NODE_PARENT(n) != tn) {
1061 				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1062 				BUG();
1063 			}
1064 		}
1065 		else
1066 			break;
1067 	}
1068 
1069 	/*
1070 	 * n  ----> NULL, LEAF or TNODE
1071 	 *
1072 	 * tp is n's (parent) ----> NULL or TNODE
1073 	 */
1074 
1075 	if(tp && IS_LEAF(tp))
1076 		BUG();
1077 
1078 
1079 	/* Case 1: n is a leaf. Compare prefixes */
1080 
1081 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1082 		struct leaf *l = ( struct leaf *)  n;
1083 
1084 		li = leaf_info_new(plen);
1085 
1086 		if(! li) {
1087 			*err = -ENOMEM;
1088 			goto err;
1089 		}
1090 
1091 		fa_head = &li->falh;
1092 		insert_leaf_info(&l->list, li);
1093 		goto done;
1094 	}
1095 	t->size++;
1096 	l = leaf_new();
1097 
1098 	if(! l) {
1099 		*err = -ENOMEM;
1100 		goto err;
1101 	}
1102 
1103 	l->key = key;
1104 	li = leaf_info_new(plen);
1105 
1106 	if(! li) {
1107 		tnode_free((struct tnode *) l);
1108 		*err = -ENOMEM;
1109 		goto err;
1110 	}
1111 
1112 	fa_head = &li->falh;
1113 	insert_leaf_info(&l->list, li);
1114 
1115 	/* Case 2: n is NULL, and will just insert a new leaf */
1116 	if (t->trie && n == NULL) {
1117 
1118 		NODE_SET_PARENT(l, tp);
1119 
1120 		if (!tp)
1121 			BUG();
1122 
1123 		else {
1124 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125 			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1126 		}
1127 	}
1128 	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1129 	else {
1130 		/*
1131 		 *  Add a new tnode here
1132 		 *  first tnode need some special handling
1133 		 */
1134 
1135 		if (tp)
1136 			pos=tp->pos+tp->bits;
1137 		else
1138 			pos=0;
1139 		if(n) {
1140 			newpos = tkey_mismatch(key, pos, n->key);
1141 			tn = tnode_new(n->key, newpos, 1);
1142 		}
1143 		else {
1144 			newpos = 0;
1145 			tn = tnode_new(key, newpos, 1); /* First tnode */
1146 		}
1147 
1148 		if(!tn) {
1149 			free_leaf_info(li);
1150 			tnode_free((struct tnode *) l);
1151 			*err = -ENOMEM;
1152 			goto err;
1153 		}
1154 
1155 		NODE_SET_PARENT(tn, tp);
1156 
1157 		missbit=tkey_extract_bits(key, newpos, 1);
1158 		put_child(t, tn, missbit, (struct node *)l);
1159 		put_child(t, tn, 1-missbit, n);
1160 
1161 		if(tp) {
1162 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1164 		}
1165 		else {
1166 			t->trie = (struct node*) tn; /* First tnode */
1167 			tp = tn;
1168 		}
1169 	}
1170 	if(tp && tp->pos+tp->bits > 32) {
1171 		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1172 		       tp, tp->pos, tp->bits, key, plen);
1173 	}
1174 	/* Rebalance the trie */
1175 	t->trie = trie_rebalance(t, tp);
1176 done:
1177 	t->revision++;
1178 err:;
1179 	return fa_head;
1180 }
1181 
1182 static int
1183 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1184 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1185 {
1186 	struct trie *t = (struct trie *) tb->tb_data;
1187 	struct fib_alias *fa, *new_fa;
1188 	struct list_head *fa_head=NULL;
1189 	struct fib_info *fi;
1190 	int plen = r->rtm_dst_len;
1191 	int type = r->rtm_type;
1192 	u8 tos = r->rtm_tos;
1193 	u32 key, mask;
1194 	int err;
1195 	struct leaf *l;
1196 
1197 	if (plen > 32)
1198 		return -EINVAL;
1199 
1200 	key = 0;
1201 	if (rta->rta_dst)
1202 		memcpy(&key, rta->rta_dst, 4);
1203 
1204 	key = ntohl(key);
1205 
1206 	if(trie_debug)
1207 		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1208 
1209 	mask =  ntohl( inet_make_mask(plen) );
1210 
1211 	if(key & ~mask)
1212 		return -EINVAL;
1213 
1214 	key = key & mask;
1215 
1216 	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
1217 		goto err;
1218 
1219 	l = fib_find_node(t, key);
1220 	fa = NULL;
1221 
1222 	if(l) {
1223 		fa_head = get_fa_head(l, plen);
1224 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1225 	}
1226 
1227 	/* Now fa, if non-NULL, points to the first fib alias
1228 	 * with the same keys [prefix,tos,priority], if such key already
1229 	 * exists or to the node before which we will insert new one.
1230 	 *
1231 	 * If fa is NULL, we will need to allocate a new one and
1232 	 * insert to the head of f.
1233 	 *
1234 	 * If f is NULL, no fib node matched the destination key
1235 	 * and we need to allocate a new one of those as well.
1236 	 */
1237 
1238 	if (fa &&
1239 	    fa->fa_info->fib_priority == fi->fib_priority) {
1240 		struct fib_alias *fa_orig;
1241 
1242 		err = -EEXIST;
1243 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1244 			goto out;
1245 
1246 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1247 			struct fib_info *fi_drop;
1248 			u8 state;
1249 
1250 			write_lock_bh(&fib_lock);
1251 
1252 			fi_drop = fa->fa_info;
1253 			fa->fa_info = fi;
1254 			fa->fa_type = type;
1255 			fa->fa_scope = r->rtm_scope;
1256 			state = fa->fa_state;
1257 			fa->fa_state &= ~FA_S_ACCESSED;
1258 
1259 			write_unlock_bh(&fib_lock);
1260 
1261 			fib_release_info(fi_drop);
1262 			if (state & FA_S_ACCESSED)
1263 			  rt_cache_flush(-1);
1264 
1265 			    goto succeeded;
1266 		}
1267 		/* Error if we find a perfect match which
1268 		 * uses the same scope, type, and nexthop
1269 		 * information.
1270 		 */
1271 		fa_orig = fa;
1272 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1273 			if (fa->fa_tos != tos)
1274 				break;
1275 			if (fa->fa_info->fib_priority != fi->fib_priority)
1276 				break;
1277 			if (fa->fa_type == type &&
1278 			    fa->fa_scope == r->rtm_scope &&
1279 			    fa->fa_info == fi) {
1280 				goto out;
1281 			}
1282 		}
1283 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1284 			fa = fa_orig;
1285 	}
1286 	err = -ENOENT;
1287 	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
1288 		goto out;
1289 
1290 	err = -ENOBUFS;
1291 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1292 	if (new_fa == NULL)
1293 		goto out;
1294 
1295 	new_fa->fa_info = fi;
1296 	new_fa->fa_tos = tos;
1297 	new_fa->fa_type = type;
1298 	new_fa->fa_scope = r->rtm_scope;
1299 	new_fa->fa_state = 0;
1300 #if 0
1301 	new_fa->dst  = NULL;
1302 #endif
1303 	/*
1304 	 * Insert new entry to the list.
1305 	 */
1306 
1307 	if(!fa_head) {
1308 		fa_head = fib_insert_node(t, &err, key, plen);
1309 		err = 0;
1310 		if(err)
1311 			goto out_free_new_fa;
1312 	}
1313 
1314 	write_lock_bh(&fib_lock);
1315 
1316 	list_add_tail(&new_fa->fa_list,
1317 		 (fa ? &fa->fa_list : fa_head));
1318 
1319 	write_unlock_bh(&fib_lock);
1320 
1321 	rt_cache_flush(-1);
1322 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1323 succeeded:
1324 	return 0;
1325 
1326 out_free_new_fa:
1327 	kmem_cache_free(fn_alias_kmem, new_fa);
1328 out:
1329 	fib_release_info(fi);
1330 err:;
1331 	return err;
1332 }
1333 
1334 static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
1335 			     struct fib_result *res, int *err)
1336 {
1337 	int i;
1338 	t_key mask;
1339 	struct leaf_info *li;
1340 	struct hlist_head *hhead = &l->list;
1341 	struct hlist_node *node;
1342 
1343 	hlist_for_each_entry(li, node, hhead, hlist) {
1344 
1345 		i = li->plen;
1346 		mask = ntohl(inet_make_mask(i));
1347 		if (l->key != (key & mask))
1348 			continue;
1349 
1350 		if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
1351 			*plen = i;
1352 #ifdef CONFIG_IP_FIB_TRIE_STATS
1353 			t->stats.semantic_match_passed++;
1354 #endif
1355 			return 1;
1356 		}
1357 #ifdef CONFIG_IP_FIB_TRIE_STATS
1358 		t->stats.semantic_match_miss++;
1359 #endif
1360 	}
1361 	return 0;
1362 }
1363 
1364 static int
1365 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1366 {
1367 	struct trie *t = (struct trie *) tb->tb_data;
1368 	int plen, ret = 0;
1369 	struct node *n;
1370 	struct tnode *pn;
1371 	int pos, bits;
1372 	t_key key=ntohl(flp->fl4_dst);
1373 	int chopped_off;
1374 	t_key cindex = 0;
1375 	int current_prefix_length = KEYLENGTH;
1376 	n = t->trie;
1377 
1378 	read_lock(&fib_lock);
1379 	if(!n)
1380 		goto failed;
1381 
1382 #ifdef CONFIG_IP_FIB_TRIE_STATS
1383 	t->stats.gets++;
1384 #endif
1385 
1386 	/* Just a leaf? */
1387 	if (IS_LEAF(n)) {
1388 		if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
1389 			goto found;
1390 		goto failed;
1391 	}
1392 	pn = (struct tnode *) n;
1393 	chopped_off = 0;
1394 
1395         while (pn) {
1396 
1397 		pos = pn->pos;
1398 		bits = pn->bits;
1399 
1400 		if(!chopped_off)
1401 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1402 
1403 		n = tnode_get_child(pn, cindex);
1404 
1405 		if (n == NULL) {
1406 #ifdef CONFIG_IP_FIB_TRIE_STATS
1407 			t->stats.null_node_hit++;
1408 #endif
1409 			goto backtrace;
1410 		}
1411 
1412 		if (IS_TNODE(n)) {
1413 #define HL_OPTIMIZE
1414 #ifdef HL_OPTIMIZE
1415 			struct tnode *cn = (struct tnode *)n;
1416 			t_key node_prefix, key_prefix, pref_mismatch;
1417 			int mp;
1418 
1419 			/*
1420 			 * It's a tnode, and we can do some extra checks here if we
1421 			 * like, to avoid descending into a dead-end branch.
1422 			 * This tnode is in the parent's child array at index
1423 			 * key[p_pos..p_pos+p_bits] but potentially with some bits
1424 			 * chopped off, so in reality the index may be just a
1425 			 * subprefix, padded with zero at the end.
1426 			 * We can also take a look at any skipped bits in this
1427 			 * tnode - everything up to p_pos is supposed to be ok,
1428 			 * and the non-chopped bits of the index (se previous
1429 			 * paragraph) are also guaranteed ok, but the rest is
1430 			 * considered unknown.
1431 			 *
1432 			 * The skipped bits are key[pos+bits..cn->pos].
1433 			 */
1434 
1435 			/* If current_prefix_length < pos+bits, we are already doing
1436 			 * actual prefix  matching, which means everything from
1437 			 * pos+(bits-chopped_off) onward must be zero along some
1438 			 * branch of this subtree - otherwise there is *no* valid
1439 			 * prefix present. Here we can only check the skipped
1440 			 * bits. Remember, since we have already indexed into the
1441 			 * parent's child array, we know that the bits we chopped of
1442 			 * *are* zero.
1443 			 */
1444 
1445 			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1446 
1447 			if (current_prefix_length < pos+bits) {
1448 				if (tkey_extract_bits(cn->key, current_prefix_length,
1449 						      cn->pos - current_prefix_length) != 0 ||
1450 				    !(cn->child[0]))
1451 					goto backtrace;
1452 			}
1453 
1454 			/*
1455 			 * If chopped_off=0, the index is fully validated and we
1456 			 * only need to look at the skipped bits for this, the new,
1457 			 * tnode. What we actually want to do is to find out if
1458 			 * these skipped bits match our key perfectly, or if we will
1459 			 * have to count on finding a matching prefix further down,
1460 			 * because if we do, we would like to have some way of
1461 			 * verifying the existence of such a prefix at this point.
1462 			 */
1463 
1464 			/* The only thing we can do at this point is to verify that
1465 			 * any such matching prefix can indeed be a prefix to our
1466 			 * key, and if the bits in the node we are inspecting that
1467 			 * do not match our key are not ZERO, this cannot be true.
1468 			 * Thus, find out where there is a mismatch (before cn->pos)
1469 			 * and verify that all the mismatching bits are zero in the
1470 			 * new tnode's key.
1471 			 */
1472 
1473 			/* Note: We aren't very concerned about the piece of the key
1474 			 * that precede pn->pos+pn->bits, since these have already been
1475 			 * checked. The bits after cn->pos aren't checked since these are
1476 			 * by definition "unknown" at this point. Thus, what we want to
1477 			 * see is if we are about to enter the "prefix matching" state,
1478 			 * and in that case verify that the skipped bits that will prevail
1479 			 * throughout this subtree are zero, as they have to be if we are
1480 			 * to find a matching prefix.
1481 			 */
1482 
1483 			node_prefix = MASK_PFX(cn->key, cn->pos);
1484 			key_prefix =  MASK_PFX(key, cn->pos);
1485 			pref_mismatch = key_prefix^node_prefix;
1486 			mp = 0;
1487 
1488 			/* In short: If skipped bits in this node do not match the search
1489 			 * key, enter the "prefix matching" state.directly.
1490 			 */
1491 			if (pref_mismatch) {
1492 				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1493 					mp++;
1494 					pref_mismatch = pref_mismatch <<1;
1495 				}
1496 				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1497 
1498 				if (key_prefix != 0)
1499 					goto backtrace;
1500 
1501 				if (current_prefix_length >= cn->pos)
1502 					current_prefix_length=mp;
1503 		       }
1504 #endif
1505 		       pn = (struct tnode *)n; /* Descend */
1506 		       chopped_off = 0;
1507 		       continue;
1508 		}
1509 		if (IS_LEAF(n)) {
1510 			if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
1511 				goto found;
1512 	       }
1513 backtrace:
1514 		chopped_off++;
1515 
1516 		/* As zero don't change the child key (cindex) */
1517 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
1518 			chopped_off++;
1519 		}
1520 
1521 		/* Decrease current_... with bits chopped off */
1522 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1523 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1524 
1525 		/*
1526 		 * Either we do the actual chop off according or if we have
1527 		 * chopped off all bits in this tnode walk up to our parent.
1528 		 */
1529 
1530 		if(chopped_off <= pn->bits)
1531 			cindex &= ~(1 << (chopped_off-1));
1532 		else {
1533 			if( NODE_PARENT(pn) == NULL)
1534 				goto failed;
1535 
1536 			/* Get Child's index */
1537 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1538 			pn = NODE_PARENT(pn);
1539 			chopped_off = 0;
1540 
1541 #ifdef CONFIG_IP_FIB_TRIE_STATS
1542 			t->stats.backtrack++;
1543 #endif
1544 			goto backtrace;
1545 		}
1546 	}
1547 failed:
1548 	ret =  1;
1549 found:
1550 	read_unlock(&fib_lock);
1551 	return ret;
1552 }
1553 
1554 static int trie_leaf_remove(struct trie *t, t_key key)
1555 {
1556 	t_key cindex;
1557 	struct tnode *tp = NULL;
1558 	struct node *n = t->trie;
1559 	struct leaf *l;
1560 
1561 	if(trie_debug)
1562 		printk("entering trie_leaf_remove(%p)\n", n);
1563 
1564 	/* Note that in the case skipped bits, those bits are *not* checked!
1565 	 * When we finish this, we will have NULL or a T_LEAF, and the
1566 	 * T_LEAF may or may not match our key.
1567 	 */
1568 
1569         while (n != NULL && IS_TNODE(n)) {
1570 		struct tnode *tn = (struct tnode *) n;
1571 		check_tnode(tn);
1572 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1573 
1574 			if(n && NODE_PARENT(n) != tn) {
1575 				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1576 				BUG();
1577 			}
1578         }
1579 	l = (struct leaf *) n;
1580 
1581 	if(!n || !tkey_equals(l->key, key))
1582 		return 0;
1583 
1584 	/*
1585 	 * Key found.
1586 	 * Remove the leaf and rebalance the tree
1587 	 */
1588 
1589 	t->revision++;
1590 	t->size--;
1591 
1592 	tp = NODE_PARENT(n);
1593 	tnode_free((struct tnode *) n);
1594 
1595 	if(tp) {
1596 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1597 		put_child(t, (struct tnode *)tp, cindex, NULL);
1598 		t->trie = trie_rebalance(t, tp);
1599 	}
1600 	else
1601 		t->trie = NULL;
1602 
1603 	return 1;
1604 }
1605 
1606 static int
1607 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1608 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1609 {
1610 	struct trie *t = (struct trie *) tb->tb_data;
1611 	u32 key, mask;
1612 	int plen = r->rtm_dst_len;
1613 	u8 tos = r->rtm_tos;
1614 	struct fib_alias *fa, *fa_to_delete;
1615 	struct list_head *fa_head;
1616 	struct leaf *l;
1617 
1618 	if (plen > 32)
1619 		return -EINVAL;
1620 
1621 	key = 0;
1622 	if (rta->rta_dst)
1623 		memcpy(&key, rta->rta_dst, 4);
1624 
1625 	key = ntohl(key);
1626 	mask =  ntohl( inet_make_mask(plen) );
1627 
1628 	if(key & ~mask)
1629 		return -EINVAL;
1630 
1631 	key = key & mask;
1632 	l = fib_find_node(t, key);
1633 
1634 	if(!l)
1635 		return -ESRCH;
1636 
1637 	fa_head = get_fa_head(l, plen);
1638 	fa = fib_find_alias(fa_head, tos, 0);
1639 
1640 	if (!fa)
1641 		return -ESRCH;
1642 
1643 	if (trie_debug)
1644 		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1645 
1646 	fa_to_delete = NULL;
1647 	fa_head = fa->fa_list.prev;
1648 	list_for_each_entry(fa, fa_head, fa_list) {
1649 		struct fib_info *fi = fa->fa_info;
1650 
1651 		if (fa->fa_tos != tos)
1652 			break;
1653 
1654 		if ((!r->rtm_type ||
1655 		     fa->fa_type == r->rtm_type) &&
1656 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1657 		     fa->fa_scope == r->rtm_scope) &&
1658 		    (!r->rtm_protocol ||
1659 		     fi->fib_protocol == r->rtm_protocol) &&
1660 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1661 			fa_to_delete = fa;
1662 			break;
1663 		}
1664 	}
1665 
1666 	if (fa_to_delete) {
1667 		int kill_li = 0;
1668 		struct leaf_info *li;
1669 
1670 		fa = fa_to_delete;
1671 		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1672 
1673 		l = fib_find_node(t, key);
1674 		li = find_leaf_info(&l->list, plen);
1675 
1676 		write_lock_bh(&fib_lock);
1677 
1678 		list_del(&fa->fa_list);
1679 
1680 		if(list_empty(fa_head)) {
1681 			hlist_del(&li->hlist);
1682 			kill_li = 1;
1683 		}
1684 		write_unlock_bh(&fib_lock);
1685 
1686 		if(kill_li)
1687 			free_leaf_info(li);
1688 
1689 		if(hlist_empty(&l->list))
1690 			trie_leaf_remove(t, key);
1691 
1692 		if (fa->fa_state & FA_S_ACCESSED)
1693 			rt_cache_flush(-1);
1694 
1695 		fn_free_alias(fa);
1696 		return 0;
1697 	}
1698 	return -ESRCH;
1699 }
1700 
1701 static int trie_flush_list(struct trie *t, struct list_head *head)
1702 {
1703 	struct fib_alias *fa, *fa_node;
1704 	int found = 0;
1705 
1706 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1707 		struct fib_info *fi = fa->fa_info;
1708 
1709 		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1710 
1711  			write_lock_bh(&fib_lock);
1712 			list_del(&fa->fa_list);
1713 			write_unlock_bh(&fib_lock);
1714 
1715 			fn_free_alias(fa);
1716 			found++;
1717 		}
1718 	}
1719 	return found;
1720 }
1721 
1722 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1723 {
1724 	int found = 0;
1725 	struct hlist_head *lih = &l->list;
1726 	struct hlist_node *node, *tmp;
1727 	struct leaf_info *li = NULL;
1728 
1729 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1730 
1731 		found += trie_flush_list(t, &li->falh);
1732 
1733 		if (list_empty(&li->falh)) {
1734 
1735  			write_lock_bh(&fib_lock);
1736 			hlist_del(&li->hlist);
1737 			write_unlock_bh(&fib_lock);
1738 
1739 			free_leaf_info(li);
1740 		}
1741 	}
1742 	return found;
1743 }
1744 
1745 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1746 {
1747 	struct node *c = (struct node *) thisleaf;
1748 	struct tnode *p;
1749 	int idx;
1750 
1751 	if(c == NULL) {
1752 		if(t->trie == NULL)
1753 			return NULL;
1754 
1755 		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
1756 			return (struct leaf *) t->trie;
1757 
1758 		p = (struct tnode*) t->trie;  /* Start */
1759 	}
1760 	else
1761 		p = (struct tnode *) NODE_PARENT(c);
1762 	while (p) {
1763 		int pos, last;
1764 
1765 		/*  Find the next child of the parent */
1766 		if(c)
1767 			pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1768 		else
1769 			pos = 0;
1770 
1771 		last = 1 << p->bits;
1772 		for(idx = pos; idx < last ; idx++) {
1773 			if( p->child[idx]) {
1774 
1775 				/* Decend if tnode */
1776 
1777 				while (IS_TNODE(p->child[idx])) {
1778 					p = (struct tnode*) p->child[idx];
1779 					idx = 0;
1780 
1781 					/* Rightmost non-NULL branch */
1782 					if( p && IS_TNODE(p) )
1783 						while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
1784 
1785 					/* Done with this tnode? */
1786 					if( idx >= (1 << p->bits) || p->child[idx] == NULL )
1787 						goto up;
1788 				}
1789 				return (struct leaf*) p->child[idx];
1790 			}
1791 		}
1792 up:
1793 		/* No more children go up one step  */
1794 		c = (struct node*) p;
1795 		p = (struct tnode *) NODE_PARENT(p);
1796 	}
1797 	return NULL; /* Ready. Root of trie */
1798 }
1799 
1800 static int fn_trie_flush(struct fib_table *tb)
1801 {
1802 	struct trie *t = (struct trie *) tb->tb_data;
1803 	struct leaf *ll = NULL, *l = NULL;
1804 	int found = 0, h;
1805 
1806 	t->revision++;
1807 
1808 	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1809 		found += trie_flush_leaf(t, l);
1810 
1811 		if (ll && hlist_empty(&ll->list))
1812 			trie_leaf_remove(t, ll->key);
1813 		ll = l;
1814 	}
1815 
1816 	if (ll && hlist_empty(&ll->list))
1817 		trie_leaf_remove(t, ll->key);
1818 
1819 	if(trie_debug)
1820 		printk("trie_flush found=%d\n", found);
1821 	return found;
1822 }
1823 
1824 static int trie_last_dflt=-1;
1825 
1826 static void
1827 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1828 {
1829 	struct trie *t = (struct trie *) tb->tb_data;
1830 	int order, last_idx;
1831 	struct fib_info *fi = NULL;
1832 	struct fib_info *last_resort;
1833 	struct fib_alias *fa = NULL;
1834 	struct list_head *fa_head;
1835 	struct leaf *l;
1836 
1837 	last_idx = -1;
1838 	last_resort = NULL;
1839 	order = -1;
1840 
1841 	read_lock(&fib_lock);
1842 
1843 	l = fib_find_node(t, 0);
1844 	if(!l)
1845 		goto out;
1846 
1847 	fa_head = get_fa_head(l, 0);
1848 	if(!fa_head)
1849 		goto out;
1850 
1851 	if (list_empty(fa_head))
1852 		goto out;
1853 
1854 	list_for_each_entry(fa, fa_head, fa_list) {
1855 		struct fib_info *next_fi = fa->fa_info;
1856 
1857 		if (fa->fa_scope != res->scope ||
1858 		    fa->fa_type != RTN_UNICAST)
1859 			continue;
1860 
1861 		if (next_fi->fib_priority > res->fi->fib_priority)
1862 			break;
1863 		if (!next_fi->fib_nh[0].nh_gw ||
1864 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1865 			continue;
1866 		fa->fa_state |= FA_S_ACCESSED;
1867 
1868 		if (fi == NULL) {
1869 			if (next_fi != res->fi)
1870 				break;
1871 		} else if (!fib_detect_death(fi, order, &last_resort,
1872 					     &last_idx, &trie_last_dflt)) {
1873 			if (res->fi)
1874 				fib_info_put(res->fi);
1875 			res->fi = fi;
1876 			atomic_inc(&fi->fib_clntref);
1877 			trie_last_dflt = order;
1878 			goto out;
1879 		}
1880 		fi = next_fi;
1881 		order++;
1882 	}
1883 	if (order <= 0 || fi == NULL) {
1884 		trie_last_dflt = -1;
1885 		goto out;
1886 	}
1887 
1888 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1889 		if (res->fi)
1890 			fib_info_put(res->fi);
1891 		res->fi = fi;
1892 		atomic_inc(&fi->fib_clntref);
1893 		trie_last_dflt = order;
1894 		goto out;
1895 	}
1896 	if (last_idx >= 0) {
1897 		if (res->fi)
1898 			fib_info_put(res->fi);
1899 		res->fi = last_resort;
1900 		if (last_resort)
1901 			atomic_inc(&last_resort->fib_clntref);
1902 	}
1903 	trie_last_dflt = last_idx;
1904  out:;
1905 	read_unlock(&fib_lock);
1906 }
1907 
1908 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1909 			   struct sk_buff *skb, struct netlink_callback *cb)
1910 {
1911 	int i, s_i;
1912 	struct fib_alias *fa;
1913 
1914 	u32 xkey=htonl(key);
1915 
1916 	s_i=cb->args[3];
1917 	i = 0;
1918 
1919 	list_for_each_entry(fa, fah, fa_list) {
1920 		if (i < s_i) {
1921 			i++;
1922 			continue;
1923 		}
1924 		if (fa->fa_info->fib_nh == NULL) {
1925 			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1926 			i++;
1927 			continue;
1928 		}
1929 		if (fa->fa_info == NULL) {
1930 			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1931 			i++;
1932 			continue;
1933 		}
1934 
1935 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1936 				  cb->nlh->nlmsg_seq,
1937 				  RTM_NEWROUTE,
1938 				  tb->tb_id,
1939 				  fa->fa_type,
1940 				  fa->fa_scope,
1941 				  &xkey,
1942 				  plen,
1943 				  fa->fa_tos,
1944 				  fa->fa_info, 0) < 0) {
1945 			cb->args[3] = i;
1946 			return -1;
1947 			}
1948 		i++;
1949 	}
1950 	cb->args[3]=i;
1951 	return skb->len;
1952 }
1953 
1954 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1955 			     struct netlink_callback *cb)
1956 {
1957 	int h, s_h;
1958 	struct list_head *fa_head;
1959 	struct leaf *l = NULL;
1960 	s_h=cb->args[2];
1961 
1962 	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
1963 
1964 		if (h < s_h)
1965 			continue;
1966 		if (h > s_h)
1967 			memset(&cb->args[3], 0,
1968 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1969 
1970 		fa_head = get_fa_head(l, plen);
1971 
1972 		if(!fa_head)
1973 			continue;
1974 
1975 		if(list_empty(fa_head))
1976 			continue;
1977 
1978 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1979 			cb->args[2]=h;
1980 			return -1;
1981 		}
1982 	}
1983 	cb->args[2]=h;
1984 	return skb->len;
1985 }
1986 
1987 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1988 {
1989 	int m, s_m;
1990 	struct trie *t = (struct trie *) tb->tb_data;
1991 
1992 	s_m = cb->args[1];
1993 
1994 	read_lock(&fib_lock);
1995 	for (m=0; m<=32; m++) {
1996 
1997 		if (m < s_m)
1998 			continue;
1999 		if (m > s_m)
2000 			memset(&cb->args[2], 0,
2001 			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
2002 
2003 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
2004 			cb->args[1] = m;
2005 			goto out;
2006 		}
2007 	}
2008 	read_unlock(&fib_lock);
2009 	cb->args[1] = m;
2010 	return skb->len;
2011  out:
2012 	read_unlock(&fib_lock);
2013 	return -1;
2014 }
2015 
2016 /* Fix more generic FIB names for init later */
2017 
2018 #ifdef CONFIG_IP_MULTIPLE_TABLES
2019 struct fib_table * fib_hash_init(int id)
2020 #else
2021 struct fib_table * __init fib_hash_init(int id)
2022 #endif
2023 {
2024 	struct fib_table *tb;
2025 	struct trie *t;
2026 
2027 	if (fn_alias_kmem == NULL)
2028 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2029 						  sizeof(struct fib_alias),
2030 						  0, SLAB_HWCACHE_ALIGN,
2031 						  NULL, NULL);
2032 
2033 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2034 		     GFP_KERNEL);
2035 	if (tb == NULL)
2036 		return NULL;
2037 
2038 	tb->tb_id = id;
2039 	tb->tb_lookup = fn_trie_lookup;
2040 	tb->tb_insert = fn_trie_insert;
2041 	tb->tb_delete = fn_trie_delete;
2042 	tb->tb_flush = fn_trie_flush;
2043 	tb->tb_select_default = fn_trie_select_default;
2044 	tb->tb_dump = fn_trie_dump;
2045 	memset(tb->tb_data, 0, sizeof(struct trie));
2046 
2047 	t = (struct trie *) tb->tb_data;
2048 
2049 	trie_init(t);
2050 
2051 	if (id == RT_TABLE_LOCAL)
2052                 trie_local=t;
2053 	  else if (id == RT_TABLE_MAIN)
2054                 trie_main=t;
2055 
2056 	if (id == RT_TABLE_LOCAL)
2057 		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2058 
2059 	return tb;
2060 }
2061 
2062 /* Trie dump functions */
2063 
2064 static void putspace_seq(struct seq_file *seq, int n)
2065 {
2066 	while (n--) seq_printf(seq, " ");
2067 }
2068 
2069 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2070 {
2071 	while (bits--)
2072 		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2073 }
2074 
2075 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2076 		   int pend, int cindex, int bits)
2077 {
2078 	putspace_seq(seq, indent);
2079 	if (IS_LEAF(n))
2080 		seq_printf(seq, "|");
2081 	else
2082 		seq_printf(seq, "+");
2083 	if (bits) {
2084 		seq_printf(seq, "%d/", cindex);
2085 		printbin_seq(seq, cindex, bits);
2086 		seq_printf(seq, ": ");
2087 	}
2088 	else
2089 		seq_printf(seq, "<root>: ");
2090 	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2091 
2092 	if (IS_LEAF(n))
2093 		seq_printf(seq, "key=%d.%d.%d.%d\n",
2094 			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2095 	else {
2096 		int plen=((struct tnode *)n)->pos;
2097 		t_key prf=MASK_PFX(n->key, plen);
2098 		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2099 			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2100 	}
2101 	if (IS_LEAF(n)) {
2102 		struct leaf *l=(struct leaf *)n;
2103 		struct fib_alias *fa;
2104 		int i;
2105 		for (i=32; i>=0; i--)
2106 		  if(find_leaf_info(&l->list, i)) {
2107 
2108 				struct list_head *fa_head = get_fa_head(l, i);
2109 
2110 				if(!fa_head)
2111 					continue;
2112 
2113 				if(list_empty(fa_head))
2114 					continue;
2115 
2116 				putspace_seq(seq, indent+2);
2117 				seq_printf(seq, "{/%d...dumping}\n", i);
2118 
2119 
2120 				list_for_each_entry(fa, fa_head, fa_list) {
2121 					putspace_seq(seq, indent+2);
2122 					if (fa->fa_info->fib_nh == NULL) {
2123 						seq_printf(seq, "Error _fib_nh=NULL\n");
2124 						continue;
2125 					}
2126 					if (fa->fa_info == NULL) {
2127 						seq_printf(seq, "Error fa_info=NULL\n");
2128 						continue;
2129 					}
2130 
2131 					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2132 					      fa->fa_type,
2133 					      fa->fa_scope,
2134 					      fa->fa_tos);
2135 				}
2136 			}
2137 	}
2138 	else if (IS_TNODE(n)) {
2139 		struct tnode *tn=(struct tnode *)n;
2140 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2141 		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
2142 		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2143 		seq_printf(seq, "}\n");
2144 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2145 		seq_printf(seq, "{pos=%d", tn->pos);
2146 		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2147 		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2148 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2149 		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2150 	}
2151 }
2152 
2153 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2154 {
2155 	struct node *n=t->trie;
2156 	int cindex=0;
2157 	int indent=1;
2158 	int pend=0;
2159 	int depth = 0;
2160 
2161   	read_lock(&fib_lock);
2162 
2163 	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2164 	if (n) {
2165 		printnode_seq(seq, indent, n, pend, cindex, 0);
2166 		if (IS_TNODE(n)) {
2167 			struct tnode *tn=(struct tnode *)n;
2168 			pend = tn->pos+tn->bits;
2169 			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2170 			indent += 3;
2171 			depth++;
2172 
2173 			while (tn && cindex < (1 << tn->bits)) {
2174 				if (tn->child[cindex]) {
2175 
2176 					/* Got a child */
2177 
2178 					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2179 					if (IS_LEAF(tn->child[cindex])) {
2180 						cindex++;
2181 
2182 					}
2183 					else {
2184 						/*
2185 						 * New tnode. Decend one level
2186 						 */
2187 
2188 						depth++;
2189 						n=tn->child[cindex];
2190 						tn=(struct tnode *)n;
2191 						pend=tn->pos+tn->bits;
2192 						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2193 						indent+=3;
2194 						cindex=0;
2195 					}
2196 				}
2197 				else
2198 					cindex++;
2199 
2200 				/*
2201 				 * Test if we are done
2202 				 */
2203 
2204 				while (cindex >= (1 << tn->bits)) {
2205 
2206 					/*
2207 					 * Move upwards and test for root
2208 					 * pop off all traversed  nodes
2209 					 */
2210 
2211 					if (NODE_PARENT(tn) == NULL) {
2212 						tn = NULL;
2213 						n = NULL;
2214 						break;
2215 					}
2216 					else {
2217 						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2218 						tn = NODE_PARENT(tn);
2219 						cindex++;
2220 						n=(struct node *)tn;
2221 						pend=tn->pos+tn->bits;
2222 						indent-=3;
2223 						depth--;
2224 					}
2225 				}
2226 			}
2227 		}
2228 		else n = NULL;
2229 	}
2230 	else seq_printf(seq, "------ trie is empty\n");
2231 
2232   	read_unlock(&fib_lock);
2233 }
2234 
2235 static struct trie_stat *trie_stat_new(void)
2236 {
2237 	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2238 	int i;
2239 
2240 	if(s) {
2241 		s->totdepth = 0;
2242 		s->maxdepth = 0;
2243 		s->tnodes = 0;
2244 		s->leaves = 0;
2245 		s->nullpointers = 0;
2246 
2247 		for(i=0; i< MAX_CHILDS; i++)
2248 			s->nodesizes[i] = 0;
2249 	}
2250 	return s;
2251 }
2252 
2253 static struct trie_stat *trie_collect_stats(struct trie *t)
2254 {
2255 	struct node *n=t->trie;
2256 	struct trie_stat *s = trie_stat_new();
2257 	int cindex = 0;
2258 	int indent = 1;
2259 	int pend = 0;
2260 	int depth = 0;
2261 
2262 	read_lock(&fib_lock);
2263 
2264 	if (s) {
2265 		if (n) {
2266 			if (IS_TNODE(n)) {
2267 				struct tnode *tn = (struct tnode *)n;
2268 				pend=tn->pos+tn->bits;
2269 				indent += 3;
2270 				s->nodesizes[tn->bits]++;
2271 				depth++;
2272 
2273 				while (tn && cindex < (1 << tn->bits)) {
2274 					if (tn->child[cindex]) {
2275 						/* Got a child */
2276 
2277 						if (IS_LEAF(tn->child[cindex])) {
2278 							cindex++;
2279 
2280 							/* stats */
2281 							if (depth > s->maxdepth)
2282 								s->maxdepth = depth;
2283 							s->totdepth += depth;
2284 							s->leaves++;
2285 						}
2286 
2287 						else {
2288 							/*
2289 							 * New tnode. Decend one level
2290 							 */
2291 
2292 							s->tnodes++;
2293 							s->nodesizes[tn->bits]++;
2294 							depth++;
2295 
2296 							n = tn->child[cindex];
2297 							tn = (struct tnode *)n;
2298 							pend = tn->pos+tn->bits;
2299 
2300 							indent += 3;
2301 							cindex = 0;
2302 						}
2303 					}
2304 					else {
2305 						cindex++;
2306 						s->nullpointers++;
2307 					}
2308 
2309 					/*
2310 					 * Test if we are done
2311 					 */
2312 
2313 					while (cindex >= (1 << tn->bits)) {
2314 
2315 						/*
2316 						 * Move upwards and test for root
2317 						 * pop off all traversed  nodes
2318 						 */
2319 
2320 
2321 						if (NODE_PARENT(tn) == NULL) {
2322 							tn = NULL;
2323 							n = NULL;
2324 							break;
2325 						}
2326 						else {
2327 							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2328 							tn = NODE_PARENT(tn);
2329 							cindex++;
2330 							n = (struct node *)tn;
2331 							pend=tn->pos+tn->bits;
2332 							indent -= 3;
2333 							depth--;
2334 						}
2335  					}
2336 				}
2337 			}
2338 			else n = NULL;
2339 		}
2340 	}
2341 
2342 	read_unlock(&fib_lock);
2343 	return s;
2344 }
2345 
2346 #ifdef CONFIG_PROC_FS
2347 
2348 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2349 {
2350 	return NULL;
2351 }
2352 
2353 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2354 {
2355 	return NULL;
2356 }
2357 
2358 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2359 {
2360 	void *v = NULL;
2361 
2362 	if (ip_fib_main_table)
2363 		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
2364 	return v;
2365 }
2366 
2367 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2368 {
2369 	++*pos;
2370 	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
2371 }
2372 
2373 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2374 {
2375 
2376 }
2377 
2378 /*
2379  *	This outputs /proc/net/fib_triestats
2380  *
2381  *	It always works in backward compatibility mode.
2382  *	The format of the file is not supposed to be changed.
2383  */
2384 
2385 static void collect_and_show(struct trie *t, struct seq_file *seq)
2386 {
2387 	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2388 	int i, max, pointers;
2389         struct trie_stat *stat;
2390 	int avdepth;
2391 
2392 	stat = trie_collect_stats(t);
2393 
2394 	bytes=0;
2395 	seq_printf(seq, "trie=%p\n", t);
2396 
2397 	if (stat) {
2398 		if (stat->leaves)
2399 			avdepth=stat->totdepth*100 / stat->leaves;
2400 		else
2401 			avdepth=0;
2402 		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2403 		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2404 
2405 		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2406 		bytes += sizeof(struct leaf) * stat->leaves;
2407 		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2408 		bytes += sizeof(struct tnode) * stat->tnodes;
2409 
2410 		max = MAX_CHILDS-1;
2411 
2412 		while (max >= 0 && stat->nodesizes[max] == 0)
2413 			max--;
2414 		pointers = 0;
2415 
2416 		for (i = 1; i <= max; i++)
2417 			if (stat->nodesizes[i] != 0) {
2418 				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2419 				pointers += (1<<i) * stat->nodesizes[i];
2420 			}
2421 		seq_printf(seq, "\n");
2422 		seq_printf(seq, "Pointers: %d\n", pointers);
2423 		bytes += sizeof(struct node *) * pointers;
2424 		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2425 		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2426 
2427 		kfree(stat);
2428 	}
2429 
2430 #ifdef CONFIG_IP_FIB_TRIE_STATS
2431 	seq_printf(seq, "Counters:\n---------\n");
2432 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2433 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2434 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2435 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2436 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2438 #ifdef CLEAR_STATS
2439 	memset(&(t->stats), 0, sizeof(t->stats));
2440 #endif
2441 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2442 }
2443 
2444 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2445 {
2446 	char bf[128];
2447 
2448 	if (v == SEQ_START_TOKEN) {
2449 		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2450 			   sizeof(struct leaf), sizeof(struct tnode));
2451 		if (trie_local)
2452 			collect_and_show(trie_local, seq);
2453 
2454 		if (trie_main)
2455 			collect_and_show(trie_main, seq);
2456 	}
2457 	else {
2458 		snprintf(bf, sizeof(bf),
2459 			 "*\t%08X\t%08X", 200, 400);
2460 
2461 		seq_printf(seq, "%-127s\n", bf);
2462 	}
2463 	return 0;
2464 }
2465 
2466 static struct seq_operations fib_triestat_seq_ops = {
2467 	.start  = fib_triestat_seq_start,
2468 	.next   = fib_triestat_seq_next,
2469 	.stop   = fib_triestat_seq_stop,
2470 	.show   = fib_triestat_seq_show,
2471 };
2472 
2473 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2474 {
2475 	struct seq_file *seq;
2476 	int rc = -ENOMEM;
2477 
2478 	rc = seq_open(file, &fib_triestat_seq_ops);
2479 	if (rc)
2480 		goto out_kfree;
2481 
2482 	seq	     = file->private_data;
2483 out:
2484 	return rc;
2485 out_kfree:
2486 	goto out;
2487 }
2488 
2489 static struct file_operations fib_triestat_seq_fops = {
2490 	.owner		= THIS_MODULE,
2491 	.open           = fib_triestat_seq_open,
2492 	.read           = seq_read,
2493 	.llseek         = seq_lseek,
2494 	.release	= seq_release_private,
2495 };
2496 
2497 int __init fib_stat_proc_init(void)
2498 {
2499 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2500 		return -ENOMEM;
2501 	return 0;
2502 }
2503 
2504 void __init fib_stat_proc_exit(void)
2505 {
2506 	proc_net_remove("fib_triestat");
2507 }
2508 
2509 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2510 {
2511 	return NULL;
2512 }
2513 
2514 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2515 {
2516 	return NULL;
2517 }
2518 
2519 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2520 {
2521 	void *v = NULL;
2522 
2523 	if (ip_fib_main_table)
2524 		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
2525 	return v;
2526 }
2527 
2528 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529 {
2530 	++*pos;
2531 	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
2532 }
2533 
2534 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2535 {
2536 
2537 }
2538 
2539 /*
2540  *	This outputs /proc/net/fib_trie.
2541  *
2542  *	It always works in backward compatibility mode.
2543  *	The format of the file is not supposed to be changed.
2544  */
2545 
2546 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2547 {
2548 	char bf[128];
2549 
2550 	if (v == SEQ_START_TOKEN) {
2551 		if (trie_local)
2552 			trie_dump_seq(seq, trie_local);
2553 
2554 		if (trie_main)
2555 			trie_dump_seq(seq, trie_main);
2556 	}
2557 
2558 	else {
2559 		snprintf(bf, sizeof(bf),
2560 			 "*\t%08X\t%08X", 200, 400);
2561 		seq_printf(seq, "%-127s\n", bf);
2562 	}
2563 
2564 	return 0;
2565 }
2566 
2567 static struct seq_operations fib_trie_seq_ops = {
2568 	.start  = fib_trie_seq_start,
2569 	.next   = fib_trie_seq_next,
2570 	.stop   = fib_trie_seq_stop,
2571 	.show   = fib_trie_seq_show,
2572 };
2573 
2574 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2575 {
2576 	struct seq_file *seq;
2577 	int rc = -ENOMEM;
2578 
2579 	rc = seq_open(file, &fib_trie_seq_ops);
2580 	if (rc)
2581 		goto out_kfree;
2582 
2583 	seq	     = file->private_data;
2584 out:
2585 	return rc;
2586 out_kfree:
2587 	goto out;
2588 }
2589 
2590 static struct file_operations fib_trie_seq_fops = {
2591 	.owner		= THIS_MODULE,
2592 	.open           = fib_trie_seq_open,
2593 	.read           = seq_read,
2594 	.llseek         = seq_lseek,
2595 	.release	= seq_release_private,
2596 };
2597 
2598 int __init fib_proc_init(void)
2599 {
2600 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2601 		return -ENOMEM;
2602 	return 0;
2603 }
2604 
2605 void __init fib_proc_exit(void)
2606 {
2607 	proc_net_remove("fib_trie");
2608 }
2609 
2610 #endif /* CONFIG_PROC_FS */
2611