xref: /linux/net/ipv4/fib_trie.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work come from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.407"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/sched.h>
61 #include <linux/mm.h>
62 #include <linux/string.h>
63 #include <linux/socket.h>
64 #include <linux/sockios.h>
65 #include <linux/errno.h>
66 #include <linux/in.h>
67 #include <linux/inet.h>
68 #include <linux/inetdevice.h>
69 #include <linux/netdevice.h>
70 #include <linux/if_arp.h>
71 #include <linux/proc_fs.h>
72 #include <linux/rcupdate.h>
73 #include <linux/skbuff.h>
74 #include <linux/netlink.h>
75 #include <linux/init.h>
76 #include <linux/list.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #undef CONFIG_IP_FIB_TRIE_STATS
86 #define MAX_STAT_DEPTH 32
87 
88 #define KEYLENGTH (8*sizeof(t_key))
89 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
90 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
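
/*
 * Worked example for the two mask macros above (purely illustrative
 * values; t_key is the 32-bit host-order key used throughout this file,
 * bit 0 being the most significant bit):
 *
 *	MASK_PFX(0xC0A80102, 24) == 0xC0A80100
 *		keep the top 24 bits of 192.168.1.2, clear the rest
 *	TKEY_GET_MASK(7, 3)      == 0x01C00000
 *		a mask of three one-bits starting at bit 7
 */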
91 
92 typedef unsigned int t_key;
93 
94 #define T_TNODE 0
95 #define T_LEAF  1
96 #define NODE_TYPE_MASK	0x1UL
97 #define NODE_PARENT(node) \
98 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
99 
100 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
101 
102 #define NODE_SET_PARENT(node, ptr)		\
103 	rcu_assign_pointer((node)->parent,	\
104 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
105 
106 #define IS_TNODE(n) (!(n->parent & T_LEAF))
107 #define IS_LEAF(n) (n->parent & T_LEAF)
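
/*
 * The low bit of 'parent' doubles as the node type tag: tnodes and
 * leaves are at least word aligned, so bit 0 of the parent value is
 * free and holds T_LEAF/T_TNODE.  A minimal sketch of the encoding
 * (an illustration of what the macros above do, ignoring the RCU
 * accessors):
 *
 *	l->parent = (unsigned long) tp | T_LEAF;	NODE_SET_PARENT()
 *	IS_LEAF(l)	-> bit 0 is set, so true
 *	NODE_PARENT(l)	-> the parent pointer with bit 0 masked off
 */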
108 
109 struct node {
110 	t_key key;
111 	unsigned long parent;
112 };
113 
114 struct leaf {
115 	t_key key;
116 	unsigned long parent;
117 	struct hlist_head list;
118 	struct rcu_head rcu;
119 };
120 
121 struct leaf_info {
122 	struct hlist_node hlist;
123 	struct rcu_head rcu;
124 	int plen;
125 	struct list_head falh;
126 };
127 
128 struct tnode {
129 	t_key key;
130 	unsigned long parent;
131 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
132 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
133 	unsigned short full_children;	/* KEYLENGTH bits needed */
134 	unsigned short empty_children;	/* KEYLENGTH bits needed */
135 	struct rcu_head rcu;
136 	struct node *child[0];
137 };
138 
139 #ifdef CONFIG_IP_FIB_TRIE_STATS
140 struct trie_use_stats {
141 	unsigned int gets;
142 	unsigned int backtrack;
143 	unsigned int semantic_match_passed;
144 	unsigned int semantic_match_miss;
145 	unsigned int null_node_hit;
146 	unsigned int resize_node_skipped;
147 };
148 #endif
149 
150 struct trie_stat {
151 	unsigned int totdepth;
152 	unsigned int maxdepth;
153 	unsigned int tnodes;
154 	unsigned int leaves;
155 	unsigned int nullpointers;
156 	unsigned int nodesizes[MAX_STAT_DEPTH];
157 };
158 
159 struct trie {
160 	struct node *trie;
161 #ifdef CONFIG_IP_FIB_TRIE_STATS
162 	struct trie_use_stats stats;
163 #endif
164 	int size;
165 	unsigned int revision;
166 };
167 
168 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
169 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
170 static struct node *resize(struct trie *t, struct tnode *tn);
171 static struct tnode *inflate(struct trie *t, struct tnode *tn);
172 static struct tnode *halve(struct trie *t, struct tnode *tn);
173 static void tnode_free(struct tnode *tn);
174 
175 static kmem_cache_t *fn_alias_kmem __read_mostly;
176 static struct trie *trie_local = NULL, *trie_main = NULL;
177 
178 
179 /* rcu_read_lock needs to be held by the caller on the read side */
180 
181 static inline struct node *tnode_get_child(struct tnode *tn, int i)
182 {
183 	BUG_ON(i >= 1 << tn->bits);
184 
185 	return rcu_dereference(tn->child[i]);
186 }
187 
188 static inline int tnode_child_length(const struct tnode *tn)
189 {
190 	return 1 << tn->bits;
191 }
192 
193 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
194 {
195 	if (offset < KEYLENGTH)
196 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
197 	else
198 		return 0;
199 }
200 
201 static inline int tkey_equals(t_key a, t_key b)
202 {
203 	return a == b;
204 }
205 
206 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
207 {
208 	if (bits == 0 || offset >= KEYLENGTH)
209 		return 1;
210 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
211 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
212 }
213 
214 static inline int tkey_mismatch(t_key a, int offset, t_key b)
215 {
216 	t_key diff = a ^ b;
217 	int i = offset;
218 
219 	if (!diff)
220 		return 0;
221 	while ((diff << i) >> (KEYLENGTH-1) == 0)
222 		i++;
223 	return i;
224 }
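
/*
 * Worked example for the bit helpers above (illustrative values; keys
 * are 32-bit host-order addresses and bit 0 is the most significant
 * bit):
 *
 *	tkey_extract_bits(0xC0A80102, 8, 4)       == 0xA
 *		bits 8..11 of 192.168.1.2 are 1010
 *	tkey_mismatch(0xC0A80102, 0, 0xC0A80002)  == 23
 *		the two keys first differ at bit 23
 */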
225 
226 /*
227   To understand this stuff, an understanding of keys and all their bits is
228   necessary. Every node in the trie has a key associated with it, but not
229   all of the bits in that key are significant.
230 
231   Consider a node 'n' and its parent 'tp'.
232 
233   If n is a leaf, every bit in its key is significant. Its presence is
234   necessitated by path compression, since during a tree traversal (when
235   searching for a leaf - unless we are doing an insertion) we will completely
236   ignore all skipped bits we encounter. Thus we need to verify, at the end of
237   a potentially successful search, that we have indeed been walking the
238   correct key path.
239 
240   Note that we can never "miss" a key that is present in the tree by
241   following the wrong path. Path compression ensures that segments of the key
242   that are the same for all keys with a given prefix are skipped, but the
243   skipped part *is* identical for each node in the subtrie below the skipped
244   bit! trie_insert() in this implementation takes care of that - note the
245   call to tkey_sub_equals() in trie_insert().
246 
247   If n is an internal node - a 'tnode' here - the various parts of its key
248   have many different meanings.
249 
250   Example:
251   _________________________________________________________________
252   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
253   -----------------------------------------------------------------
254     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
255 
256   _________________________________________________________________
257   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
258   -----------------------------------------------------------------
259    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
260 
261   tp->pos = 7
262   tp->bits = 3
263   n->pos = 15
264   n->bits = 4
265 
266   First, let's just ignore the bits that come before the parent tp, that is
267   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
268   not use them for anything.
269 
270   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
271   index into the parent's child array. That is, they will be used to find
272   'n' among tp's children.
273 
274   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
275   for the node n.
276 
277   All the bits we have seen so far are significant to the node n. The rest
278   of the bits are really not needed or indeed known in n->key.
279 
280   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
281   n's child array, and will of course be different for each child.
282 
283 
284   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
285   at this point.
286 
287 */
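
/*
 * The example above, expressed with the helper functions (a sketch
 * using the same tp->pos = 7, tp->bits = 3, n->pos = 15, n->bits = 4):
 *
 *	n sits in tp's child array at index
 *		tkey_extract_bits(n->key, 7, 3)		the "N" bits
 *	the skipped ("S") bits of a search key are checked with
 *		tkey_sub_equals(n->key, 10, 5, key)	bits 10..14
 *	and descending from n uses the index
 *		tkey_extract_bits(key, 15, 4)		the "C" bits
 */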
288 
289 static inline void check_tnode(const struct tnode *tn)
290 {
291 	WARN_ON(tn && tn->pos+tn->bits > 32);
292 }
293 
294 static int halve_threshold = 25;
295 static int inflate_threshold = 50;
296 static int halve_threshold_root = 15;
297 static int inflate_threshold_root = 25;
298 
299 
300 static void __alias_free_mem(struct rcu_head *head)
301 {
302 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
303 	kmem_cache_free(fn_alias_kmem, fa);
304 }
305 
306 static inline void alias_free_mem_rcu(struct fib_alias *fa)
307 {
308 	call_rcu(&fa->rcu, __alias_free_mem);
309 }
310 
311 static void __leaf_free_rcu(struct rcu_head *head)
312 {
313 	kfree(container_of(head, struct leaf, rcu));
314 }
315 
316 static void __leaf_info_free_rcu(struct rcu_head *head)
317 {
318 	kfree(container_of(head, struct leaf_info, rcu));
319 }
320 
321 static inline void free_leaf_info(struct leaf_info *leaf)
322 {
323 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
324 }
325 
326 static struct tnode *tnode_alloc(unsigned int size)
327 {
328 	struct page *pages;
329 
330 	if (size <= PAGE_SIZE)
331 		return kcalloc(size, 1, GFP_KERNEL);
332 
333 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
334 	if (!pages)
335 		return NULL;
336 
337 	return page_address(pages);
338 }
339 
340 static void __tnode_free_rcu(struct rcu_head *head)
341 {
342 	struct tnode *tn = container_of(head, struct tnode, rcu);
343 	unsigned int size = sizeof(struct tnode) +
344 		(1 << tn->bits) * sizeof(struct node *);
345 
346 	if (size <= PAGE_SIZE)
347 		kfree(tn);
348 	else
349 		free_pages((unsigned long)tn, get_order(size));
350 }
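
/*
 * Size example for the allocation pair above (illustrative; assumes
 * 4 KiB pages and 8-byte pointers):
 *
 *	bits = 6 -> header + 64 * 8 bytes of children  -> kcalloc()
 *	bits = 9 -> header + 512 * 8 bytes > PAGE_SIZE -> alloc_pages()
 *
 * __tnode_free_rcu() recomputes the same size from tn->bits, so the
 * matching free path (kfree vs free_pages) is always taken.
 */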
351 
352 static inline void tnode_free(struct tnode *tn)
353 {
354 	if (IS_LEAF(tn)) {
355 		struct leaf *l = (struct leaf *) tn;
356 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
357 	}
358 	else
359 		call_rcu(&tn->rcu, __tnode_free_rcu);
360 }
361 
362 static struct leaf *leaf_new(void)
363 {
364 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
365 	if (l) {
366 		l->parent = T_LEAF;
367 		INIT_HLIST_HEAD(&l->list);
368 	}
369 	return l;
370 }
371 
372 static struct leaf_info *leaf_info_new(int plen)
373 {
374 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
375 	if (li) {
376 		li->plen = plen;
377 		INIT_LIST_HEAD(&li->falh);
378 	}
379 	return li;
380 }
381 
382 static struct tnode* tnode_new(t_key key, int pos, int bits)
383 {
384 	int nchildren = 1<<bits;
385 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
386 	struct tnode *tn = tnode_alloc(sz);
387 
388 	if (tn) {
389 		memset(tn, 0, sz);
390 		tn->parent = T_TNODE;
391 		tn->pos = pos;
392 		tn->bits = bits;
393 		tn->key = key;
394 		tn->full_children = 0;
395 		tn->empty_children = 1<<bits;
396 	}
397 
398 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
399 		 (unsigned int) (sizeof(struct node) * 1<<bits));
400 	return tn;
401 }
402 
403 /*
404  * Check whether a tnode 'n' is "full", i.e. it is an internal node
405  * and no bits are skipped. See discussion in dyntree paper p. 6
406  */
407 
408 static inline int tnode_full(const struct tnode *tn, const struct node *n)
409 {
410 	if (n == NULL || IS_LEAF(n))
411 		return 0;
412 
413 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
414 }
415 
416 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
417 {
418 	tnode_put_child_reorg(tn, i, n, -1);
419 }
420 
421  /*
422   * Add a child at position i overwriting the old value.
423   * Update the value of full_children and empty_children.
424   */
425 
426 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
427 {
428 	struct node *chi = tn->child[i];
429 	int isfull;
430 
431 	BUG_ON(i >= 1<<tn->bits);
432 
433 
434 	/* update emptyChildren */
435 	if (n == NULL && chi != NULL)
436 		tn->empty_children++;
437 	else if (n != NULL && chi == NULL)
438 		tn->empty_children--;
439 
440 	/* update fullChildren */
441 	if (wasfull == -1)
442 		wasfull = tnode_full(tn, chi);
443 
444 	isfull = tnode_full(tn, n);
445 	if (wasfull && !isfull)
446 		tn->full_children--;
447 	else if (!wasfull && isfull)
448 		tn->full_children++;
449 
450 	if (n)
451 		NODE_SET_PARENT(n, tn);
452 
453 	rcu_assign_pointer(tn->child[i], n);
454 }
455 
456 static struct node *resize(struct trie *t, struct tnode *tn)
457 {
458 	int i;
459 	int err = 0;
460 	struct tnode *old_tn;
461 	int inflate_threshold_use;
462 	int halve_threshold_use;
463 
464  	if (!tn)
465 		return NULL;
466 
467 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
468 		 tn, inflate_threshold, halve_threshold);
469 
470 	/* No children */
471 	if (tn->empty_children == tnode_child_length(tn)) {
472 		tnode_free(tn);
473 		return NULL;
474 	}
475 	/* One child */
476 	if (tn->empty_children == tnode_child_length(tn) - 1)
477 		for (i = 0; i < tnode_child_length(tn); i++) {
478 			struct node *n;
479 
480 			n = tn->child[i];
481 			if (!n)
482 				continue;
483 
484 			/* compress one level */
485 			NODE_SET_PARENT(n, NULL);
486 			tnode_free(tn);
487 			return n;
488 		}
489 	/*
490 	 * Double the node as long as the resulting node's ratio of
491 	 * non-empty children stays above the inflate threshold.
492 	 */
493 
494 	/*
495 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
496 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
497 	 * Telecommunications, page 6:
498 	 * "A node is doubled if the ratio of non-empty children to all
499 	 * children in the *doubled* node is at least 'high'."
500 	 *
501 	 * 'high' in this instance is the variable 'inflate_threshold'. It
502 	 * is expressed as a percentage, so we multiply it with
503 	 * tnode_child_length() and instead of multiplying by 2 (since the
504 	 * child array will be doubled by inflate()) and multiplying
505 	 * the left-hand side by 100 (to handle the percentage thing) we
506 	 * multiply the left-hand side by 50.
507 	 *
508 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
509 	 * - tn->empty_children is of course the number of non-null children
510 	 * in the current node. tn->full_children is the number of "full"
511 	 * children, that is non-null tnodes with a skip value of 0.
512 	 * All of those will be doubled in the resulting inflated tnode, so
513 	 * we just count them one extra time here.
514 	 *
515 	 * A clearer way to write this would be:
516 	 *
517 	 * to_be_doubled = tn->full_children;
518 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
519 	 *     tn->full_children;
520 	 *
521 	 * new_child_length = tnode_child_length(tn) * 2;
522 	 *
523 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
524 	 *      new_child_length;
525 	 * if (new_fill_factor >= inflate_threshold)
526 	 *
527 	 * ...and so on, tho it would mess up the while () loop.
528 	 *
529 	 * anyway,
530 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
531 	 *      inflate_threshold
532 	 *
533 	 * avoid a division:
534 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
535 	 *      inflate_threshold * new_child_length
536 	 *
537 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
538 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
539 	 *    tn->full_children) >= inflate_threshold * new_child_length
540 	 *
541 	 * expand new_child_length:
542 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
543 	 *    tn->full_children) >=
544 	 *      inflate_threshold * tnode_child_length(tn) * 2
545 	 *
546 	 * shorten again:
547 	 * 50 * (tn->full_children + tnode_child_length(tn) -
548 	 *    tn->empty_children) >= inflate_threshold *
549 	 *    tnode_child_length(tn)
550 	 *
551 	 */
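
	/*
	 * A worked instance of the final inequality (numbers invented
	 * purely for illustration): a tnode with bits = 4 has 16 child
	 * slots; say 4 are empty and 8 of the non-empty children are
	 * full.  Then 50 * (8 + 16 - 4) = 1000 >= 50 * 16 = 800, so
	 * the node is inflated.  With 7 empty and 5 full children the
	 * left-hand side is only 700 and the node is left as it is.
	 */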
552 
553 	check_tnode(tn);
554 
555 	/* Keep root node larger */
556 
557 	if (!tn->parent)
558 		inflate_threshold_use = inflate_threshold_root;
559 	else
560 		inflate_threshold_use = inflate_threshold;
561 
562 	err = 0;
563 	while ((tn->full_children > 0 &&
564 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
565 				inflate_threshold_use * tnode_child_length(tn))) {
566 
567 		old_tn = tn;
568 		tn = inflate(t, tn);
569 		if (IS_ERR(tn)) {
570 			tn = old_tn;
571 #ifdef CONFIG_IP_FIB_TRIE_STATS
572 			t->stats.resize_node_skipped++;
573 #endif
574 			break;
575 		}
576 	}
577 
578 	check_tnode(tn);
579 
580 	/*
581 	 * Halve the node as long as its ratio of non-empty children
582 	 * stays below the halve threshold.
583 	 */
584 
585 
586 	/* Keep root node larger */
587 
588 	if (!tn->parent)
589 		halve_threshold_use = halve_threshold_root;
590 	else
591 		halve_threshold_use = halve_threshold;
592 
593 	err = 0;
594 	while (tn->bits > 1 &&
595 	       100 * (tnode_child_length(tn) - tn->empty_children) <
596 	       halve_threshold_use * tnode_child_length(tn)) {
597 
598 		old_tn = tn;
599 		tn = halve(t, tn);
600 		if (IS_ERR(tn)) {
601 			tn = old_tn;
602 #ifdef CONFIG_IP_FIB_TRIE_STATS
603 			t->stats.resize_node_skipped++;
604 #endif
605 			break;
606 		}
607 	}
608 
609 
610 	/* Only one child remains */
611 	if (tn->empty_children == tnode_child_length(tn) - 1)
612 		for (i = 0; i < tnode_child_length(tn); i++) {
613 			struct node *n;
614 
615 			n = tn->child[i];
616 			if (!n)
617 				continue;
618 
619 			/* compress one level */
620 
621 			NODE_SET_PARENT(n, NULL);
622 			tnode_free(tn);
623 			return n;
624 		}
625 
626 	return (struct node *) tn;
627 }
628 
629 static struct tnode *inflate(struct trie *t, struct tnode *tn)
630 {
631 	struct tnode *inode;
632 	struct tnode *oldtnode = tn;
633 	int olen = tnode_child_length(tn);
634 	int i;
635 
636 	pr_debug("In inflate\n");
637 
638 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
639 
640 	if (!tn)
641 		return ERR_PTR(-ENOMEM);
642 
643 	/*
644 	 * Preallocate and store tnodes before the actual work so we
645 	 * don't get into an inconsistent state if memory allocation
646 	 * fails. In case of failure we return the old node and the
647 	 * inflate of the tnode is abandoned.
648 	 */
649 
650 	for (i = 0; i < olen; i++) {
651 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
652 
653 		if (inode &&
654 		    IS_TNODE(inode) &&
655 		    inode->pos == oldtnode->pos + oldtnode->bits &&
656 		    inode->bits > 1) {
657 			struct tnode *left, *right;
658 			t_key m = TKEY_GET_MASK(inode->pos, 1);
659 
660 			left = tnode_new(inode->key&(~m), inode->pos + 1,
661 					 inode->bits - 1);
662 			if (!left)
663 				goto nomem;
664 
665 			right = tnode_new(inode->key|m, inode->pos + 1,
666 					  inode->bits - 1);
667 
668 			if (!right) {
669 				tnode_free(left);
670 				goto nomem;
671 			}
672 
673 			put_child(t, tn, 2*i, (struct node *) left);
674 			put_child(t, tn, 2*i+1, (struct node *) right);
675 		}
676 	}
677 
678 	for (i = 0; i < olen; i++) {
679 		struct node *node = tnode_get_child(oldtnode, i);
680 		struct tnode *left, *right;
681 		int size, j;
682 
683 		/* An empty child */
684 		if (node == NULL)
685 			continue;
686 
687 		/* A leaf or an internal node with skipped bits */
688 
689 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
690 		   tn->pos + tn->bits - 1) {
691 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
692 					     1) == 0)
693 				put_child(t, tn, 2*i, node);
694 			else
695 				put_child(t, tn, 2*i+1, node);
696 			continue;
697 		}
698 
699 		/* An internal node with two children */
700 		inode = (struct tnode *) node;
701 
702 		if (inode->bits == 1) {
703 			put_child(t, tn, 2*i, inode->child[0]);
704 			put_child(t, tn, 2*i+1, inode->child[1]);
705 
706 			tnode_free(inode);
707 			continue;
708 		}
709 
710 		/* An internal node with more than two children */
711 
712 		/* We will replace this node 'inode' with two new
713 		 * ones, 'left' and 'right', each with half of the
714 		 * original children. The two new nodes will have
715 		 * a position one bit further down the key and this
716 		 * means that the "significant" part of their keys
717 		 * (see the discussion near the top of this file)
718 		 * will differ by one bit, which will be "0" in
719 		 * left's key and "1" in right's key. Since we are
720 		 * moving the key position by one step, the bit that
721 		 * we are moving away from - the bit at position
722 		 * (inode->pos) - is the one that will differ between
723 		 * left and right. So... we synthesize that bit in the
724 		 * two  new keys.
725 		 * The mask 'm' below will be a single "one" bit at
726 		 * the position (inode->pos)
727 		 */
728 
729 		/* Use the old key, but set the new significant
730 		 *   bit to zero.
731 		 */
732 
733 		left = (struct tnode *) tnode_get_child(tn, 2*i);
734 		put_child(t, tn, 2*i, NULL);
735 
736 		BUG_ON(!left);
737 
738 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
739 		put_child(t, tn, 2*i+1, NULL);
740 
741 		BUG_ON(!right);
742 
743 		size = tnode_child_length(left);
744 		for (j = 0; j < size; j++) {
745 			put_child(t, left, j, inode->child[j]);
746 			put_child(t, right, j, inode->child[j + size]);
747 		}
748 		put_child(t, tn, 2*i, resize(t, left));
749 		put_child(t, tn, 2*i+1, resize(t, right));
750 
751 		tnode_free(inode);
752 	}
753 	tnode_free(oldtnode);
754 	return tn;
755 nomem:
756 	{
757 		int size = tnode_child_length(tn);
758 		int j;
759 
760 		for (j = 0; j < size; j++)
761 			if (tn->child[j])
762 				tnode_free((struct tnode *)tn->child[j]);
763 
764 		tnode_free(tn);
765 
766 		return ERR_PTR(-ENOMEM);
767 	}
768 }
769 
770 static struct tnode *halve(struct trie *t, struct tnode *tn)
771 {
772 	struct tnode *oldtnode = tn;
773 	struct node *left, *right;
774 	int i;
775 	int olen = tnode_child_length(tn);
776 
777 	pr_debug("In halve\n");
778 
779 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
780 
781 	if (!tn)
782 		return ERR_PTR(-ENOMEM);
783 
784 	/*
785 	 * Preallocate and store tnodes before the actual work so we
786 	 * don't get into an inconsistent state if memory allocation
787 	 * fails. In case of failure we return the old node and the
788 	 * halve of the tnode is abandoned.
789 	 */
790 
791 	for (i = 0; i < olen; i += 2) {
792 		left = tnode_get_child(oldtnode, i);
793 		right = tnode_get_child(oldtnode, i+1);
794 
795 		/* Two nonempty children */
796 		if (left && right) {
797 			struct tnode *newn;
798 
799 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
800 
801 			if (!newn)
802 				goto nomem;
803 
804 			put_child(t, tn, i/2, (struct node *)newn);
805 		}
806 
807 	}
808 
809 	for (i = 0; i < olen; i += 2) {
810 		struct tnode *newBinNode;
811 
812 		left = tnode_get_child(oldtnode, i);
813 		right = tnode_get_child(oldtnode, i+1);
814 
815 		/* At least one of the children is empty */
816 		if (left == NULL) {
817 			if (right == NULL)    /* Both are empty */
818 				continue;
819 			put_child(t, tn, i/2, right);
820 			continue;
821 		}
822 
823 		if (right == NULL) {
824 			put_child(t, tn, i/2, left);
825 			continue;
826 		}
827 
828 		/* Two nonempty children */
829 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
830 		put_child(t, tn, i/2, NULL);
831 		put_child(t, newBinNode, 0, left);
832 		put_child(t, newBinNode, 1, right);
833 		put_child(t, tn, i/2, resize(t, newBinNode));
834 	}
835 	tnode_free(oldtnode);
836 	return tn;
837 nomem:
838 	{
839 		int size = tnode_child_length(tn);
840 		int j;
841 
842 		for (j = 0; j < size; j++)
843 			if (tn->child[j])
844 				tnode_free((struct tnode *)tn->child[j]);
845 
846 		tnode_free(tn);
847 
848 		return ERR_PTR(-ENOMEM);
849 	}
850 }
851 
852 static void trie_init(struct trie *t)
853 {
854 	if (!t)
855 		return;
856 
857 	t->size = 0;
858 	rcu_assign_pointer(t->trie, NULL);
859 	t->revision = 0;
860 #ifdef CONFIG_IP_FIB_TRIE_STATS
861 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
862 #endif
863 }
864 
865 /* Readers must hold rcu_read_lock; currently this is only needed by the
866    dump routines via get_fa_head and dump */
867 
868 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
869 {
870 	struct hlist_head *head = &l->list;
871 	struct hlist_node *node;
872 	struct leaf_info *li;
873 
874 	hlist_for_each_entry_rcu(li, node, head, hlist)
875 		if (li->plen == plen)
876 			return li;
877 
878 	return NULL;
879 }
880 
881 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
882 {
883 	struct leaf_info *li = find_leaf_info(l, plen);
884 
885 	if (!li)
886 		return NULL;
887 
888 	return &li->falh;
889 }
890 
891 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
892 {
893 	struct leaf_info *li = NULL, *last = NULL;
894 	struct hlist_node *node;
895 
896 	if (hlist_empty(head)) {
897 		hlist_add_head_rcu(&new->hlist, head);
898 	} else {
899 		hlist_for_each_entry(li, node, head, hlist) {
900 			if (new->plen > li->plen)
901 				break;
902 
903 			last = li;
904 		}
905 		if (last)
906 			hlist_add_after_rcu(&last->hlist, &new->hlist);
907 		else
908 			hlist_add_before_rcu(&new->hlist, &li->hlist);
909 	}
910 }
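
/*
 * insert_leaf_info() keeps the per-leaf list sorted by decreasing
 * prefix length, so readers walking it (e.g. check_leaf() further
 * down) see the longest prefix first.  A small example, with prefix
 * lengths only (illustrative):
 *
 *	insert /24, then /16, then /32  ->  list order: /32, /24, /16
 */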
911 
912 /* rcu_read_lock needs to be held by the caller on the read side */
913 
914 static struct leaf *
915 fib_find_node(struct trie *t, u32 key)
916 {
917 	int pos;
918 	struct tnode *tn;
919 	struct node *n;
920 
921 	pos = 0;
922 	n = rcu_dereference(t->trie);
923 
924 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
925 		tn = (struct tnode *) n;
926 
927 		check_tnode(tn);
928 
929 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
930 			pos = tn->pos + tn->bits;
931 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
932 		} else
933 			break;
934 	}
935 	/* Case we have found a leaf. Compare prefixes */
936 
937 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
938 		return (struct leaf *)n;
939 
940 	return NULL;
941 }
942 
943 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
944 {
945 	int wasfull;
946 	t_key cindex, key;
947 	struct tnode *tp = NULL;
948 
949 	key = tn->key;
950 
951 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
952 
953 		tp = NODE_PARENT(tn);
954 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
955 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
956 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
957 		tnode_put_child_reorg((struct tnode *)tp, cindex, (struct node *)tn, wasfull);
958 
959 		if (!NODE_PARENT(tn))
960 			break;
961 
962 		tn = NODE_PARENT(tn);
963 	}
964 	/* Handle last (top) tnode */
965 	if (IS_TNODE(tn))
966 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
967 
968 	return (struct node*) tn;
969 }
970 
971 /* only used from updater-side */
972 
973 static  struct list_head *
974 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
975 {
976 	int pos, newpos;
977 	struct tnode *tp = NULL, *tn = NULL;
978 	struct node *n;
979 	struct leaf *l;
980 	int missbit;
981 	struct list_head *fa_head = NULL;
982 	struct leaf_info *li;
983 	t_key cindex;
984 
985 	pos = 0;
986 	n = t->trie;
987 
988 	/* If we point to NULL, stop. Either the tree is empty and we should
989 	 * just put a new leaf in it, or we have reached an empty child slot,
990 	 * and we should just put our new leaf in that.
991 	 * If we point to a T_TNODE, check if it matches our key. Note that
992 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
993 	 * not be the parent's 'pos'+'bits'!
994 	 *
995 	 * If it does match the current key, get pos/bits from it, extract
996 	 * the index from our key, push the T_TNODE and walk the tree.
997 	 *
998 	 * If it doesn't, we have to replace it with a new T_TNODE.
999 	 *
1000 	 * If we point to a T_LEAF, it might or might not have the same key
1001 	 * as we do. If it does, just change the value, update the T_LEAF's
1002 	 * value, and return it.
1003 	 * If it doesn't, we need to replace it with a T_TNODE.
1004 	 */
1005 
1006 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1007 		tn = (struct tnode *) n;
1008 
1009 		check_tnode(tn);
1010 
1011 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1012 			tp = tn;
1013 			pos = tn->pos + tn->bits;
1014 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1015 
1016 			BUG_ON(n && NODE_PARENT(n) != tn);
1017 		} else
1018 			break;
1019 	}
1020 
1021 	/*
1022 	 * n  ----> NULL, LEAF or TNODE
1023 	 *
1024 	 * tp is n's (parent) ----> NULL or TNODE
1025 	 */
1026 
1027 	BUG_ON(tp && IS_LEAF(tp));
1028 
1029 	/* Case 1: n is a leaf. Compare prefixes */
1030 
1031 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1032 		struct leaf *l = (struct leaf *) n;
1033 
1034 		li = leaf_info_new(plen);
1035 
1036 		if (!li) {
1037 			*err = -ENOMEM;
1038 			goto err;
1039 		}
1040 
1041 		fa_head = &li->falh;
1042 		insert_leaf_info(&l->list, li);
1043 		goto done;
1044 	}
1045 	t->size++;
1046 	l = leaf_new();
1047 
1048 	if (!l) {
1049 		*err = -ENOMEM;
1050 		goto err;
1051 	}
1052 
1053 	l->key = key;
1054 	li = leaf_info_new(plen);
1055 
1056 	if (!li) {
1057 		tnode_free((struct tnode *) l);
1058 		*err = -ENOMEM;
1059 		goto err;
1060 	}
1061 
1062 	fa_head = &li->falh;
1063 	insert_leaf_info(&l->list, li);
1064 
1065 	if (t->trie && n == NULL) {
1066 		/* Case 2: n is NULL, and we will just insert a new leaf */
1067 
1068 		NODE_SET_PARENT(l, tp);
1069 
1070 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1071 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1072 	} else {
1073 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1074 		/*
1075 		 *  Add a new tnode here
1076 		 *  the first tnode needs some special handling
1077 		 */
1078 
1079 		if (tp)
1080 			pos = tp->pos+tp->bits;
1081 		else
1082 			pos = 0;
1083 
1084 		if (n) {
1085 			newpos = tkey_mismatch(key, pos, n->key);
1086 			tn = tnode_new(n->key, newpos, 1);
1087 		} else {
1088 			newpos = 0;
1089 			tn = tnode_new(key, newpos, 1); /* First tnode */
1090 		}
1091 
1092 		if (!tn) {
1093 			free_leaf_info(li);
1094 			tnode_free((struct tnode *) l);
1095 			*err = -ENOMEM;
1096 			goto err;
1097 		}
1098 
1099 		NODE_SET_PARENT(tn, tp);
1100 
1101 		missbit = tkey_extract_bits(key, newpos, 1);
1102 		put_child(t, tn, missbit, (struct node *)l);
1103 		put_child(t, tn, 1-missbit, n);
1104 
1105 		if (tp) {
1106 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1107 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1108 		} else {
1109 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1110 			tp = tn;
1111 		}
1112 	}
1113 
1114 	if (tp && tp->pos + tp->bits > 32)
1115 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1116 		       tp, tp->pos, tp->bits, key, plen);
1117 
1118 	/* Rebalance the trie */
1119 
1120 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1121 done:
1122 	t->revision++;
1123 err:
1124 	return fa_head;
1125 }
1126 
1127 static int
1128 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1129 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1130 {
1131 	struct trie *t = (struct trie *) tb->tb_data;
1132 	struct fib_alias *fa, *new_fa;
1133 	struct list_head *fa_head = NULL;
1134 	struct fib_info *fi;
1135 	int plen = r->rtm_dst_len;
1136 	int type = r->rtm_type;
1137 	u8 tos = r->rtm_tos;
1138 	u32 key, mask;
1139 	int err;
1140 	struct leaf *l;
1141 
1142 	if (plen > 32)
1143 		return -EINVAL;
1144 
1145 	key = 0;
1146 	if (rta->rta_dst)
1147 		memcpy(&key, rta->rta_dst, 4);
1148 
1149 	key = ntohl(key);
1150 
1151 	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1152 
1153 	mask = ntohl(inet_make_mask(plen));
1154 
1155 	if (key & ~mask)
1156 		return -EINVAL;
1157 
1158 	key = key & mask;
1159 
1160 	fi = fib_create_info(r, rta, nlhdr, &err);
1161 
1162 	if (!fi)
1163 		goto err;
1164 
1165 	l = fib_find_node(t, key);
1166 	fa = NULL;
1167 
1168 	if (l) {
1169 		fa_head = get_fa_head(l, plen);
1170 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1171 	}
1172 
1173 	/* Now fa, if non-NULL, points to the first fib alias
1174 	 * with the same keys [prefix,tos,priority], if such key already
1175 	 * exists or to the node before which we will insert new one.
1176 	 *
1177 	 * If fa is NULL, we will need to allocate a new one and
1178 	 * insert it at the head of the alias list.
1179 	 *
1180 	 * If fa_head is NULL, no fib node matched the destination key
1181 	 * and we need to allocate a new one of those as well.
1182 	 */
1183 
1184 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1185 		struct fib_alias *fa_orig;
1186 
1187 		err = -EEXIST;
1188 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1189 			goto out;
1190 
1191 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1192 			struct fib_info *fi_drop;
1193 			u8 state;
1194 
1195 			err = -ENOBUFS;
1196 			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1197 			if (new_fa == NULL)
1198 				goto out;
1199 
1200 			fi_drop = fa->fa_info;
1201 			new_fa->fa_tos = fa->fa_tos;
1202 			new_fa->fa_info = fi;
1203 			new_fa->fa_type = type;
1204 			new_fa->fa_scope = r->rtm_scope;
1205 			state = fa->fa_state;
1206 			new_fa->fa_state &= ~FA_S_ACCESSED;
1207 
1208 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1209 			alias_free_mem_rcu(fa);
1210 
1211 			fib_release_info(fi_drop);
1212 			if (state & FA_S_ACCESSED)
1213 				rt_cache_flush(-1);
1214 
1215 			goto succeeded;
1216 		}
1217 		/* Error if we find a perfect match which
1218 		 * uses the same scope, type, and nexthop
1219 		 * information.
1220 		 */
1221 		fa_orig = fa;
1222 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1223 			if (fa->fa_tos != tos)
1224 				break;
1225 			if (fa->fa_info->fib_priority != fi->fib_priority)
1226 				break;
1227 			if (fa->fa_type == type &&
1228 			    fa->fa_scope == r->rtm_scope &&
1229 			    fa->fa_info == fi) {
1230 				goto out;
1231 			}
1232 		}
1233 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1234 			fa = fa_orig;
1235 	}
1236 	err = -ENOENT;
1237 	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1238 		goto out;
1239 
1240 	err = -ENOBUFS;
1241 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1242 	if (new_fa == NULL)
1243 		goto out;
1244 
1245 	new_fa->fa_info = fi;
1246 	new_fa->fa_tos = tos;
1247 	new_fa->fa_type = type;
1248 	new_fa->fa_scope = r->rtm_scope;
1249 	new_fa->fa_state = 0;
1250 	/*
1251 	 * Insert new entry to the list.
1252 	 */
1253 
1254 	if (!fa_head) {
1255 		err = 0;
1256 		fa_head = fib_insert_node(t, &err, key, plen);
1257 		if (err)
1258 			goto out_free_new_fa;
1259 	}
1260 
1261 	list_add_tail_rcu(&new_fa->fa_list,
1262 			  (fa ? &fa->fa_list : fa_head));
1263 
1264 	rt_cache_flush(-1);
1265 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1266 succeeded:
1267 	return 0;
1268 
1269 out_free_new_fa:
1270 	kmem_cache_free(fn_alias_kmem, new_fa);
1271 out:
1272 	fib_release_info(fi);
1273 err:
1274 	return err;
1275 }
1276 
1277 
1278 /* should be called with rcu_read_lock */
1279 static inline int check_leaf(struct trie *t, struct leaf *l,
1280 			     t_key key, int *plen, const struct flowi *flp,
1281 			     struct fib_result *res)
1282 {
1283 	int err, i;
1284 	t_key mask;
1285 	struct leaf_info *li;
1286 	struct hlist_head *hhead = &l->list;
1287 	struct hlist_node *node;
1288 
1289 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1290 		i = li->plen;
1291 		mask = ntohl(inet_make_mask(i));
1292 		if (l->key != (key & mask))
1293 			continue;
1294 
1295 		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1296 			*plen = i;
1297 #ifdef CONFIG_IP_FIB_TRIE_STATS
1298 			t->stats.semantic_match_passed++;
1299 #endif
1300 			return err;
1301 		}
1302 #ifdef CONFIG_IP_FIB_TRIE_STATS
1303 		t->stats.semantic_match_miss++;
1304 #endif
1305 	}
1306 	return 1;
1307 }
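
/*
 * Example of the mask test in check_leaf() (illustrative addresses):
 * a leaf with key 0xC0A80100 (192.168.1.0) holding a leaf_info with
 * plen = 24 matches a lookup key 0xC0A80117 (192.168.1.23), because
 *
 *	0xC0A80117 & ntohl(inet_make_mask(24)) == 0xC0A80100
 *
 * and only then is fib_semantic_match() asked to check the alias list
 * against the rest of the flow.
 */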
1308 
1309 static int
1310 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1311 {
1312 	struct trie *t = (struct trie *) tb->tb_data;
1313 	int plen, ret = 0;
1314 	struct node *n;
1315 	struct tnode *pn;
1316 	int pos, bits;
1317 	t_key key = ntohl(flp->fl4_dst);
1318 	int chopped_off;
1319 	t_key cindex = 0;
1320 	int current_prefix_length = KEYLENGTH;
1321 	struct tnode *cn;
1322 	t_key node_prefix, key_prefix, pref_mismatch;
1323 	int mp;
1324 
1325 	rcu_read_lock();
1326 
1327 	n = rcu_dereference(t->trie);
1328 	if (!n)
1329 		goto failed;
1330 
1331 #ifdef CONFIG_IP_FIB_TRIE_STATS
1332 	t->stats.gets++;
1333 #endif
1334 
1335 	/* Just a leaf? */
1336 	if (IS_LEAF(n)) {
1337 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1338 			goto found;
1339 		goto failed;
1340 	}
1341 	pn = (struct tnode *) n;
1342 	chopped_off = 0;
1343 
1344 	while (pn) {
1345 		pos = pn->pos;
1346 		bits = pn->bits;
1347 
1348 		if (!chopped_off)
1349 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1350 
1351 		n = tnode_get_child(pn, cindex);
1352 
1353 		if (n == NULL) {
1354 #ifdef CONFIG_IP_FIB_TRIE_STATS
1355 			t->stats.null_node_hit++;
1356 #endif
1357 			goto backtrace;
1358 		}
1359 
1360 		if (IS_LEAF(n)) {
1361 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1362 				goto found;
1363 			else
1364 				goto backtrace;
1365 		}
1366 
1367 #define HL_OPTIMIZE
1368 #ifdef HL_OPTIMIZE
1369 		cn = (struct tnode *)n;
1370 
1371 		/*
1372 		 * It's a tnode, and we can do some extra checks here if we
1373 		 * like, to avoid descending into a dead-end branch.
1374 		 * This tnode is in the parent's child array at index
1375 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1376 		 * chopped off, so in reality the index may be just a
1377 		 * subprefix, padded with zero at the end.
1378 		 * We can also take a look at any skipped bits in this
1379 		 * tnode - everything up to p_pos is supposed to be ok,
1380 		 * and the non-chopped bits of the index (see the previous
1381 		 * paragraph) are also guaranteed ok, but the rest is
1382 		 * considered unknown.
1383 		 *
1384 		 * The skipped bits are key[pos+bits..cn->pos].
1385 		 */
1386 
1387 		/* If current_prefix_length < pos+bits, we are already doing
1388 		 * actual prefix  matching, which means everything from
1389 		 * pos+(bits-chopped_off) onward must be zero along some
1390 		 * branch of this subtree - otherwise there is *no* valid
1391 		 * prefix present. Here we can only check the skipped
1392 		 * bits. Remember, since we have already indexed into the
1393 		 * parent's child array, we know that the bits we chopped off
1394 		 * *are* zero.
1395 		 */
1396 
1397 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1398 
1399 		if (current_prefix_length < pos+bits) {
1400 			if (tkey_extract_bits(cn->key, current_prefix_length,
1401 						cn->pos - current_prefix_length) != 0 ||
1402 			    !(cn->child[0]))
1403 				goto backtrace;
1404 		}
1405 
1406 		/*
1407 		 * If chopped_off=0, the index is fully validated and we
1408 		 * only need to look at the skipped bits for this, the new,
1409 		 * tnode. What we actually want to do is to find out if
1410 		 * these skipped bits match our key perfectly, or if we will
1411 		 * have to count on finding a matching prefix further down,
1412 		 * because if we do, we would like to have some way of
1413 		 * verifying the existence of such a prefix at this point.
1414 		 */
1415 
1416 		/* The only thing we can do at this point is to verify that
1417 		 * any such matching prefix can indeed be a prefix to our
1418 		 * key, and if the bits in the node we are inspecting that
1419 		 * do not match our key are not ZERO, this cannot be true.
1420 		 * Thus, find out where there is a mismatch (before cn->pos)
1421 		 * and verify that all the mismatching bits are zero in the
1422 		 * new tnode's key.
1423 		 */
1424 
1425 		/* Note: We aren't very concerned about the piece of the key
1426 		 * that precedes pn->pos+pn->bits, since those bits have already been
1427 		 * checked. The bits after cn->pos aren't checked since these are
1428 		 * by definition "unknown" at this point. Thus, what we want to
1429 		 * see is if we are about to enter the "prefix matching" state,
1430 		 * and in that case verify that the skipped bits that will prevail
1431 		 * throughout this subtree are zero, as they have to be if we are
1432 		 * to find a matching prefix.
1433 		 */
1434 
1435 		node_prefix = MASK_PFX(cn->key, cn->pos);
1436 		key_prefix = MASK_PFX(key, cn->pos);
1437 		pref_mismatch = key_prefix^node_prefix;
1438 		mp = 0;
1439 
1440 		/* In short: If skipped bits in this node do not match the search
1441 		 * key, enter the "prefix matching" state directly.
1442 		 */
1443 		if (pref_mismatch) {
1444 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1445 				mp++;
1446 				pref_mismatch = pref_mismatch <<1;
1447 			}
1448 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1449 
1450 			if (key_prefix != 0)
1451 				goto backtrace;
1452 
1453 			if (current_prefix_length >= cn->pos)
1454 				current_prefix_length = mp;
1455 		}
1456 #endif
1457 		pn = (struct tnode *)n; /* Descend */
1458 		chopped_off = 0;
1459 		continue;
1460 
1461 backtrace:
1462 		chopped_off++;
1463 
1464 		/* Zero bits do not change the child index (cindex), so skip them */
1465 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1466 			chopped_off++;
1467 
1468 		/* Decrease current_... with bits chopped off */
1469 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1470 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1471 
1472 		/*
1473 		 * Either we do the actual chop off here, or, if we have
1474 		 * chopped off all bits in this tnode, walk up to our parent.
1475 		 */
1476 
1477 		if (chopped_off <= pn->bits) {
1478 			cindex &= ~(1 << (chopped_off-1));
1479 		} else {
1480 			if (NODE_PARENT(pn) == NULL)
1481 				goto failed;
1482 
1483 			/* Get Child's index */
1484 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1485 			pn = NODE_PARENT(pn);
1486 			chopped_off = 0;
1487 
1488 #ifdef CONFIG_IP_FIB_TRIE_STATS
1489 			t->stats.backtrack++;
1490 #endif
1491 			goto backtrace;
1492 		}
1493 	}
1494 failed:
1495 	ret = 1;
1496 found:
1497 	rcu_read_unlock();
1498 	return ret;
1499 }
1500 
1501 /* only called from updater side */
1502 static int trie_leaf_remove(struct trie *t, t_key key)
1503 {
1504 	t_key cindex;
1505 	struct tnode *tp = NULL;
1506 	struct node *n = t->trie;
1507 	struct leaf *l;
1508 
1509 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1510 
1511 	/* Note that in the case of skipped bits, those bits are *not* checked!
1512 	 * When we finish this, we will have NULL or a T_LEAF, and the
1513 	 * T_LEAF may or may not match our key.
1514 	 */
1515 
1516 	while (n != NULL && IS_TNODE(n)) {
1517 		struct tnode *tn = (struct tnode *) n;
1518 		check_tnode(tn);
1519 		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1520 
1521 		BUG_ON(n && NODE_PARENT(n) != tn);
1522 	}
1523 	l = (struct leaf *) n;
1524 
1525 	if (!n || !tkey_equals(l->key, key))
1526 		return 0;
1527 
1528 	/*
1529 	 * Key found.
1530 	 * Remove the leaf and rebalance the tree
1531 	 */
1532 
1533 	t->revision++;
1534 	t->size--;
1535 
1536 	preempt_disable();
1537 	tp = NODE_PARENT(n);
1538 	tnode_free((struct tnode *) n);
1539 
1540 	if (tp) {
1541 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1542 		put_child(t, (struct tnode *)tp, cindex, NULL);
1543 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1544 	} else
1545 		rcu_assign_pointer(t->trie, NULL);
1546 	preempt_enable();
1547 
1548 	return 1;
1549 }
1550 
1551 static int
1552 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1553 		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1554 {
1555 	struct trie *t = (struct trie *) tb->tb_data;
1556 	u32 key, mask;
1557 	int plen = r->rtm_dst_len;
1558 	u8 tos = r->rtm_tos;
1559 	struct fib_alias *fa, *fa_to_delete;
1560 	struct list_head *fa_head;
1561 	struct leaf *l;
1562 	struct leaf_info *li;
1563 
1564 
1565 	if (plen > 32)
1566 		return -EINVAL;
1567 
1568 	key = 0;
1569 	if (rta->rta_dst)
1570 		memcpy(&key, rta->rta_dst, 4);
1571 
1572 	key = ntohl(key);
1573 	mask = ntohl(inet_make_mask(plen));
1574 
1575 	if (key & ~mask)
1576 		return -EINVAL;
1577 
1578 	key = key & mask;
1579 	l = fib_find_node(t, key);
1580 
1581 	if (!l)
1582 		return -ESRCH;
1583 
1584 	fa_head = get_fa_head(l, plen);
1585 	fa = fib_find_alias(fa_head, tos, 0);
1586 
1587 	if (!fa)
1588 		return -ESRCH;
1589 
1590 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1591 
1592 	fa_to_delete = NULL;
1593 	fa_head = fa->fa_list.prev;
1594 
1595 	list_for_each_entry(fa, fa_head, fa_list) {
1596 		struct fib_info *fi = fa->fa_info;
1597 
1598 		if (fa->fa_tos != tos)
1599 			break;
1600 
1601 		if ((!r->rtm_type ||
1602 		     fa->fa_type == r->rtm_type) &&
1603 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1604 		     fa->fa_scope == r->rtm_scope) &&
1605 		    (!r->rtm_protocol ||
1606 		     fi->fib_protocol == r->rtm_protocol) &&
1607 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1608 			fa_to_delete = fa;
1609 			break;
1610 		}
1611 	}
1612 
1613 	if (!fa_to_delete)
1614 		return -ESRCH;
1615 
1616 	fa = fa_to_delete;
1617 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1618 
1619 	l = fib_find_node(t, key);
1620 	li = find_leaf_info(l, plen);
1621 
1622 	list_del_rcu(&fa->fa_list);
1623 
1624 	if (list_empty(fa_head)) {
1625 		hlist_del_rcu(&li->hlist);
1626 		free_leaf_info(li);
1627 	}
1628 
1629 	if (hlist_empty(&l->list))
1630 		trie_leaf_remove(t, key);
1631 
1632 	if (fa->fa_state & FA_S_ACCESSED)
1633 		rt_cache_flush(-1);
1634 
1635 	fib_release_info(fa->fa_info);
1636 	alias_free_mem_rcu(fa);
1637 	return 0;
1638 }
1639 
1640 static int trie_flush_list(struct trie *t, struct list_head *head)
1641 {
1642 	struct fib_alias *fa, *fa_node;
1643 	int found = 0;
1644 
1645 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1646 		struct fib_info *fi = fa->fa_info;
1647 
1648 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1649 			list_del_rcu(&fa->fa_list);
1650 			fib_release_info(fa->fa_info);
1651 			alias_free_mem_rcu(fa);
1652 			found++;
1653 		}
1654 	}
1655 	return found;
1656 }
1657 
1658 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1659 {
1660 	int found = 0;
1661 	struct hlist_head *lih = &l->list;
1662 	struct hlist_node *node, *tmp;
1663 	struct leaf_info *li = NULL;
1664 
1665 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1666 		found += trie_flush_list(t, &li->falh);
1667 
1668 		if (list_empty(&li->falh)) {
1669 			hlist_del_rcu(&li->hlist);
1670 			free_leaf_info(li);
1671 		}
1672 	}
1673 	return found;
1674 }
1675 
1676 /* rcu_read_lock needs to be held by the caller on the read side */
1677 
1678 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1679 {
1680 	struct node *c = (struct node *) thisleaf;
1681 	struct tnode *p;
1682 	int idx;
1683 	struct node *trie = rcu_dereference(t->trie);
1684 
1685 	if (c == NULL) {
1686 		if (trie == NULL)
1687 			return NULL;
1688 
1689 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1690 			return (struct leaf *) trie;
1691 
1692 		p = (struct tnode*) trie;  /* Start */
1693 	} else
1694 		p = (struct tnode *) NODE_PARENT(c);
1695 
1696 	while (p) {
1697 		int pos, last;
1698 
1699 		/*  Find the next child of the parent */
1700 		if (c)
1701 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1702 		else
1703 			pos = 0;
1704 
1705 		last = 1 << p->bits;
1706 		for (idx = pos; idx < last ; idx++) {
1707 			c = rcu_dereference(p->child[idx]);
1708 
1709 			if (!c)
1710 				continue;
1711 
1712 			/* Descend if tnode */
1713 			while (IS_TNODE(c)) {
1714 				p = (struct tnode *) c;
1715   				idx = 0;
1716 
1717 				/* Step down to the first non-NULL child */
1718 				if (p && IS_TNODE(p))
1719 					while (idx < (1<<p->bits) &&
1720 					       !(c = rcu_dereference(p->child[idx]))) idx++;
1721 
1722 				/* Done with this tnode? */
1723 				if (idx >= (1 << p->bits) || !c)
1724 					goto up;
1725 			}
1726 			return (struct leaf *) c;
1727 		}
1728 up:
1729 		/* No more children, go up one step */
1730 		c = (struct node *) p;
1731 		p = (struct tnode *) NODE_PARENT(p);
1732 	}
1733 	return NULL; /* Ready. Root of trie */
1734 }
1735 
1736 static int fn_trie_flush(struct fib_table *tb)
1737 {
1738 	struct trie *t = (struct trie *) tb->tb_data;
1739 	struct leaf *ll = NULL, *l = NULL;
1740 	int found = 0, h;
1741 
1742 	t->revision++;
1743 
1744 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1745 		found += trie_flush_leaf(t, l);
1746 
1747 		if (ll && hlist_empty(&ll->list))
1748 			trie_leaf_remove(t, ll->key);
1749 		ll = l;
1750 	}
1751 
1752 	if (ll && hlist_empty(&ll->list))
1753 		trie_leaf_remove(t, ll->key);
1754 
1755 	pr_debug("trie_flush found=%d\n", found);
1756 	return found;
1757 }
1758 
1759 static int trie_last_dflt = -1;
1760 
1761 static void
1762 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1763 {
1764 	struct trie *t = (struct trie *) tb->tb_data;
1765 	int order, last_idx;
1766 	struct fib_info *fi = NULL;
1767 	struct fib_info *last_resort;
1768 	struct fib_alias *fa = NULL;
1769 	struct list_head *fa_head;
1770 	struct leaf *l;
1771 
1772 	last_idx = -1;
1773 	last_resort = NULL;
1774 	order = -1;
1775 
1776 	rcu_read_lock();
1777 
1778 	l = fib_find_node(t, 0);
1779 	if (!l)
1780 		goto out;
1781 
1782 	fa_head = get_fa_head(l, 0);
1783 	if (!fa_head)
1784 		goto out;
1785 
1786 	if (list_empty(fa_head))
1787 		goto out;
1788 
1789 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1790 		struct fib_info *next_fi = fa->fa_info;
1791 
1792 		if (fa->fa_scope != res->scope ||
1793 		    fa->fa_type != RTN_UNICAST)
1794 			continue;
1795 
1796 		if (next_fi->fib_priority > res->fi->fib_priority)
1797 			break;
1798 		if (!next_fi->fib_nh[0].nh_gw ||
1799 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1800 			continue;
1801 		fa->fa_state |= FA_S_ACCESSED;
1802 
1803 		if (fi == NULL) {
1804 			if (next_fi != res->fi)
1805 				break;
1806 		} else if (!fib_detect_death(fi, order, &last_resort,
1807 					     &last_idx, &trie_last_dflt)) {
1808 			if (res->fi)
1809 				fib_info_put(res->fi);
1810 			res->fi = fi;
1811 			atomic_inc(&fi->fib_clntref);
1812 			trie_last_dflt = order;
1813 			goto out;
1814 		}
1815 		fi = next_fi;
1816 		order++;
1817 	}
1818 	if (order <= 0 || fi == NULL) {
1819 		trie_last_dflt = -1;
1820 		goto out;
1821 	}
1822 
1823 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1824 		if (res->fi)
1825 			fib_info_put(res->fi);
1826 		res->fi = fi;
1827 		atomic_inc(&fi->fib_clntref);
1828 		trie_last_dflt = order;
1829 		goto out;
1830 	}
1831 	if (last_idx >= 0) {
1832 		if (res->fi)
1833 			fib_info_put(res->fi);
1834 		res->fi = last_resort;
1835 		if (last_resort)
1836 			atomic_inc(&last_resort->fib_clntref);
1837 	}
1838 	trie_last_dflt = last_idx;
1839  out:;
1840 	rcu_read_unlock();
1841 }
1842 
1843 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1844 			   struct sk_buff *skb, struct netlink_callback *cb)
1845 {
1846 	int i, s_i;
1847 	struct fib_alias *fa;
1848 
1849 	u32 xkey = htonl(key);
1850 
1851 	s_i = cb->args[3];
1852 	i = 0;
1853 
1854 	/* rcu_read_lock is held by the caller */
1855 
1856 	list_for_each_entry_rcu(fa, fah, fa_list) {
1857 		if (i < s_i) {
1858 			i++;
1859 			continue;
1860 		}
1861 		BUG_ON(!fa->fa_info);
1862 
1863 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1864 				  cb->nlh->nlmsg_seq,
1865 				  RTM_NEWROUTE,
1866 				  tb->tb_id,
1867 				  fa->fa_type,
1868 				  fa->fa_scope,
1869 				  &xkey,
1870 				  plen,
1871 				  fa->fa_tos,
1872 				  fa->fa_info, 0) < 0) {
1873 			cb->args[3] = i;
1874 			return -1;
1875 		}
1876 		i++;
1877 	}
1878 	cb->args[3] = i;
1879 	return skb->len;
1880 }
1881 
1882 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1883 			     struct netlink_callback *cb)
1884 {
1885 	int h, s_h;
1886 	struct list_head *fa_head;
1887 	struct leaf *l = NULL;
1888 
1889 	s_h = cb->args[2];
1890 
1891 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1892 		if (h < s_h)
1893 			continue;
1894 		if (h > s_h)
1895 			memset(&cb->args[3], 0,
1896 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1897 
1898 		fa_head = get_fa_head(l, plen);
1899 
1900 		if (!fa_head)
1901 			continue;
1902 
1903 		if (list_empty(fa_head))
1904 			continue;
1905 
1906 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1907 			cb->args[2] = h;
1908 			return -1;
1909 		}
1910 	}
1911 	cb->args[2] = h;
1912 	return skb->len;
1913 }
1914 
1915 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1916 {
1917 	int m, s_m;
1918 	struct trie *t = (struct trie *) tb->tb_data;
1919 
1920 	s_m = cb->args[1];
1921 
1922 	rcu_read_lock();
1923 	for (m = 0; m <= 32; m++) {
1924 		if (m < s_m)
1925 			continue;
1926 		if (m > s_m)
1927 			memset(&cb->args[2], 0,
1928 				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1929 
1930 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1931 			cb->args[1] = m;
1932 			goto out;
1933 		}
1934 	}
1935 	rcu_read_unlock();
1936 	cb->args[1] = m;
1937 	return skb->len;
1938 out:
1939 	rcu_read_unlock();
1940 	return -1;
1941 }
1942 
1943 /* Fix more generic FIB names for init later */
1944 
1945 #ifdef CONFIG_IP_MULTIPLE_TABLES
1946 struct fib_table * fib_hash_init(int id)
1947 #else
1948 struct fib_table * __init fib_hash_init(int id)
1949 #endif
1950 {
1951 	struct fib_table *tb;
1952 	struct trie *t;
1953 
1954 	if (fn_alias_kmem == NULL)
1955 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1956 						  sizeof(struct fib_alias),
1957 						  0, SLAB_HWCACHE_ALIGN,
1958 						  NULL, NULL);
1959 
1960 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1961 		     GFP_KERNEL);
1962 	if (tb == NULL)
1963 		return NULL;
1964 
1965 	tb->tb_id = id;
1966 	tb->tb_lookup = fn_trie_lookup;
1967 	tb->tb_insert = fn_trie_insert;
1968 	tb->tb_delete = fn_trie_delete;
1969 	tb->tb_flush = fn_trie_flush;
1970 	tb->tb_select_default = fn_trie_select_default;
1971 	tb->tb_dump = fn_trie_dump;
1972 	memset(tb->tb_data, 0, sizeof(struct trie));
1973 
1974 	t = (struct trie *) tb->tb_data;
1975 
1976 	trie_init(t);
1977 
1978 	if (id == RT_TABLE_LOCAL)
1979 		trie_local = t;
1980 	else if (id == RT_TABLE_MAIN)
1981 		trie_main = t;
1982 
1983 	if (id == RT_TABLE_LOCAL)
1984 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1985 
1986 	return tb;
1987 }
1988 
1989 #ifdef CONFIG_PROC_FS
1990 /* Depth first Trie walk iterator */
1991 struct fib_trie_iter {
1992 	struct tnode *tnode;
1993 	struct trie *trie;
1994 	unsigned index;
1995 	unsigned depth;
1996 };
1997 
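/*
 * Advance the depth-first walk: return the next child of the current
 * tnode, descending into it when it is itself a tnode; when a node is
 * exhausted, pop back to the parent (recovering our slot from the key
 * bits) and continue there.  Returns NULL once the root is exhausted.
 */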
1998 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1999 {
2000 	struct tnode *tn = iter->tnode;
2001 	unsigned cindex = iter->index;
2002 	struct tnode *p;
2003 
2004 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2005 		 iter->tnode, iter->index, iter->depth);
2006 rescan:
2007 	while (cindex < (1<<tn->bits)) {
2008 		struct node *n = tnode_get_child(tn, cindex);
2009 
2010 		if (n) {
2011 			if (IS_LEAF(n)) {
2012 				iter->tnode = tn;
2013 				iter->index = cindex + 1;
2014 			} else {
2015 				/* push down one level */
2016 				iter->tnode = (struct tnode *) n;
2017 				iter->index = 0;
2018 				++iter->depth;
2019 			}
2020 			return n;
2021 		}
2022 
2023 		++cindex;
2024 	}
2025 
2026 	/* Current node exhausted, pop back up */
2027 	p = NODE_PARENT(tn);
2028 	if (p) {
2029 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2030 		tn = p;
2031 		--iter->depth;
2032 		goto rescan;
2033 	}
2034 
2035 	/* walked back past the root: traversal complete */
2036 	return NULL;
2037 }
2038 
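/*
 * Start the walk at the root of 't'.  Returns NULL for an empty trie or
 * when the root is a single leaf (such a lone leaf is not visited).
 */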
2039 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2040 				       struct trie *t)
2041 {
2042 	struct node *n;
2043 
2044 	if (!t)
2045 		return NULL;
2046 
2047 	n = rcu_dereference(t->trie);
2048 
2049 	if (!iter)
2050 		return NULL;
2051 
2052 	if (n && IS_TNODE(n)) {
2053 		iter->tnode = (struct tnode *) n;
2054 		iter->trie = t;
2055 		iter->index = 0;
2056 		iter->depth = 1;
2057 		return n;
2058 	}
2059 	return NULL;
2060 }
2061 
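/*
 * Walk the whole trie under rcu_read_lock and collect leaf depth, tnode
 * size and NULL-child statistics into *s.
 */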
2062 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2063 {
2064 	struct node *n;
2065 	struct fib_trie_iter iter;
2066 
2067 	memset(s, 0, sizeof(*s));
2068 
2069 	rcu_read_lock();
2070 	for (n = fib_trie_get_first(&iter, t); n;
2071 	     n = fib_trie_get_next(&iter)) {
2072 		if (IS_LEAF(n)) {
2073 			s->leaves++;
2074 			s->totdepth += iter.depth;
2075 			if (iter.depth > s->maxdepth)
2076 				s->maxdepth = iter.depth;
2077 		} else {
2078 			const struct tnode *tn = (const struct tnode *) n;
2079 			int i;
2080 
2081 			s->tnodes++;
2082 			if (tn->bits < MAX_STAT_DEPTH)
2083 				s->nodesizes[tn->bits]++;
2084 
2085 			for (i = 0; i < (1<<tn->bits); i++)
2086 				if (!tn->child[i])
2087 					s->nullpointers++;
2088 		}
2089 	}
2090 	rcu_read_unlock();
2091 }
2092 
2093 /*
2094  *	This outputs /proc/net/fib_triestats
2095  */
2096 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2097 {
2098 	unsigned i, max, pointers, bytes, avdepth;
2099 
2100 	if (stat->leaves)
2101 		avdepth = stat->totdepth*100 / stat->leaves;
2102 	else
2103 		avdepth = 0;
2104 
2105 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100);
2106 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2107 
2108 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2109 
2110 	bytes = sizeof(struct leaf) * stat->leaves;
2111 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2112 	bytes += sizeof(struct tnode) * stat->tnodes;
2113 
2114 	max = MAX_STAT_DEPTH;
2115 	while (max > 0 && stat->nodesizes[max-1] == 0)
2116 		max--;
2117 
2118 	pointers = 0;
2119 	for (i = 1; i < max; i++)
2120 		if (stat->nodesizes[i] != 0) {
2121 			seq_printf(seq, "  %d: %d", i, stat->nodesizes[i]);
2122 			pointers += (1<<i) * stat->nodesizes[i];
2123 		}
2124 	seq_putc(seq, '\n');
2125 	seq_printf(seq, "\tPointers: %d\n", pointers);
2126 
2127 	bytes += sizeof(struct node *) * pointers;
2128 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2129 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2130 
2131 #ifdef CONFIG_IP_FIB_TRIE_STATS
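	/*
	 * FIXME: 't' is not passed to this function, so this block would not
	 * compile with CONFIG_IP_FIB_TRIE_STATS enabled (it is #undef'd at
	 * the top of this file); the struct trie pointer needs to be handed
	 * in before these counters can be reported.
	 */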
2132 	seq_printf(seq, "Counters:\n---------\n");
2133 	seq_printf(seq, "gets = %d\n", t->stats.gets);
2134 	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2135 	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2136 	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2137 	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
2138 	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2139 #ifdef CLEAR_STATS
2140 	memset(&(t->stats), 0, sizeof(t->stats));
2141 #endif
2142 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2143 }
2144 
2145 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2146 {
2147 	struct trie_stat *stat;
2148 
2149 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2150 	if (!stat)
2151 		return -ENOMEM;
2152 
2153 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2154 		   sizeof(struct leaf), sizeof(struct tnode));
2155 
2156 	if (trie_local) {
2157 		seq_printf(seq, "Local:\n");
2158 		trie_collect_stats(trie_local, stat);
2159 		trie_show_stats(seq, stat);
2160 	}
2161 
2162 	if (trie_main) {
2163 		seq_printf(seq, "Main:\n");
2164 		trie_collect_stats(trie_main, stat);
2165 		trie_show_stats(seq, stat);
2166 	}
2167 	kfree(stat);
2168 
2169 	return 0;
2170 }
2171 
2172 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2173 {
2174 	return single_open(file, fib_triestat_seq_show, NULL);
2175 }
2176 
2177 static struct file_operations fib_triestat_fops = {
2178 	.owner	= THIS_MODULE,
2179 	.open	= fib_triestat_seq_open,
2180 	.read	= seq_read,
2181 	.llseek	= seq_lseek,
2182 	.release = single_release,
2183 };
2184 
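/*
 * Return the node at position 'pos' of a depth-first walk over the
 * local trie followed by the main trie, or NULL if the walk ends first.
 */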
2185 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2186 				      loff_t pos)
2187 {
2188 	loff_t idx = 0;
2189 	struct node *n;
2190 
2191 	for (n = fib_trie_get_first(iter, trie_local);
2192 	     n; ++idx, n = fib_trie_get_next(iter)) {
2193 		if (pos == idx)
2194 			return n;
2195 	}
2196 
2197 	for (n = fib_trie_get_first(iter, trie_main);
2198 	     n; ++idx, n = fib_trie_get_next(iter)) {
2199 		if (pos == idx)
2200 			return n;
2201 	}
2202 	return NULL;
2203 }
2204 
2205 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2206 {
2207 	rcu_read_lock();
2208 	if (*pos == 0)
2209 		return SEQ_START_TOKEN;
2210 	return fib_trie_get_idx(seq->private, *pos - 1);
2211 }
2212 
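/*
 * Step to the next node of the walk; once the local trie is exhausted,
 * continue with the first node of the main trie.
 */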
2213 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2214 {
2215 	struct fib_trie_iter *iter = seq->private;
2216 	void *l = v;
2217 
2218 	++*pos;
2219 	if (v == SEQ_START_TOKEN)
2220 		return fib_trie_get_idx(iter, 0);
2221 
2222 	v = fib_trie_get_next(iter);
2223 	BUG_ON(v == l);
2224 	if (v)
2225 		return v;
2226 
2227 	/* continue scan in next trie */
2228 	if (iter->trie == trie_local)
2229 		return fib_trie_get_first(iter, trie_main);
2230 
2231 	return NULL;
2232 }
2233 
2234 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2235 {
2236 	rcu_read_unlock();
2237 }
2238 
2239 static void seq_indent(struct seq_file *seq, int n)
2240 {
2241 	while (n-- > 0) seq_puts(seq, "   ");
2242 }
2243 
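/*
 * rtn_scope() and rtn_type() fall back to a function-local static
 * buffer for unknown values, so the returned string is only valid until
 * the next call.
 */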
2244 static inline const char *rtn_scope(enum rt_scope_t s)
2245 {
2246 	static char buf[32];
2247 
2248 	switch (s) {
2249 	case RT_SCOPE_UNIVERSE: return "universe";
2250 	case RT_SCOPE_SITE:	return "site";
2251 	case RT_SCOPE_LINK:	return "link";
2252 	case RT_SCOPE_HOST:	return "host";
2253 	case RT_SCOPE_NOWHERE:	return "nowhere";
2254 	default:
2255 		snprintf(buf, sizeof(buf), "scope=%d", s);
2256 		return buf;
2257 	}
2258 }
2259 
2260 static const char *rtn_type_names[__RTN_MAX] = {
2261 	[RTN_UNSPEC] = "UNSPEC",
2262 	[RTN_UNICAST] = "UNICAST",
2263 	[RTN_LOCAL] = "LOCAL",
2264 	[RTN_BROADCAST] = "BROADCAST",
2265 	[RTN_ANYCAST] = "ANYCAST",
2266 	[RTN_MULTICAST] = "MULTICAST",
2267 	[RTN_BLACKHOLE] = "BLACKHOLE",
2268 	[RTN_UNREACHABLE] = "UNREACHABLE",
2269 	[RTN_PROHIBIT] = "PROHIBIT",
2270 	[RTN_THROW] = "THROW",
2271 	[RTN_NAT] = "NAT",
2272 	[RTN_XRESOLVE] = "XRESOLVE",
2273 };
2274 
2275 static inline const char *rtn_type(unsigned t)
2276 {
2277 	static char buf[32];
2278 
2279 	if (t < __RTN_MAX && rtn_type_names[t])
2280 		return rtn_type_names[t];
2281 	snprintf(buf, sizeof(buf), "type %d", t);
2282 	return buf;
2283 }
2284 
2285 /* Pretty print the trie */
2286 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2287 {
2288 	const struct fib_trie_iter *iter = seq->private;
2289 	struct node *n = v;
2290 
2291 	if (v == SEQ_START_TOKEN)
2292 		return 0;
2293 
2294 	if (IS_TNODE(n)) {
2295 		struct tnode *tn = (struct tnode *) n;
2296 		t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2297 
2298 		if (!NODE_PARENT(n)) {
2299 			if (iter->trie == trie_local)
2300 				seq_puts(seq, "<local>:\n");
2301 			else
2302 				seq_puts(seq, "<main>:\n");
2303 		}
2304 		seq_indent(seq, iter->depth-1);
2305 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2306 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2307 			   tn->empty_children);
2308 
2309 	} else {
2310 		struct leaf *l = (struct leaf *) n;
2311 		int i;
2312 		u32 val = ntohl(l->key);
2313 
2314 		seq_indent(seq, iter->depth);
2315 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2316 		for (i = 32; i >= 0; i--) {
2317 			struct leaf_info *li = find_leaf_info(l, i);
2318 			if (li) {
2319 				struct fib_alias *fa;
2320 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2321 					seq_indent(seq, iter->depth+1);
2322 					seq_printf(seq, "  /%d %s %s", i,
2323 						   rtn_scope(fa->fa_scope),
2324 						   rtn_type(fa->fa_type));
2325 					if (fa->fa_tos)
2326 						seq_printf(seq, " tos=%d",
2327 							   fa->fa_tos);
2328 					seq_putc(seq, '\n');
2329 				}
2330 			}
2331 		}
2332 	}
2333 
2334 	return 0;
2335 }
2336 
2337 static struct seq_operations fib_trie_seq_ops = {
2338 	.start  = fib_trie_seq_start,
2339 	.next   = fib_trie_seq_next,
2340 	.stop   = fib_trie_seq_stop,
2341 	.show   = fib_trie_seq_show,
2342 };
2343 
2344 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2345 {
2346 	struct seq_file *seq;
2347 	int rc = -ENOMEM;
2348 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2349 
2350 	if (!s)
2351 		goto out;
2352 
2353 	rc = seq_open(file, &fib_trie_seq_ops);
2354 	if (rc)
2355 		goto out_kfree;
2356 
2357 	seq	     = file->private_data;
2358 	seq->private = s;
2359 	memset(s, 0, sizeof(*s));
2360 out:
2361 	return rc;
2362 out_kfree:
2363 	kfree(s);
2364 	goto out;
2365 }
2366 
2367 static struct file_operations fib_trie_fops = {
2368 	.owner  = THIS_MODULE,
2369 	.open   = fib_trie_seq_open,
2370 	.read   = seq_read,
2371 	.llseek = seq_lseek,
2372 	.release = seq_release_private,
2373 };
2374 
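/*
 * Translate a route type, netmask and nexthop into the RTF_* flag bits
 * expected by the legacy /proc/net/route format.
 */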
2375 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2376 {
2377 	static unsigned type2flags[RTN_MAX + 1] = {
2378 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2379 		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2380 	unsigned flags = type2flags[type];
2381 
2382 	if (fi && fi->fib_nh->nh_gw)
2383 		flags |= RTF_GATEWAY;
2384 	if (mask == 0xFFFFFFFF)
2385 		flags |= RTF_HOST;
2386 	flags |= RTF_UP;
2387 	return flags;
2388 }
2389 
2390 /*
2391  *	This outputs /proc/net/route.
2392  *	The format of the file is not supposed to be changed
2393  *	and needs to be the same as the fib_hash output to avoid
2394  *	breaking legacy utilities.
2395  */
2396 static int fib_route_seq_show(struct seq_file *seq, void *v)
2397 {
2398 	const struct fib_trie_iter *iter = seq->private;
2399 	struct leaf *l = v;
2400 	int i;
2401 	char bf[128];
2402 
2403 	if (v == SEQ_START_TOKEN) {
2404 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2405 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2406 			   "\tWindow\tIRTT");
2407 		return 0;
2408 	}
2409 
2410 	if (iter->trie == trie_local)
2411 		return 0;
2412 	if (IS_TNODE(l))
2413 		return 0;
2414 
2415 	for (i = 32; i >= 0; i--) {
2416 		struct leaf_info *li = find_leaf_info(l, i);
2417 		struct fib_alias *fa;
2418 		u32 mask, prefix;
2419 
2420 		if (!li)
2421 			continue;
2422 
2423 		mask = inet_make_mask(li->plen);
2424 		prefix = htonl(l->key);
2425 
2426 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2427 			const struct fib_info *fi = fa->fa_info;
2428 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2429 
2430 			if (fa->fa_type == RTN_BROADCAST
2431 			    || fa->fa_type == RTN_MULTICAST)
2432 				continue;
2433 
2434 			if (fi)
2435 				snprintf(bf, sizeof(bf),
2436 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2437 					 fi->fib_dev ? fi->fib_dev->name : "*",
2438 					 prefix,
2439 					 fi->fib_nh->nh_gw, flags, 0, 0,
2440 					 fi->fib_priority,
2441 					 mask,
2442 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2443 					 fi->fib_window,
2444 					 fi->fib_rtt >> 3);
2445 			else
2446 				snprintf(bf, sizeof(bf),
2447 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2448 					 prefix, 0, flags, 0, 0, 0,
2449 					 mask, 0, 0, 0);
2450 
2451 			seq_printf(seq, "%-127s\n", bf);
2452 		}
2453 	}
2454 
2455 	return 0;
2456 }
2457 
2458 static struct seq_operations fib_route_seq_ops = {
2459 	.start  = fib_trie_seq_start,
2460 	.next   = fib_trie_seq_next,
2461 	.stop   = fib_trie_seq_stop,
2462 	.show   = fib_route_seq_show,
2463 };
2464 
2465 static int fib_route_seq_open(struct inode *inode, struct file *file)
2466 {
2467 	struct seq_file *seq;
2468 	int rc = -ENOMEM;
2469 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2470 
2471 	if (!s)
2472 		goto out;
2473 
2474 	rc = seq_open(file, &fib_route_seq_ops);
2475 	if (rc)
2476 		goto out_kfree;
2477 
2478 	seq	     = file->private_data;
2479 	seq->private = s;
2480 	memset(s, 0, sizeof(*s));
2481 out:
2482 	return rc;
2483 out_kfree:
2484 	kfree(s);
2485 	goto out;
2486 }
2487 
2488 static struct file_operations fib_route_fops = {
2489 	.owner  = THIS_MODULE,
2490 	.open   = fib_route_seq_open,
2491 	.read   = seq_read,
2492 	.llseek = seq_lseek,
2493 	.release = seq_release_private,
2494 };
2495 
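/*
 * Register the /proc/net entries: fib_trie (readable trie dump),
 * fib_triestat (statistics) and route (legacy fib_hash-compatible
 * routing table).
 */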
2496 int __init fib_proc_init(void)
2497 {
2498 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2499 		goto out1;
2500 
2501 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2502 		goto out2;
2503 
2504 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2505 		goto out3;
2506 
2507 	return 0;
2508 
2509 out3:
2510 	proc_net_remove("fib_triestat");
2511 out2:
2512 	proc_net_remove("fib_trie");
2513 out1:
2514 	return -ENOMEM;
2515 }
2516 
2517 void __init fib_proc_exit(void)
2518 {
2519 	proc_net_remove("fib_trie");
2520 	proc_net_remove("fib_triestat");
2521 	proc_net_remove("route");
2522 }
2523 
2524 #endif /* CONFIG_PROC_FS */
2525