xref: /linux/net/ipv4/fib_trie.c (revision b454cc6636d254fbf6049b73e9560aee76fb04a3)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie, which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work come from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.407"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/sched.h>
61 #include <linux/mm.h>
62 #include <linux/string.h>
63 #include <linux/socket.h>
64 #include <linux/sockios.h>
65 #include <linux/errno.h>
66 #include <linux/in.h>
67 #include <linux/inet.h>
68 #include <linux/inetdevice.h>
69 #include <linux/netdevice.h>
70 #include <linux/if_arp.h>
71 #include <linux/proc_fs.h>
72 #include <linux/rcupdate.h>
73 #include <linux/skbuff.h>
74 #include <linux/netlink.h>
75 #include <linux/init.h>
76 #include <linux/list.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #undef CONFIG_IP_FIB_TRIE_STATS
86 #define MAX_STAT_DEPTH 32
87 
88 #define KEYLENGTH (8*sizeof(t_key))
89 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
90 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
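
/*
 * Worked example (values chosen for illustration): with KEYLENGTH == 32,
 * MASK_PFX(0xC0A80A01, 24) keeps the top 24 bits of the key and yields
 * 0xC0A80A00, while TKEY_GET_MASK(12, 1) builds a one-bit mask at bit
 * position 12 counted from the MSB, i.e. 0x00080000.
 */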
91 
92 typedef unsigned int t_key;
93 
94 #define T_TNODE 0
95 #define T_LEAF  1
96 #define NODE_TYPE_MASK	0x1UL
97 #define NODE_PARENT(node) \
98 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
99 
100 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
101 
102 #define NODE_SET_PARENT(node, ptr)		\
103 	rcu_assign_pointer((node)->parent,	\
104 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
105 
106 #define IS_TNODE(n) (!(n->parent & T_LEAF))
107 #define IS_LEAF(n) (n->parent & T_LEAF)
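
/*
 * The low bit of the 'parent' word doubles as the node type: allocations
 * are at least word aligned, so bit 0 is free. For example, a leaf whose
 * parent is the tnode at address tp stores
 * parent == (unsigned long) tp | T_LEAF, and NODE_PARENT() masks the type
 * bit off again to recover the pointer.
 */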
108 
109 struct node {
110 	t_key key;
111 	unsigned long parent;
112 };
113 
114 struct leaf {
115 	t_key key;
116 	unsigned long parent;
117 	struct hlist_head list;
118 	struct rcu_head rcu;
119 };
120 
121 struct leaf_info {
122 	struct hlist_node hlist;
123 	struct rcu_head rcu;
124 	int plen;
125 	struct list_head falh;
126 };
127 
128 struct tnode {
129 	t_key key;
130 	unsigned long parent;
131 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
132 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
133 	unsigned short full_children;	/* KEYLENGTH bits needed */
134 	unsigned short empty_children;	/* KEYLENGTH bits needed */
135 	struct rcu_head rcu;
136 	struct node *child[0];
137 };
138 
139 #ifdef CONFIG_IP_FIB_TRIE_STATS
140 struct trie_use_stats {
141 	unsigned int gets;
142 	unsigned int backtrack;
143 	unsigned int semantic_match_passed;
144 	unsigned int semantic_match_miss;
145 	unsigned int null_node_hit;
146 	unsigned int resize_node_skipped;
147 };
148 #endif
149 
150 struct trie_stat {
151 	unsigned int totdepth;
152 	unsigned int maxdepth;
153 	unsigned int tnodes;
154 	unsigned int leaves;
155 	unsigned int nullpointers;
156 	unsigned int nodesizes[MAX_STAT_DEPTH];
157 };
158 
159 struct trie {
160 	struct node *trie;
161 #ifdef CONFIG_IP_FIB_TRIE_STATS
162 	struct trie_use_stats stats;
163 #endif
164 	int size;
165 	unsigned int revision;
166 };
167 
168 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
169 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
170 static struct node *resize(struct trie *t, struct tnode *tn);
171 static struct tnode *inflate(struct trie *t, struct tnode *tn);
172 static struct tnode *halve(struct trie *t, struct tnode *tn);
173 static void tnode_free(struct tnode *tn);
174 
175 static struct kmem_cache *fn_alias_kmem __read_mostly;
176 static struct trie *trie_local = NULL, *trie_main = NULL;
177 
178 
179 /* rcu_read_lock needs to be held by the caller on the read side */
180 
181 static inline struct node *tnode_get_child(struct tnode *tn, int i)
182 {
183 	BUG_ON(i >= 1 << tn->bits);
184 
185 	return rcu_dereference(tn->child[i]);
186 }
187 
188 static inline int tnode_child_length(const struct tnode *tn)
189 {
190 	return 1 << tn->bits;
191 }
192 
193 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
194 {
195 	if (offset < KEYLENGTH)
196 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
197 	else
198 		return 0;
199 }
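
/*
 * Example (arbitrary key): tkey_extract_bits(0xC0A80A01, 8, 4) shifts the
 * key left by 8 and keeps the top 4 bits, returning 0xA - that is, bits
 * 8..11 of the key counted from the most significant bit.
 */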
200 
201 static inline int tkey_equals(t_key a, t_key b)
202 {
203 	return a == b;
204 }
205 
206 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
207 {
208 	if (bits == 0 || offset >= KEYLENGTH)
209 		return 1;
210 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
211 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
212 }
213 
214 static inline int tkey_mismatch(t_key a, int offset, t_key b)
215 {
216 	t_key diff = a ^ b;
217 	int i = offset;
218 
219 	if (!diff)
220 		return 0;
221 	while ((diff << i) >> (KEYLENGTH-1) == 0)
222 		i++;
223 	return i;
224 }
225 
226 /*
227   To understand this stuff, an understanding of keys and all their bits is
228   necessary. Every node in the trie has a key associated with it, but not
229   all of the bits in that key are significant.
230 
231   Consider a node 'n' and its parent 'tp'.
232 
233   If n is a leaf, every bit in its key is significant. Its presence is
234   necessitated by path compression, since during a tree traversal (when
235   searching for a leaf - unless we are doing an insertion) we will completely
236   ignore all skipped bits we encounter. Thus we need to verify, at the end of
237   a potentially successful search, that we have indeed been walking the
238   correct key path.
239 
240   Note that we can never "miss" the correct key in the tree if present by
241   following the wrong path. Path compression ensures that segments of the key
242   that are the same for all keys with a given prefix are skipped, but the
243   skipped part *is* identical for each node in the subtrie below the skipped
244   bit! trie_insert() in this implementation takes care of that - note the
245   call to tkey_sub_equals() in trie_insert().
246 
247   If n is an internal node - a 'tnode' here - the various parts of its key
248   have many different meanings.
249 
250   Example:
251   _________________________________________________________________
252   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
253   -----------------------------------------------------------------
254     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
255 
256   _________________________________________________________________
257   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
258   -----------------------------------------------------------------
259    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
260 
261   tp->pos = 7
262   tp->bits = 3
263   n->pos = 15
264   n->bits = 4
265 
266   First, let's just ignore the bits that come before the parent tp, that is
267   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
268   not use them for anything.
269 
270   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
271   index into the parent's child array. That is, they will be used to find
272   'n' among tp's children.
273 
274   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
275   for the node n.
276 
277   All the bits we have seen so far are significant to the node n. The rest
278   of the bits are really not needed or indeed known in n->key.
279 
280   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
281   n's child array, and will of course be different for each child.
282 
283 
284   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
285   at this point.
286 
287 */
288 
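/*
 * Putting the example above in terms of the helpers (illustrative only,
 * using tp->pos = 7, tp->bits = 3, n->pos = 15, n->bits = 4):
 *
 *   tkey_extract_bits(n->key, 7, 3)      - the "N" bits: n's index in
 *                                          tp's child array
 *   tkey_sub_equals(n->key, 10, 5, key)  - compares the skipped "S" bits
 *                                          10..14 against a search key
 *   tkey_extract_bits(key, 15, 4)        - the "C" bits: which of n's 16
 *                                          children to descend into
 */
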
289 static inline void check_tnode(const struct tnode *tn)
290 {
291 	WARN_ON(tn && tn->pos+tn->bits > 32);
292 }
293 
294 static int halve_threshold = 25;
295 static int inflate_threshold = 50;
296 static int halve_threshold_root = 15;
297 static int inflate_threshold_root = 25;
298 
299 
300 static void __alias_free_mem(struct rcu_head *head)
301 {
302 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
303 	kmem_cache_free(fn_alias_kmem, fa);
304 }
305 
306 static inline void alias_free_mem_rcu(struct fib_alias *fa)
307 {
308 	call_rcu(&fa->rcu, __alias_free_mem);
309 }
310 
311 static void __leaf_free_rcu(struct rcu_head *head)
312 {
313 	kfree(container_of(head, struct leaf, rcu));
314 }
315 
316 static void __leaf_info_free_rcu(struct rcu_head *head)
317 {
318 	kfree(container_of(head, struct leaf_info, rcu));
319 }
320 
321 static inline void free_leaf_info(struct leaf_info *leaf)
322 {
323 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
324 }
325 
326 static struct tnode *tnode_alloc(unsigned int size)
327 {
328 	struct page *pages;
329 
330 	if (size <= PAGE_SIZE)
331 		return kcalloc(size, 1, GFP_KERNEL);
332 
333 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
334 	if (!pages)
335 		return NULL;
336 
337 	return page_address(pages);
338 }
339 
340 static void __tnode_free_rcu(struct rcu_head *head)
341 {
342 	struct tnode *tn = container_of(head, struct tnode, rcu);
343 	unsigned int size = sizeof(struct tnode) +
344 		(1 << tn->bits) * sizeof(struct node *);
345 
346 	if (size <= PAGE_SIZE)
347 		kfree(tn);
348 	else
349 		free_pages((unsigned long)tn, get_order(size));
350 }
351 
352 static inline void tnode_free(struct tnode *tn)
353 {
354 	if (IS_LEAF(tn)) {
355 		struct leaf *l = (struct leaf *) tn;
356 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
357 	}
358 	else
359 		call_rcu(&tn->rcu, __tnode_free_rcu);
360 }
361 
362 static struct leaf *leaf_new(void)
363 {
364 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
365 	if (l) {
366 		l->parent = T_LEAF;
367 		INIT_HLIST_HEAD(&l->list);
368 	}
369 	return l;
370 }
371 
372 static struct leaf_info *leaf_info_new(int plen)
373 {
374 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
375 	if (li) {
376 		li->plen = plen;
377 		INIT_LIST_HEAD(&li->falh);
378 	}
379 	return li;
380 }
381 
382 static struct tnode* tnode_new(t_key key, int pos, int bits)
383 {
384 	int nchildren = 1<<bits;
385 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
386 	struct tnode *tn = tnode_alloc(sz);
387 
388 	if (tn) {
389 		memset(tn, 0, sz);
390 		tn->parent = T_TNODE;
391 		tn->pos = pos;
392 		tn->bits = bits;
393 		tn->key = key;
394 		tn->full_children = 0;
395 		tn->empty_children = 1<<bits;
396 	}
397 
398 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
399 		 (unsigned int) (sizeof(struct node) * 1<<bits));
400 	return tn;
401 }
402 
403 /*
404  * Check whether a tnode 'n' is "full", i.e. it is an internal node
405  * and no bits are skipped. See discussion in dyntree paper p. 6
406  */
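
/*
 * Example: with tn->pos == 8 and tn->bits == 4, a child tnode whose pos
 * is 12 starts exactly where tn's index bits end and is "full"; a child
 * whose pos is 14 skips bits 12..13 and is not.
 */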
407 
408 static inline int tnode_full(const struct tnode *tn, const struct node *n)
409 {
410 	if (n == NULL || IS_LEAF(n))
411 		return 0;
412 
413 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
414 }
415 
416 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
417 {
418 	tnode_put_child_reorg(tn, i, n, -1);
419 }
420 
421 /*
422  * Add a child at position i overwriting the old value.
423  * Update the value of full_children and empty_children.
424  */
425 
426 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
427 {
428 	struct node *chi = tn->child[i];
429 	int isfull;
430 
431 	BUG_ON(i >= 1<<tn->bits);
432 
433 
434 	/* update emptyChildren */
435 	if (n == NULL && chi != NULL)
436 		tn->empty_children++;
437 	else if (n != NULL && chi == NULL)
438 		tn->empty_children--;
439 
440 	/* update fullChildren */
441 	if (wasfull == -1)
442 		wasfull = tnode_full(tn, chi);
443 
444 	isfull = tnode_full(tn, n);
445 	if (wasfull && !isfull)
446 		tn->full_children--;
447 	else if (!wasfull && isfull)
448 		tn->full_children++;
449 
450 	if (n)
451 		NODE_SET_PARENT(n, tn);
452 
453 	rcu_assign_pointer(tn->child[i], n);
454 }
455 
456 static struct node *resize(struct trie *t, struct tnode *tn)
457 {
458 	int i;
459 	int err = 0;
460 	struct tnode *old_tn;
461 	int inflate_threshold_use;
462 	int halve_threshold_use;
463 
464 	if (!tn)
465 		return NULL;
466 
467 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
468 		 tn, inflate_threshold, halve_threshold);
469 
470 	/* No children */
471 	if (tn->empty_children == tnode_child_length(tn)) {
472 		tnode_free(tn);
473 		return NULL;
474 	}
475 	/* One child */
476 	if (tn->empty_children == tnode_child_length(tn) - 1)
477 		for (i = 0; i < tnode_child_length(tn); i++) {
478 			struct node *n;
479 
480 			n = tn->child[i];
481 			if (!n)
482 				continue;
483 
484 			/* compress one level */
485 			NODE_SET_PARENT(n, NULL);
486 			tnode_free(tn);
487 			return n;
488 		}
489 	/*
490 	 * Double as long as the number of non-empty children in the
491 	 * resulting node stays above the threshold.
492 	 */
493 
494 	/*
495 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
496 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
497 	 * Telecommunications, page 6:
498 	 * "A node is doubled if the ratio of non-empty children to all
499 	 * children in the *doubled* node is at least 'high'."
500 	 *
501 	 * 'high' in this instance is the variable 'inflate_threshold'. It
502 	 * is expressed as a percentage, so we multiply it with
503 	 * tnode_child_length() and instead of multiplying by 2 (since the
504 	 * child array will be doubled by inflate()) and multiplying
505 	 * the left-hand side by 100 (to handle the percentage thing) we
506 	 * multiply the left-hand side by 50.
507 	 *
508 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
509 	 * - tn->empty_children is of course the number of non-null children
510 	 * in the current node. tn->full_children is the number of "full"
511 	 * children, that is non-null tnodes with a skip value of 0.
512 	 * All of those will be doubled in the resulting inflated tnode, so
513 	 * we just count them one extra time here.
514 	 *
515 	 * A clearer way to write this would be:
516 	 *
517 	 * to_be_doubled = tn->full_children;
518 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
519 	 *     tn->full_children;
520 	 *
521 	 * new_child_length = tnode_child_length(tn) * 2;
522 	 *
523 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
524 	 *      new_child_length;
525 	 * if (new_fill_factor >= inflate_threshold)
526 	 *
527 	 * ...and so on, though it would mess up the while () loop.
528 	 *
529 	 * anyway,
530 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
531 	 *      inflate_threshold
532 	 *
533 	 * avoid a division:
534 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
535 	 *      inflate_threshold * new_child_length
536 	 *
537 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
538 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
539 	 *    tn->full_children) >= inflate_threshold * new_child_length
540 	 *
541 	 * expand new_child_length:
542 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
543 	 *    tn->full_children) >=
544 	 *      inflate_threshold * tnode_child_length(tn) * 2
545 	 *
546 	 * shorten again:
547 	 * 50 * (tn->full_children + tnode_child_length(tn) -
548 	 *    tn->empty_children) >= inflate_threshold *
549 	 *    tnode_child_length(tn)
550 	 *
551 	 */
552 
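	/*
	 * Worked example (numbers picked for illustration): a tnode with
	 * bits == 4 has 16 child slots. With empty_children == 2 and
	 * full_children == 6 the test below reads
	 *   50 * (6 + 16 - 2) = 1000 >= inflate_threshold_use * 16 = 800
	 * (with the default non-root threshold of 50), so the node is
	 * inflated; with empty_children == 5 and full_children == 3 the
	 * left-hand side is only 700 and the node is left alone.
	 */
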
553 	check_tnode(tn);
554 
555 	/* Keep root node larger */
556
557 	if (!tn->parent)
558 		inflate_threshold_use = inflate_threshold_root;
559 	else
560 		inflate_threshold_use = inflate_threshold;
561 
562 	err = 0;
563 	while ((tn->full_children > 0 &&
564 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
565 				inflate_threshold_use * tnode_child_length(tn))) {
566 
567 		old_tn = tn;
568 		tn = inflate(t, tn);
569 		if (IS_ERR(tn)) {
570 			tn = old_tn;
571 #ifdef CONFIG_IP_FIB_TRIE_STATS
572 			t->stats.resize_node_skipped++;
573 #endif
574 			break;
575 		}
576 	}
577 
578 	check_tnode(tn);
579 
580 	/*
581 	 * Halve as long as the number of empty children in this
582 	 * node is above threshold.
583 	 */
584 
585 
586 	/* Keep root node larger */
587
588 	if (!tn->parent)
589 		halve_threshold_use = halve_threshold_root;
590 	else
591 		halve_threshold_use = halve_threshold;
592 
593 	err = 0;
594 	while (tn->bits > 1 &&
595 	       100 * (tnode_child_length(tn) - tn->empty_children) <
596 	       halve_threshold_use * tnode_child_length(tn)) {
597 
598 		old_tn = tn;
599 		tn = halve(t, tn);
600 		if (IS_ERR(tn)) {
601 			tn = old_tn;
602 #ifdef CONFIG_IP_FIB_TRIE_STATS
603 			t->stats.resize_node_skipped++;
604 #endif
605 			break;
606 		}
607 	}
608 
609 
610 	/* Only one child remains */
611 	if (tn->empty_children == tnode_child_length(tn) - 1)
612 		for (i = 0; i < tnode_child_length(tn); i++) {
613 			struct node *n;
614 
615 			n = tn->child[i];
616 			if (!n)
617 				continue;
618 
619 			/* compress one level */
620 
621 			NODE_SET_PARENT(n, NULL);
622 			tnode_free(tn);
623 			return n;
624 		}
625 
626 	return (struct node *) tn;
627 }
628 
629 static struct tnode *inflate(struct trie *t, struct tnode *tn)
630 {
631 	struct tnode *inode;
632 	struct tnode *oldtnode = tn;
633 	int olen = tnode_child_length(tn);
634 	int i;
635 
636 	pr_debug("In inflate\n");
637 
638 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
639 
640 	if (!tn)
641 		return ERR_PTR(-ENOMEM);
642 
643 	/*
644 	 * Preallocate and store tnodes before the actual work so we
645 	 * don't get into an inconsistent state if memory allocation
646 	 * fails. In case of failure we return the old node and the
647 	 * inflate of the tnode is ignored.
648 	 */
649 
650 	for (i = 0; i < olen; i++) {
651 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
652 
653 		if (inode &&
654 		    IS_TNODE(inode) &&
655 		    inode->pos == oldtnode->pos + oldtnode->bits &&
656 		    inode->bits > 1) {
657 			struct tnode *left, *right;
658 			t_key m = TKEY_GET_MASK(inode->pos, 1);
659 
660 			left = tnode_new(inode->key&(~m), inode->pos + 1,
661 					 inode->bits - 1);
662 			if (!left)
663 				goto nomem;
664 
665 			right = tnode_new(inode->key|m, inode->pos + 1,
666 					  inode->bits - 1);
667 
668 			if (!right) {
669 				tnode_free(left);
670 				goto nomem;
671 			}
672 
673 			put_child(t, tn, 2*i, (struct node *) left);
674 			put_child(t, tn, 2*i+1, (struct node *) right);
675 		}
676 	}
677 
678 	for (i = 0; i < olen; i++) {
679 		struct node *node = tnode_get_child(oldtnode, i);
680 		struct tnode *left, *right;
681 		int size, j;
682 
683 		/* An empty child */
684 		if (node == NULL)
685 			continue;
686 
687 		/* A leaf or an internal node with skipped bits */
688 
689 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
690 		   tn->pos + tn->bits - 1) {
691 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
692 					     1) == 0)
693 				put_child(t, tn, 2*i, node);
694 			else
695 				put_child(t, tn, 2*i+1, node);
696 			continue;
697 		}
698 
699 		/* An internal node with two children */
700 		inode = (struct tnode *) node;
701 
702 		if (inode->bits == 1) {
703 			put_child(t, tn, 2*i, inode->child[0]);
704 			put_child(t, tn, 2*i+1, inode->child[1]);
705 
706 			tnode_free(inode);
707 			continue;
708 		}
709 
710 		/* An internal node with more than two children */
711 
712 		/* We will replace this node 'inode' with two new
713 		 * ones, 'left' and 'right', each with half of the
714 		 * original children. The two new nodes will have
715 		 * a position one bit further down the key and this
716 		 * means that the "significant" part of their keys
717 		 * (see the discussion near the top of this file)
718 		 * will differ by one bit, which will be "0" in
719 		 * left's key and "1" in right's key. Since we are
720 		 * moving the key position by one step, the bit that
721 		 * we are moving away from - the bit at position
722 		 * (inode->pos) - is the one that will differ between
723 		 * left and right. So... we synthesize that bit in the
724 		 * two  new keys.
725 		 * The mask 'm' used when left and right were allocated in
726 		 * the first pass above is a single "one" bit at position (inode->pos).
727 		 */
728 
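		/* Example (illustrative values): if inode->pos == 12 then the
		 * mask is TKEY_GET_MASK(12, 1) == 0x00080000, so left was
		 * given inode->key & ~m (bit 12 forced to 0) and right
		 * inode->key | m (bit 12 forced to 1), matching the lower
		 * and upper halves of inode's old child array.
		 */
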
729 		/* Use the old key, but set the new significant
730 		 *   bit to zero.
731 		 */
732 
733 		left = (struct tnode *) tnode_get_child(tn, 2*i);
734 		put_child(t, tn, 2*i, NULL);
735 
736 		BUG_ON(!left);
737 
738 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
739 		put_child(t, tn, 2*i+1, NULL);
740 
741 		BUG_ON(!right);
742 
743 		size = tnode_child_length(left);
744 		for (j = 0; j < size; j++) {
745 			put_child(t, left, j, inode->child[j]);
746 			put_child(t, right, j, inode->child[j + size]);
747 		}
748 		put_child(t, tn, 2*i, resize(t, left));
749 		put_child(t, tn, 2*i+1, resize(t, right));
750 
751 		tnode_free(inode);
752 	}
753 	tnode_free(oldtnode);
754 	return tn;
755 nomem:
756 	{
757 		int size = tnode_child_length(tn);
758 		int j;
759 
760 		for (j = 0; j < size; j++)
761 			if (tn->child[j])
762 				tnode_free((struct tnode *)tn->child[j]);
763 
764 		tnode_free(tn);
765 
766 		return ERR_PTR(-ENOMEM);
767 	}
768 }
769 
770 static struct tnode *halve(struct trie *t, struct tnode *tn)
771 {
772 	struct tnode *oldtnode = tn;
773 	struct node *left, *right;
774 	int i;
775 	int olen = tnode_child_length(tn);
776 
777 	pr_debug("In halve\n");
778 
779 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
780 
781 	if (!tn)
782 		return ERR_PTR(-ENOMEM);
783 
784 	/*
785 	 * Preallocate and store tnodes before the actual work so we
786 	 * don't get into an inconsistent state if memory allocation
787 	 * fails. In case of failure we return the old node and the
788 	 * halve of the tnode is ignored.
789 	 */
790 
791 	for (i = 0; i < olen; i += 2) {
792 		left = tnode_get_child(oldtnode, i);
793 		right = tnode_get_child(oldtnode, i+1);
794 
795 		/* Two nonempty children */
796 		if (left && right) {
797 			struct tnode *newn;
798 
799 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
800 
801 			if (!newn)
802 				goto nomem;
803 
804 			put_child(t, tn, i/2, (struct node *)newn);
805 		}
806 
807 	}
808 
809 	for (i = 0; i < olen; i += 2) {
810 		struct tnode *newBinNode;
811 
812 		left = tnode_get_child(oldtnode, i);
813 		right = tnode_get_child(oldtnode, i+1);
814 
815 		/* At least one of the children is empty */
816 		if (left == NULL) {
817 			if (right == NULL)    /* Both are empty */
818 				continue;
819 			put_child(t, tn, i/2, right);
820 			continue;
821 		}
822 
823 		if (right == NULL) {
824 			put_child(t, tn, i/2, left);
825 			continue;
826 		}
827 
828 		/* Two nonempty children */
829 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
830 		put_child(t, tn, i/2, NULL);
831 		put_child(t, newBinNode, 0, left);
832 		put_child(t, newBinNode, 1, right);
833 		put_child(t, tn, i/2, resize(t, newBinNode));
834 	}
835 	tnode_free(oldtnode);
836 	return tn;
837 nomem:
838 	{
839 		int size = tnode_child_length(tn);
840 		int j;
841 
842 		for (j = 0; j < size; j++)
843 			if (tn->child[j])
844 				tnode_free((struct tnode *)tn->child[j]);
845 
846 		tnode_free(tn);
847 
848 		return ERR_PTR(-ENOMEM);
849 	}
850 }
851 
852 static void trie_init(struct trie *t)
853 {
854 	if (!t)
855 		return;
856 
857 	t->size = 0;
858 	rcu_assign_pointer(t->trie, NULL);
859 	t->revision = 0;
860 #ifdef CONFIG_IP_FIB_TRIE_STATS
861 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
862 #endif
863 }
864 
865 /* readside must use rcu_read_lock; currently the readers are the
866  * dump routines, via get_fa_head and dump */
867 
868 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
869 {
870 	struct hlist_head *head = &l->list;
871 	struct hlist_node *node;
872 	struct leaf_info *li;
873 
874 	hlist_for_each_entry_rcu(li, node, head, hlist)
875 		if (li->plen == plen)
876 			return li;
877 
878 	return NULL;
879 }
880 
881 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
882 {
883 	struct leaf_info *li = find_leaf_info(l, plen);
884 
885 	if (!li)
886 		return NULL;
887 
888 	return &li->falh;
889 }
890 
891 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
892 {
893 	struct leaf_info *li = NULL, *last = NULL;
894 	struct hlist_node *node;
895 
896 	if (hlist_empty(head)) {
897 		hlist_add_head_rcu(&new->hlist, head);
898 	} else {
899 		hlist_for_each_entry(li, node, head, hlist) {
900 			if (new->plen > li->plen)
901 				break;
902 
903 			last = li;
904 		}
905 		if (last)
906 			hlist_add_after_rcu(&last->hlist, &new->hlist);
907 		else
908 			hlist_add_before_rcu(&new->hlist, &li->hlist);
909 	}
910 }
911 
912 /* rcu_read_lock needs to be held by the caller on the read side */
913 
914 static struct leaf *
915 fib_find_node(struct trie *t, u32 key)
916 {
917 	int pos;
918 	struct tnode *tn;
919 	struct node *n;
920 
921 	pos = 0;
922 	n = rcu_dereference(t->trie);
923 
924 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
925 		tn = (struct tnode *) n;
926 
927 		check_tnode(tn);
928 
929 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
930 			pos = tn->pos + tn->bits;
931 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
932 		} else
933 			break;
934 	}
935 	/* Case we have found a leaf. Compare prefixes */
936 
937 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
938 		return (struct leaf *)n;
939 
940 	return NULL;
941 }
942 
943 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
944 {
945 	int wasfull;
946 	t_key cindex, key;
947 	struct tnode *tp = NULL;
948 
949 	key = tn->key;
950 
951 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
952 
953 		tp = NODE_PARENT(tn);
954 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
955 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
956 		tn = (struct tnode *) resize(t, tn);
957 		tnode_put_child_reorg(tp, cindex, (struct node *)tn, wasfull);
958 
959 		if (!NODE_PARENT(tn))
960 			break;
961 
962 		tn = NODE_PARENT(tn);
963 	}
964 	/* Handle last (top) tnode */
965 	if (IS_TNODE(tn))
966 		tn = (struct tnode *) resize(t, tn);
967 
968 	return (struct node*) tn;
969 }
970 
971 /* only used from updater-side */
972 
973 static  struct list_head *
974 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
975 {
976 	int pos, newpos;
977 	struct tnode *tp = NULL, *tn = NULL;
978 	struct node *n;
979 	struct leaf *l;
980 	int missbit;
981 	struct list_head *fa_head = NULL;
982 	struct leaf_info *li;
983 	t_key cindex;
984 
985 	pos = 0;
986 	n = t->trie;
987 
988 	/* If we point to NULL, stop. Either the tree is empty and we should
989 	 * just put a new leaf in it, or we have reached an empty child slot,
990 	 * and we should just put our new leaf in that.
991 	 * If we point to a T_TNODE, check if it matches our key. Note that
992 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
993 	 * not be the parent's 'pos'+'bits'!
994 	 *
995 	 * If it does match the current key, get pos/bits from it, extract
996 	 * the index from our key, push the T_TNODE and walk the tree.
997 	 *
998 	 * If it doesn't, we have to replace it with a new T_TNODE.
999 	 *
1000 	 * If we point to a T_LEAF, it might or might not have the same key
1001 	 * as we do. If it does, just change the value, update the T_LEAF's
1002 	 * value, and return it.
1003 	 * If it doesn't, we need to replace it with a T_TNODE.
1004 	 */
1005 
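	/*
	 * Worked example (keys chosen for illustration): suppose the trie
	 * holds a single leaf with key 0x0a000001 and we insert 0x0a000000.
	 * The walk below ends at that leaf and we fall through to case 3:
	 * tkey_mismatch() reports bit 31 as the first differing bit, a
	 * 1-bit tnode with pos == 31 is allocated, and the two leaves
	 * become its children at index 0 (new key) and index 1 (old key).
	 */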
1006 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1007 		tn = (struct tnode *) n;
1008 
1009 		check_tnode(tn);
1010 
1011 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1012 			tp = tn;
1013 			pos = tn->pos + tn->bits;
1014 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1015 
1016 			BUG_ON(n && NODE_PARENT(n) != tn);
1017 		} else
1018 			break;
1019 	}
1020 
1021 	/*
1022 	 * n  ----> NULL, LEAF or TNODE
1023 	 *
1024 	 * tp is n's (parent) ----> NULL or TNODE
1025 	 */
1026 
1027 	BUG_ON(tp && IS_LEAF(tp));
1028 
1029 	/* Case 1: n is a leaf. Compare prefixes */
1030 
1031 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1032 		struct leaf *l = (struct leaf *) n;
1033 
1034 		li = leaf_info_new(plen);
1035 
1036 		if (!li) {
1037 			*err = -ENOMEM;
1038 			goto err;
1039 		}
1040 
1041 		fa_head = &li->falh;
1042 		insert_leaf_info(&l->list, li);
1043 		goto done;
1044 	}
1045 	t->size++;
1046 	l = leaf_new();
1047 
1048 	if (!l) {
1049 		*err = -ENOMEM;
1050 		goto err;
1051 	}
1052 
1053 	l->key = key;
1054 	li = leaf_info_new(plen);
1055 
1056 	if (!li) {
1057 		tnode_free((struct tnode *) l);
1058 		*err = -ENOMEM;
1059 		goto err;
1060 	}
1061 
1062 	fa_head = &li->falh;
1063 	insert_leaf_info(&l->list, li);
1064 
1065 	if (t->trie && n == NULL) {
1066 		/* Case 2: n is NULL, and will just insert a new leaf */
1067 
1068 		NODE_SET_PARENT(l, tp);
1069 
1070 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1071 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1072 	} else {
1073 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1074 		/*
1075 		 *  Add a new tnode here
1076 		 *  the first tnode needs some special handling
1077 		 */
1078 
1079 		if (tp)
1080 			pos = tp->pos+tp->bits;
1081 		else
1082 			pos = 0;
1083 
1084 		if (n) {
1085 			newpos = tkey_mismatch(key, pos, n->key);
1086 			tn = tnode_new(n->key, newpos, 1);
1087 		} else {
1088 			newpos = 0;
1089 			tn = tnode_new(key, newpos, 1); /* First tnode */
1090 		}
1091 
1092 		if (!tn) {
1093 			free_leaf_info(li);
1094 			tnode_free((struct tnode *) l);
1095 			*err = -ENOMEM;
1096 			goto err;
1097 		}
1098 
1099 		NODE_SET_PARENT(tn, tp);
1100 
1101 		missbit = tkey_extract_bits(key, newpos, 1);
1102 		put_child(t, tn, missbit, (struct node *)l);
1103 		put_child(t, tn, 1-missbit, n);
1104 
1105 		if (tp) {
1106 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1107 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1108 		} else {
1109 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1110 			tp = tn;
1111 		}
1112 	}
1113 
1114 	if (tp && tp->pos + tp->bits > 32)
1115 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1116 		       tp, tp->pos, tp->bits, key, plen);
1117 
1118 	/* Rebalance the trie */
1119 
1120 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1121 done:
1122 	t->revision++;
1123 err:
1124 	return fa_head;
1125 }
1126 
1127 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1128 {
1129 	struct trie *t = (struct trie *) tb->tb_data;
1130 	struct fib_alias *fa, *new_fa;
1131 	struct list_head *fa_head = NULL;
1132 	struct fib_info *fi;
1133 	int plen = cfg->fc_dst_len;
1134 	u8 tos = cfg->fc_tos;
1135 	u32 key, mask;
1136 	int err;
1137 	struct leaf *l;
1138 
1139 	if (plen > 32)
1140 		return -EINVAL;
1141 
1142 	key = ntohl(cfg->fc_dst);
1143 
1144 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1145 
1146 	mask = ntohl(inet_make_mask(plen));
1147 
1148 	if (key & ~mask)
1149 		return -EINVAL;
1150 
1151 	key = key & mask;
1152 
1153 	fi = fib_create_info(cfg);
1154 	if (IS_ERR(fi)) {
1155 		err = PTR_ERR(fi);
1156 		goto err;
1157 	}
1158 
1159 	l = fib_find_node(t, key);
1160 	fa = NULL;
1161 
1162 	if (l) {
1163 		fa_head = get_fa_head(l, plen);
1164 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1165 	}
1166 
1167 	/* Now fa, if non-NULL, points to the first fib alias
1168 	 * with the same keys [prefix,tos,priority], if such a key already
1169 	 * exists, or to the node before which we will insert the new one.
1170 	 *
1171 	 * If fa is NULL, we will need to allocate a new one and
1172 	 * insert it at the head of f.
1173 	 *
1174 	 * If f is NULL, no fib node matched the destination key
1175 	 * and we need to allocate a new one of those as well.
1176 	 */
1177 
1178 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1179 		struct fib_alias *fa_orig;
1180 
1181 		err = -EEXIST;
1182 		if (cfg->fc_nlflags & NLM_F_EXCL)
1183 			goto out;
1184 
1185 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1186 			struct fib_info *fi_drop;
1187 			u8 state;
1188 
1189 			err = -ENOBUFS;
1190 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1191 			if (new_fa == NULL)
1192 				goto out;
1193 
1194 			fi_drop = fa->fa_info;
1195 			new_fa->fa_tos = fa->fa_tos;
1196 			new_fa->fa_info = fi;
1197 			new_fa->fa_type = cfg->fc_type;
1198 			new_fa->fa_scope = cfg->fc_scope;
1199 			state = fa->fa_state;
1200 			new_fa->fa_state &= ~FA_S_ACCESSED;
1201 
1202 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1203 			alias_free_mem_rcu(fa);
1204 
1205 			fib_release_info(fi_drop);
1206 			if (state & FA_S_ACCESSED)
1207 				rt_cache_flush(-1);
1208 
1209 			goto succeeded;
1210 		}
1211 		/* Error if we find a perfect match which
1212 		 * uses the same scope, type, and nexthop
1213 		 * information.
1214 		 */
1215 		fa_orig = fa;
1216 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1217 			if (fa->fa_tos != tos)
1218 				break;
1219 			if (fa->fa_info->fib_priority != fi->fib_priority)
1220 				break;
1221 			if (fa->fa_type == cfg->fc_type &&
1222 			    fa->fa_scope == cfg->fc_scope &&
1223 			    fa->fa_info == fi) {
1224 				goto out;
1225 			}
1226 		}
1227 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1228 			fa = fa_orig;
1229 	}
1230 	err = -ENOENT;
1231 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1232 		goto out;
1233 
1234 	err = -ENOBUFS;
1235 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1236 	if (new_fa == NULL)
1237 		goto out;
1238 
1239 	new_fa->fa_info = fi;
1240 	new_fa->fa_tos = tos;
1241 	new_fa->fa_type = cfg->fc_type;
1242 	new_fa->fa_scope = cfg->fc_scope;
1243 	new_fa->fa_state = 0;
1244 	/*
1245 	 * Insert new entry to the list.
1246 	 */
1247 
1248 	if (!fa_head) {
1249 		err = 0;
1250 		fa_head = fib_insert_node(t, &err, key, plen);
1251 		if (err)
1252 			goto out_free_new_fa;
1253 	}
1254 
1255 	list_add_tail_rcu(&new_fa->fa_list,
1256 			  (fa ? &fa->fa_list : fa_head));
1257 
1258 	rt_cache_flush(-1);
1259 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1260 		  &cfg->fc_nlinfo);
1261 succeeded:
1262 	return 0;
1263 
1264 out_free_new_fa:
1265 	kmem_cache_free(fn_alias_kmem, new_fa);
1266 out:
1267 	fib_release_info(fi);
1268 err:
1269 	return err;
1270 }
1271 
1272 
1273 /* should be called with rcu_read_lock */
1274 static inline int check_leaf(struct trie *t, struct leaf *l,
1275 			     t_key key, int *plen, const struct flowi *flp,
1276 			     struct fib_result *res)
1277 {
1278 	int err, i;
1279 	__be32 mask;
1280 	struct leaf_info *li;
1281 	struct hlist_head *hhead = &l->list;
1282 	struct hlist_node *node;
1283 
1284 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1285 		i = li->plen;
1286 		mask = inet_make_mask(i);
1287 		if (l->key != (key & ntohl(mask)))
1288 			continue;
1289 
1290 		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1291 			*plen = i;
1292 #ifdef CONFIG_IP_FIB_TRIE_STATS
1293 			t->stats.semantic_match_passed++;
1294 #endif
1295 			return err;
1296 		}
1297 #ifdef CONFIG_IP_FIB_TRIE_STATS
1298 		t->stats.semantic_match_miss++;
1299 #endif
1300 	}
1301 	return 1;
1302 }
1303 
1304 static int
1305 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1306 {
1307 	struct trie *t = (struct trie *) tb->tb_data;
1308 	int plen, ret = 0;
1309 	struct node *n;
1310 	struct tnode *pn;
1311 	int pos, bits;
1312 	t_key key = ntohl(flp->fl4_dst);
1313 	int chopped_off;
1314 	t_key cindex = 0;
1315 	int current_prefix_length = KEYLENGTH;
1316 	struct tnode *cn;
1317 	t_key node_prefix, key_prefix, pref_mismatch;
1318 	int mp;
1319 
1320 	rcu_read_lock();
1321 
1322 	n = rcu_dereference(t->trie);
1323 	if (!n)
1324 		goto failed;
1325 
1326 #ifdef CONFIG_IP_FIB_TRIE_STATS
1327 	t->stats.gets++;
1328 #endif
1329 
1330 	/* Just a leaf? */
1331 	if (IS_LEAF(n)) {
1332 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1333 			goto found;
1334 		goto failed;
1335 	}
1336 	pn = (struct tnode *) n;
1337 	chopped_off = 0;
1338 
1339 	while (pn) {
1340 		pos = pn->pos;
1341 		bits = pn->bits;
1342 
1343 		if (!chopped_off)
1344 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1345 
1346 		n = tnode_get_child(pn, cindex);
1347 
1348 		if (n == NULL) {
1349 #ifdef CONFIG_IP_FIB_TRIE_STATS
1350 			t->stats.null_node_hit++;
1351 #endif
1352 			goto backtrace;
1353 		}
1354 
1355 		if (IS_LEAF(n)) {
1356 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1357 				goto found;
1358 			else
1359 				goto backtrace;
1360 		}
1361 
1362 #define HL_OPTIMIZE
1363 #ifdef HL_OPTIMIZE
1364 		cn = (struct tnode *)n;
1365 
1366 		/*
1367 		 * It's a tnode, and we can do some extra checks here if we
1368 		 * like, to avoid descending into a dead-end branch.
1369 		 * This tnode is in the parent's child array at index
1370 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1371 		 * chopped off, so in reality the index may be just a
1372 		 * subprefix, padded with zero at the end.
1373 		 * We can also take a look at any skipped bits in this
1374 		 * tnode - everything up to p_pos is supposed to be ok,
1375 		 * and the non-chopped bits of the index (see previous
1376 		 * paragraph) are also guaranteed ok, but the rest is
1377 		 * considered unknown.
1378 		 *
1379 		 * The skipped bits are key[pos+bits..cn->pos].
1380 		 */
1381 
1382 		/* If current_prefix_length < pos+bits, we are already doing
1383 		 * actual prefix matching, which means everything from
1384 		 * pos+(bits-chopped_off) onward must be zero along some
1385 		 * branch of this subtree - otherwise there is *no* valid
1386 		 * prefix present. Here we can only check the skipped
1387 		 * bits. Remember, since we have already indexed into the
1388 		 * parent's child array, we know that the bits we chopped off
1389 		 * *are* zero.
1390 		 */
1391 
1392 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1393 
1394 		if (current_prefix_length < pos+bits) {
1395 			if (tkey_extract_bits(cn->key, current_prefix_length,
1396 						cn->pos - current_prefix_length) != 0 ||
1397 			    !(cn->child[0]))
1398 				goto backtrace;
1399 		}
1400 
1401 		/*
1402 		 * If chopped_off=0, the index is fully validated and we
1403 		 * only need to look at the skipped bits for this, the new,
1404 		 * tnode. What we actually want to do is to find out if
1405 		 * these skipped bits match our key perfectly, or if we will
1406 		 * have to count on finding a matching prefix further down,
1407 		 * because if we do, we would like to have some way of
1408 		 * verifying the existence of such a prefix at this point.
1409 		 */
1410 
1411 		/* The only thing we can do at this point is to verify that
1412 		 * any such matching prefix can indeed be a prefix to our
1413 		 * key, and if the bits in the node we are inspecting that
1414 		 * do not match our key are not ZERO, this cannot be true.
1415 		 * Thus, find out where there is a mismatch (before cn->pos)
1416 		 * and verify that all the mismatching bits are zero in the
1417 		 * new tnode's key.
1418 		 */
1419 
1420 		/* Note: We aren't very concerned about the piece of the key
1421 		 * that precedes pn->pos+pn->bits, since these have already been
1422 		 * checked. The bits after cn->pos aren't checked since these are
1423 		 * by definition "unknown" at this point. Thus, what we want to
1424 		 * see is if we are about to enter the "prefix matching" state,
1425 		 * and in that case verify that the skipped bits that will prevail
1426 		 * throughout this subtree are zero, as they have to be if we are
1427 		 * to find a matching prefix.
1428 		 */
1429 
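		/* Worked example (illustrative values): cn->pos == 16,
		 * cn->key == 0xc0a80000 and key == 0xc0a90a01 give
		 * node_prefix == 0xc0a80000, key_prefix == 0xc0a90000 and
		 * pref_mismatch == 0x00010000, so mp becomes 15. Bit 15 of
		 * cn->key is zero, so we do not backtrack, but
		 * current_prefix_length (if it was still >= cn->pos) is
		 * capped at 15: only prefixes of length <= 15 can still
		 * match along this branch.
		 */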
1430 		node_prefix = MASK_PFX(cn->key, cn->pos);
1431 		key_prefix = MASK_PFX(key, cn->pos);
1432 		pref_mismatch = key_prefix^node_prefix;
1433 		mp = 0;
1434 
1435 		/* In short: If skipped bits in this node do not match the search
1436 		 * key, enter the "prefix matching" state directly.
1437 		 */
1438 		if (pref_mismatch) {
1439 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1440 				mp++;
1441 				pref_mismatch = pref_mismatch <<1;
1442 			}
1443 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1444 
1445 			if (key_prefix != 0)
1446 				goto backtrace;
1447 
1448 			if (current_prefix_length >= cn->pos)
1449 				current_prefix_length = mp;
1450 		}
1451 #endif
1452 		pn = (struct tnode *)n; /* Descend */
1453 		chopped_off = 0;
1454 		continue;
1455 
1456 backtrace:
1457 		chopped_off++;
1458 
1459 		/* Zero bits don't change the child key (cindex), so skip them */
1460 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1461 			chopped_off++;
1462 
1463 		/* Decrease current_... with bits chopped off */
1464 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1465 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1466 
1467 		/*
1468 		 * Either we do the actual chop-off accordingly, or, if we have
1469 		 * chopped off all bits in this tnode, we walk up to our parent.
1470 		 */
1471 
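		/* Example (illustrative values): with pn->bits == 4 and
		 * cindex == 0110b, the loop above skips the trailing zero
		 * bit and leaves chopped_off == 2, so cindex becomes 0100b
		 * and we retry this tnode looking for a shorter prefix;
		 * only once chopped_off exceeds pn->bits do we climb to the
		 * parent tnode.
		 */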
1472 		if (chopped_off <= pn->bits) {
1473 			cindex &= ~(1 << (chopped_off-1));
1474 		} else {
1475 			if (NODE_PARENT(pn) == NULL)
1476 				goto failed;
1477 
1478 			/* Get Child's index */
1479 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1480 			pn = NODE_PARENT(pn);
1481 			chopped_off = 0;
1482 
1483 #ifdef CONFIG_IP_FIB_TRIE_STATS
1484 			t->stats.backtrack++;
1485 #endif
1486 			goto backtrace;
1487 		}
1488 	}
1489 failed:
1490 	ret = 1;
1491 found:
1492 	rcu_read_unlock();
1493 	return ret;
1494 }
1495 
1496 /* only called from updater side */
1497 static int trie_leaf_remove(struct trie *t, t_key key)
1498 {
1499 	t_key cindex;
1500 	struct tnode *tp = NULL;
1501 	struct node *n = t->trie;
1502 	struct leaf *l;
1503 
1504 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1505 
1506 	/* Note that in the case of skipped bits, those bits are *not* checked!
1507 	 * When we finish this, we will have NULL or a T_LEAF, and the
1508 	 * T_LEAF may or may not match our key.
1509 	 */
1510 
1511 	while (n != NULL && IS_TNODE(n)) {
1512 		struct tnode *tn = (struct tnode *) n;
1513 		check_tnode(tn);
1514 		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1515 
1516 		BUG_ON(n && NODE_PARENT(n) != tn);
1517 	}
1518 	l = (struct leaf *) n;
1519 
1520 	if (!n || !tkey_equals(l->key, key))
1521 		return 0;
1522 
1523 	/*
1524 	 * Key found.
1525 	 * Remove the leaf and rebalance the tree
1526 	 */
1527 
1528 	t->revision++;
1529 	t->size--;
1530 
1531 	preempt_disable();
1532 	tp = NODE_PARENT(n);
1533 	tnode_free((struct tnode *) n);
1534 
1535 	if (tp) {
1536 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1537 		put_child(t, (struct tnode *)tp, cindex, NULL);
1538 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1539 	} else
1540 		rcu_assign_pointer(t->trie, NULL);
1541 	preempt_enable();
1542 
1543 	return 1;
1544 }
1545 
1546 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1547 {
1548 	struct trie *t = (struct trie *) tb->tb_data;
1549 	u32 key, mask;
1550 	int plen = cfg->fc_dst_len;
1551 	u8 tos = cfg->fc_tos;
1552 	struct fib_alias *fa, *fa_to_delete;
1553 	struct list_head *fa_head;
1554 	struct leaf *l;
1555 	struct leaf_info *li;
1556 
1557 	if (plen > 32)
1558 		return -EINVAL;
1559 
1560 	key = ntohl(cfg->fc_dst);
1561 	mask = ntohl(inet_make_mask(plen));
1562 
1563 	if (key & ~mask)
1564 		return -EINVAL;
1565 
1566 	key = key & mask;
1567 	l = fib_find_node(t, key);
1568 
1569 	if (!l)
1570 		return -ESRCH;
1571 
1572 	fa_head = get_fa_head(l, plen);
1573 	fa = fib_find_alias(fa_head, tos, 0);
1574 
1575 	if (!fa)
1576 		return -ESRCH;
1577 
1578 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1579 
1580 	fa_to_delete = NULL;
1581 	fa_head = fa->fa_list.prev;
1582 
1583 	list_for_each_entry(fa, fa_head, fa_list) {
1584 		struct fib_info *fi = fa->fa_info;
1585 
1586 		if (fa->fa_tos != tos)
1587 			break;
1588 
1589 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1590 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1591 		     fa->fa_scope == cfg->fc_scope) &&
1592 		    (!cfg->fc_protocol ||
1593 		     fi->fib_protocol == cfg->fc_protocol) &&
1594 		    fib_nh_match(cfg, fi) == 0) {
1595 			fa_to_delete = fa;
1596 			break;
1597 		}
1598 	}
1599 
1600 	if (!fa_to_delete)
1601 		return -ESRCH;
1602 
1603 	fa = fa_to_delete;
1604 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1605 		  &cfg->fc_nlinfo);
1606 
1607 	l = fib_find_node(t, key);
1608 	li = find_leaf_info(l, plen);
1609 
1610 	list_del_rcu(&fa->fa_list);
1611 
1612 	if (list_empty(fa_head)) {
1613 		hlist_del_rcu(&li->hlist);
1614 		free_leaf_info(li);
1615 	}
1616 
1617 	if (hlist_empty(&l->list))
1618 		trie_leaf_remove(t, key);
1619 
1620 	if (fa->fa_state & FA_S_ACCESSED)
1621 		rt_cache_flush(-1);
1622 
1623 	fib_release_info(fa->fa_info);
1624 	alias_free_mem_rcu(fa);
1625 	return 0;
1626 }
1627 
1628 static int trie_flush_list(struct trie *t, struct list_head *head)
1629 {
1630 	struct fib_alias *fa, *fa_node;
1631 	int found = 0;
1632 
1633 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1634 		struct fib_info *fi = fa->fa_info;
1635 
1636 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1637 			list_del_rcu(&fa->fa_list);
1638 			fib_release_info(fa->fa_info);
1639 			alias_free_mem_rcu(fa);
1640 			found++;
1641 		}
1642 	}
1643 	return found;
1644 }
1645 
1646 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1647 {
1648 	int found = 0;
1649 	struct hlist_head *lih = &l->list;
1650 	struct hlist_node *node, *tmp;
1651 	struct leaf_info *li = NULL;
1652 
1653 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1654 		found += trie_flush_list(t, &li->falh);
1655 
1656 		if (list_empty(&li->falh)) {
1657 			hlist_del_rcu(&li->hlist);
1658 			free_leaf_info(li);
1659 		}
1660 	}
1661 	return found;
1662 }
1663 
1664 /* rcu_read_lock needs to be held by the caller on the read side */
1665 
1666 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1667 {
1668 	struct node *c = (struct node *) thisleaf;
1669 	struct tnode *p;
1670 	int idx;
1671 	struct node *trie = rcu_dereference(t->trie);
1672 
1673 	if (c == NULL) {
1674 		if (trie == NULL)
1675 			return NULL;
1676 
1677 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1678 			return (struct leaf *) trie;
1679 
1680 		p = (struct tnode*) trie;  /* Start */
1681 	} else
1682 		p = (struct tnode *) NODE_PARENT(c);
1683 
1684 	while (p) {
1685 		int pos, last;
1686 
1687 		/*  Find the next child of the parent */
1688 		if (c)
1689 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1690 		else
1691 			pos = 0;
1692 
1693 		last = 1 << p->bits;
1694 		for (idx = pos; idx < last ; idx++) {
1695 			c = rcu_dereference(p->child[idx]);
1696 
1697 			if (!c)
1698 				continue;
1699 
1700 			/* Descend if tnode */
1701 			while (IS_TNODE(c)) {
1702 				p = (struct tnode *) c;
1703 				idx = 0;
1704 
1705 				/* Leftmost non-NULL branch */
1706 				if (p && IS_TNODE(p))
1707 					while (idx < (1<<p->bits) &&
1708 					       !(c = rcu_dereference(p->child[idx]))) idx++;
1709 
1710 				/* Done with this tnode? */
1711 				if (idx >= (1 << p->bits) || !c)
1712 					goto up;
1713 			}
1714 			return (struct leaf *) c;
1715 		}
1716 up:
1717 		/* No more children, go up one step */
1718 		c = (struct node *) p;
1719 		p = (struct tnode *) NODE_PARENT(p);
1720 	}
1721 	return NULL; /* Ready. Root of trie */
1722 }
1723 
1724 static int fn_trie_flush(struct fib_table *tb)
1725 {
1726 	struct trie *t = (struct trie *) tb->tb_data;
1727 	struct leaf *ll = NULL, *l = NULL;
1728 	int found = 0, h;
1729 
1730 	t->revision++;
1731 
1732 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1733 		found += trie_flush_leaf(t, l);
1734 
1735 		if (ll && hlist_empty(&ll->list))
1736 			trie_leaf_remove(t, ll->key);
1737 		ll = l;
1738 	}
1739 
1740 	if (ll && hlist_empty(&ll->list))
1741 		trie_leaf_remove(t, ll->key);
1742 
1743 	pr_debug("trie_flush found=%d\n", found);
1744 	return found;
1745 }
1746 
1747 static int trie_last_dflt = -1;
1748 
1749 static void
1750 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1751 {
1752 	struct trie *t = (struct trie *) tb->tb_data;
1753 	int order, last_idx;
1754 	struct fib_info *fi = NULL;
1755 	struct fib_info *last_resort;
1756 	struct fib_alias *fa = NULL;
1757 	struct list_head *fa_head;
1758 	struct leaf *l;
1759 
1760 	last_idx = -1;
1761 	last_resort = NULL;
1762 	order = -1;
1763 
1764 	rcu_read_lock();
1765 
1766 	l = fib_find_node(t, 0);
1767 	if (!l)
1768 		goto out;
1769 
1770 	fa_head = get_fa_head(l, 0);
1771 	if (!fa_head)
1772 		goto out;
1773 
1774 	if (list_empty(fa_head))
1775 		goto out;
1776 
1777 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1778 		struct fib_info *next_fi = fa->fa_info;
1779 
1780 		if (fa->fa_scope != res->scope ||
1781 		    fa->fa_type != RTN_UNICAST)
1782 			continue;
1783 
1784 		if (next_fi->fib_priority > res->fi->fib_priority)
1785 			break;
1786 		if (!next_fi->fib_nh[0].nh_gw ||
1787 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1788 			continue;
1789 		fa->fa_state |= FA_S_ACCESSED;
1790 
1791 		if (fi == NULL) {
1792 			if (next_fi != res->fi)
1793 				break;
1794 		} else if (!fib_detect_death(fi, order, &last_resort,
1795 					     &last_idx, &trie_last_dflt)) {
1796 			if (res->fi)
1797 				fib_info_put(res->fi);
1798 			res->fi = fi;
1799 			atomic_inc(&fi->fib_clntref);
1800 			trie_last_dflt = order;
1801 			goto out;
1802 		}
1803 		fi = next_fi;
1804 		order++;
1805 	}
1806 	if (order <= 0 || fi == NULL) {
1807 		trie_last_dflt = -1;
1808 		goto out;
1809 	}
1810 
1811 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1812 		if (res->fi)
1813 			fib_info_put(res->fi);
1814 		res->fi = fi;
1815 		atomic_inc(&fi->fib_clntref);
1816 		trie_last_dflt = order;
1817 		goto out;
1818 	}
1819 	if (last_idx >= 0) {
1820 		if (res->fi)
1821 			fib_info_put(res->fi);
1822 		res->fi = last_resort;
1823 		if (last_resort)
1824 			atomic_inc(&last_resort->fib_clntref);
1825 	}
1826 	trie_last_dflt = last_idx;
1827  out:;
1828 	rcu_read_unlock();
1829 }
1830 
1831 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1832 			   struct sk_buff *skb, struct netlink_callback *cb)
1833 {
1834 	int i, s_i;
1835 	struct fib_alias *fa;
1836 
1837 	__be32 xkey = htonl(key);
1838 
1839 	s_i = cb->args[4];
1840 	i = 0;
1841 
1842 	/* rcu_read_lock is held by the caller */
1843 
1844 	list_for_each_entry_rcu(fa, fah, fa_list) {
1845 		if (i < s_i) {
1846 			i++;
1847 			continue;
1848 		}
1849 		BUG_ON(!fa->fa_info);
1850 
1851 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1852 				  cb->nlh->nlmsg_seq,
1853 				  RTM_NEWROUTE,
1854 				  tb->tb_id,
1855 				  fa->fa_type,
1856 				  fa->fa_scope,
1857 				  xkey,
1858 				  plen,
1859 				  fa->fa_tos,
1860 				  fa->fa_info, 0) < 0) {
1861 			cb->args[4] = i;
1862 			return -1;
1863 		}
1864 		i++;
1865 	}
1866 	cb->args[4] = i;
1867 	return skb->len;
1868 }
1869 
1870 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1871 			     struct netlink_callback *cb)
1872 {
1873 	int h, s_h;
1874 	struct list_head *fa_head;
1875 	struct leaf *l = NULL;
1876 
1877 	s_h = cb->args[3];
1878 
1879 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1880 		if (h < s_h)
1881 			continue;
1882 		if (h > s_h)
1883 			memset(&cb->args[4], 0,
1884 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1885 
1886 		fa_head = get_fa_head(l, plen);
1887 
1888 		if (!fa_head)
1889 			continue;
1890 
1891 		if (list_empty(fa_head))
1892 			continue;
1893 
1894 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1895 			cb->args[3] = h;
1896 			return -1;
1897 		}
1898 	}
1899 	cb->args[3] = h;
1900 	return skb->len;
1901 }
1902 
1903 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1904 {
1905 	int m, s_m;
1906 	struct trie *t = (struct trie *) tb->tb_data;
1907 
1908 	s_m = cb->args[2];
1909 
1910 	rcu_read_lock();
1911 	for (m = 0; m <= 32; m++) {
1912 		if (m < s_m)
1913 			continue;
1914 		if (m > s_m)
1915 			memset(&cb->args[3], 0,
1916 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1917 
1918 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1919 			cb->args[2] = m;
1920 			goto out;
1921 		}
1922 	}
1923 	rcu_read_unlock();
1924 	cb->args[2] = m;
1925 	return skb->len;
1926 out:
1927 	rcu_read_unlock();
1928 	return -1;
1929 }
1930 
1931 /* Fix more generic FIB names for init later */
1932 
1933 #ifdef CONFIG_IP_MULTIPLE_TABLES
1934 struct fib_table * fib_hash_init(u32 id)
1935 #else
1936 struct fib_table * __init fib_hash_init(u32 id)
1937 #endif
1938 {
1939 	struct fib_table *tb;
1940 	struct trie *t;
1941 
1942 	if (fn_alias_kmem == NULL)
1943 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1944 						  sizeof(struct fib_alias),
1945 						  0, SLAB_HWCACHE_ALIGN,
1946 						  NULL, NULL);
1947 
1948 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1949 		     GFP_KERNEL);
1950 	if (tb == NULL)
1951 		return NULL;
1952 
1953 	tb->tb_id = id;
1954 	tb->tb_lookup = fn_trie_lookup;
1955 	tb->tb_insert = fn_trie_insert;
1956 	tb->tb_delete = fn_trie_delete;
1957 	tb->tb_flush = fn_trie_flush;
1958 	tb->tb_select_default = fn_trie_select_default;
1959 	tb->tb_dump = fn_trie_dump;
1960 	memset(tb->tb_data, 0, sizeof(struct trie));
1961 
1962 	t = (struct trie *) tb->tb_data;
1963 
1964 	trie_init(t);
1965 
1966 	if (id == RT_TABLE_LOCAL)
1967 		trie_local = t;
1968 	else if (id == RT_TABLE_MAIN)
1969 		trie_main = t;
1970 
1971 	if (id == RT_TABLE_LOCAL)
1972 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1973 
1974 	return tb;
1975 }
1976 
1977 #ifdef CONFIG_PROC_FS
1978 /* Depth first Trie walk iterator */
1979 struct fib_trie_iter {
1980 	struct tnode *tnode;
1981 	struct trie *trie;
1982 	unsigned index;
1983 	unsigned depth;
1984 };
1985 
1986 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1987 {
1988 	struct tnode *tn = iter->tnode;
1989 	unsigned cindex = iter->index;
1990 	struct tnode *p;
1991 
1992 	/* A single entry routing table */
1993 	if (!tn)
1994 		return NULL;
1995 
1996 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1997 		 iter->tnode, iter->index, iter->depth);
1998 rescan:
1999 	while (cindex < (1<<tn->bits)) {
2000 		struct node *n = tnode_get_child(tn, cindex);
2001 
2002 		if (n) {
2003 			if (IS_LEAF(n)) {
2004 				iter->tnode = tn;
2005 				iter->index = cindex + 1;
2006 			} else {
2007 				/* push down one level */
2008 				iter->tnode = (struct tnode *) n;
2009 				iter->index = 0;
2010 				++iter->depth;
2011 			}
2012 			return n;
2013 		}
2014 
2015 		++cindex;
2016 	}
2017 
2018 	/* Current node exhausted, pop back up */
2019 	p = NODE_PARENT(tn);
2020 	if (p) {
2021 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2022 		tn = p;
2023 		--iter->depth;
2024 		goto rescan;
2025 	}
2026 
2027 	/* got root? */
2028 	return NULL;
2029 }
2030 
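/* Position the iterator at the root of trie t; returns NULL for an empty trie. */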
2031 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2032 				       struct trie *t)
2033 {
2034 	struct node *n;
2035 
2036 	if (!t)
2037 		return NULL;
2038 
2039 	n = rcu_dereference(t->trie);
2040 
2041 	if (!iter)
2042 		return NULL;
2043 
2044 	if (n) {
2045 		if (IS_TNODE(n)) {
2046 			iter->tnode = (struct tnode *) n;
2047 			iter->trie = t;
2048 			iter->index = 0;
2049 			iter->depth = 1;
2050 		} else {
2051 			iter->tnode = NULL;
2052 			iter->trie  = t;
2053 			iter->index = 0;
2054 			iter->depth = 0;
2055 		}
2056 		return n;
2057 	}
2058 	return NULL;
2059 }
2060 
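/*
 * Walk the whole trie once, gathering the numbers reported in
 * /proc/net/fib_triestats: leaf count and depth, tnode count, the
 * distribution of tnode sizes and the number of NULL child slots.
 */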
2061 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2062 {
2063 	struct node *n;
2064 	struct fib_trie_iter iter;
2065 
2066 	memset(s, 0, sizeof(*s));
2067 
2068 	rcu_read_lock();
2069 	for (n = fib_trie_get_first(&iter, t); n;
2070 	     n = fib_trie_get_next(&iter)) {
2071 		if (IS_LEAF(n)) {
2072 			s->leaves++;
2073 			s->totdepth += iter.depth;
2074 			if (iter.depth > s->maxdepth)
2075 				s->maxdepth = iter.depth;
2076 		} else {
2077 			const struct tnode *tn = (const struct tnode *) n;
2078 			int i;
2079 
2080 			s->tnodes++;
2081 			if (tn->bits < MAX_STAT_DEPTH)
2082 				s->nodesizes[tn->bits]++;
2083 
2084 			for (i = 0; i < (1<<tn->bits); i++)
2085 				if (!tn->child[i])
2086 					s->nullpointers++;
2087 		}
2088 	}
2089 	rcu_read_unlock();
2090 }
2091 
2092 /*
2093  *	This outputs /proc/net/fib_triestats
2094  */
2095 static void trie_show_stats(struct seq_file *seq, struct trie *t, struct trie_stat *stat)
2096 {
2097 	unsigned i, max, pointers, bytes, avdepth;
2098 
2099 	if (stat->leaves)
2100 		avdepth = stat->totdepth*100 / stat->leaves;
2101 	else
2102 		avdepth = 0;
2103 
2104 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100);
2105 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2106 
2107 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2108 
2109 	bytes = sizeof(struct leaf) * stat->leaves;
2110 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2111 	bytes += sizeof(struct tnode) * stat->tnodes;
2112 
2113 	max = MAX_STAT_DEPTH;
2114 	while (max > 0 && stat->nodesizes[max-1] == 0)
2115 		max--;
2116 
2117 	pointers = 0;
2118 	for (i = 1; i < max; i++)
2119 		if (stat->nodesizes[i] != 0) {
2120 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2121 			pointers += (1<<i) * stat->nodesizes[i];
2122 		}
2123 	seq_putc(seq, '\n');
2124 	seq_printf(seq, "\tPointers: %d\n", pointers);
2125 
2126 	bytes += sizeof(struct node *) * pointers;
2127 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2128 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2129 
2130 #ifdef CONFIG_IP_FIB_TRIE_STATS
2131 	seq_printf(seq, "Counters:\n---------\n");
2132 	seq_printf(seq, "gets = %d\n", t->stats.gets);
2133 	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2134 	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2135 	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2136 	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
2137 	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2138 #ifdef CLEAR_STATS
2139 	memset(&(t->stats), 0, sizeof(t->stats));
2140 #endif
2141 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2142 }
2143 
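/* Collect and print the statistics for the local and main tries. */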
2144 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2145 {
2146 	struct trie_stat *stat;
2147 
2148 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2149 	if (!stat)
2150 		return -ENOMEM;
2151 
2152 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2153 		   sizeof(struct leaf), sizeof(struct tnode));
2154 
2155 	if (trie_local) {
2156 		seq_printf(seq, "Local:\n");
2157 		trie_collect_stats(trie_local, stat);
2158 		trie_show_stats(seq, trie_local, stat);
2159 	}
2160 
2161 	if (trie_main) {
2162 		seq_printf(seq, "Main:\n");
2163 		trie_collect_stats(trie_main, stat);
2164 		trie_show_stats(seq, trie_main, stat);
2165 	}
2166 	kfree(stat);
2167 
2168 	return 0;
2169 }
2170 
2171 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2172 {
2173 	return single_open(file, fib_triestat_seq_show, NULL);
2174 }
2175 
2176 static struct file_operations fib_triestat_fops = {
2177 	.owner	= THIS_MODULE,
2178 	.open	= fib_triestat_seq_open,
2179 	.read	= seq_read,
2180 	.llseek	= seq_lseek,
2181 	.release = single_release,
2182 };
2183 
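/*
 * Return the node at linear position pos of the depth-first walk,
 * scanning the local trie first and then the main trie.
 */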
2184 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2185 				      loff_t pos)
2186 {
2187 	loff_t idx = 0;
2188 	struct node *n;
2189 
2190 	for (n = fib_trie_get_first(iter, trie_local);
2191 	     n; ++idx, n = fib_trie_get_next(iter)) {
2192 		if (pos == idx)
2193 			return n;
2194 	}
2195 
2196 	for (n = fib_trie_get_first(iter, trie_main);
2197 	     n; ++idx, n = fib_trie_get_next(iter)) {
2198 		if (pos == idx)
2199 			return n;
2200 	}
2201 	return NULL;
2202 }
2203 
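/*
 * seq_file iteration over both tries: ->start takes the RCU read lock
 * and returns SEQ_START_TOKEN for position 0, ->next continues into
 * the main trie once the local trie is exhausted, ->stop unlocks.
 */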
2204 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2205 {
2206 	rcu_read_lock();
2207 	if (*pos == 0)
2208 		return SEQ_START_TOKEN;
2209 	return fib_trie_get_idx(seq->private, *pos - 1);
2210 }
2211 
2212 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2213 {
2214 	struct fib_trie_iter *iter = seq->private;
2215 	void *l = v;
2216 
2217 	++*pos;
2218 	if (v == SEQ_START_TOKEN)
2219 		return fib_trie_get_idx(iter, 0);
2220 
2221 	v = fib_trie_get_next(iter);
2222 	BUG_ON(v == l);
2223 	if (v)
2224 		return v;
2225 
2226 	/* continue scan in next trie */
2227 	if (iter->trie == trie_local)
2228 		return fib_trie_get_first(iter, trie_main);
2229 
2230 	return NULL;
2231 }
2232 
2233 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2234 {
2235 	rcu_read_unlock();
2236 }
2237 
2238 static void seq_indent(struct seq_file *seq, int n)
2239 {
2240 	while (n-- > 0) seq_puts(seq, "   ");
2241 }
2242 
2243 static inline const char *rtn_scope(enum rt_scope_t s)
2244 {
2245 	static char buf[32];
2246 
2247 	switch (s) {
2248 	case RT_SCOPE_UNIVERSE: return "universe";
2249 	case RT_SCOPE_SITE:	return "site";
2250 	case RT_SCOPE_LINK:	return "link";
2251 	case RT_SCOPE_HOST:	return "host";
2252 	case RT_SCOPE_NOWHERE:	return "nowhere";
2253 	default:
2254 		snprintf(buf, sizeof(buf), "scope=%d", s);
2255 		return buf;
2256 	}
2257 }
2258 
2259 static const char *rtn_type_names[__RTN_MAX] = {
2260 	[RTN_UNSPEC] = "UNSPEC",
2261 	[RTN_UNICAST] = "UNICAST",
2262 	[RTN_LOCAL] = "LOCAL",
2263 	[RTN_BROADCAST] = "BROADCAST",
2264 	[RTN_ANYCAST] = "ANYCAST",
2265 	[RTN_MULTICAST] = "MULTICAST",
2266 	[RTN_BLACKHOLE] = "BLACKHOLE",
2267 	[RTN_UNREACHABLE] = "UNREACHABLE",
2268 	[RTN_PROHIBIT] = "PROHIBIT",
2269 	[RTN_THROW] = "THROW",
2270 	[RTN_NAT] = "NAT",
2271 	[RTN_XRESOLVE] = "XRESOLVE",
2272 };
2273 
2274 static inline const char *rtn_type(unsigned t)
2275 {
2276 	static char buf[32];
2277 
2278 	if (t < __RTN_MAX && rtn_type_names[t])
2279 		return rtn_type_names[t];
2280 	snprintf(buf, sizeof(buf), "type %d", t);
2281 	return buf;
2282 }
2283 
2284 /* Pretty print the trie */
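/*
 * One line per node, indented by depth: internal nodes are shown as
 * "+-- <prefix>/<pos> <bits> <full_children> <empty_children>", leaves
 * as "|-- <address>" followed by one "/<plen> <scope> <type>" line per
 * fib_alias.
 */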
2285 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2286 {
2287 	const struct fib_trie_iter *iter = seq->private;
2288 	struct node *n = v;
2289 
2290 	if (v == SEQ_START_TOKEN)
2291 		return 0;
2292 
2293 	if (!NODE_PARENT(n)) {
2294 		if (iter->trie == trie_local)
2295 			seq_puts(seq, "<local>:\n");
2296 		else
2297 			seq_puts(seq, "<main>:\n");
2298 	}
2299 
2300 	if (IS_TNODE(n)) {
2301 		struct tnode *tn = (struct tnode *) n;
2302 		__be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2303 
2304 		seq_indent(seq, iter->depth-1);
2305 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2306 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2307 			   tn->empty_children);
2308 
2309 	} else {
2310 		struct leaf *l = (struct leaf *) n;
2311 		int i;
2312 		__be32 val = htonl(l->key);
2313 
2314 		seq_indent(seq, iter->depth);
2315 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2316 		for (i = 32; i >= 0; i--) {
2317 			struct leaf_info *li = find_leaf_info(l, i);
2318 			if (li) {
2319 				struct fib_alias *fa;
2320 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2321 					seq_indent(seq, iter->depth+1);
2322 					seq_printf(seq, "  /%d %s %s", i,
2323 						   rtn_scope(fa->fa_scope),
2324 						   rtn_type(fa->fa_type));
2325 					if (fa->fa_tos)
2326 						seq_printf(seq, " tos=%d",
2327 							   fa->fa_tos);
2328 					seq_putc(seq, '\n');
2329 				}
2330 			}
2331 		}
2332 	}
2333 
2334 	return 0;
2335 }
2336 
2337 static struct seq_operations fib_trie_seq_ops = {
2338 	.start  = fib_trie_seq_start,
2339 	.next   = fib_trie_seq_next,
2340 	.stop   = fib_trie_seq_stop,
2341 	.show   = fib_trie_seq_show,
2342 };
2343 
2344 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2345 {
2346 	struct seq_file *seq;
2347 	int rc = -ENOMEM;
2348 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2349 
2350 	if (!s)
2351 		goto out;
2352 
2353 	rc = seq_open(file, &fib_trie_seq_ops);
2354 	if (rc)
2355 		goto out_kfree;
2356 
2357 	seq	     = file->private_data;
2358 	seq->private = s;
2359 	memset(s, 0, sizeof(*s));
2360 out:
2361 	return rc;
2362 out_kfree:
2363 	kfree(s);
2364 	goto out;
2365 }
2366 
2367 static struct file_operations fib_trie_fops = {
2368 	.owner  = THIS_MODULE,
2369 	.open   = fib_trie_seq_open,
2370 	.read   = seq_read,
2371 	.llseek = seq_lseek,
2372 	.release = seq_release_private,
2373 };
2374 
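/*
 * Map a route onto the flag bits shown in /proc/net/route: reject
 * routes (RTN_UNREACHABLE, RTN_PROHIBIT) get RTF_REJECT, routes with a
 * gateway get RTF_GATEWAY, host routes (a /32 mask) get RTF_HOST, and
 * every entry is reported as RTF_UP.
 */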
2375 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2376 {
2377 	static unsigned type2flags[RTN_MAX + 1] = {
2378 		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2379 	};
2380 	unsigned flags = type2flags[type];
2381 
2382 	if (fi && fi->fib_nh->nh_gw)
2383 		flags |= RTF_GATEWAY;
2384 	if (mask == htonl(0xFFFFFFFF))
2385 		flags |= RTF_HOST;
2386 	flags |= RTF_UP;
2387 	return flags;
2388 }
2389 
2390 /*
2391  *	This outputs /proc/net/route.
2392  *	The format of the file is not supposed to be changed
2393  * 	and needs to be same as fib_hash output to avoid breaking
2394  *	legacy utilities
2395  */
2396 static int fib_route_seq_show(struct seq_file *seq, void *v)
2397 {
2398 	const struct fib_trie_iter *iter = seq->private;
2399 	struct leaf *l = v;
2400 	int i;
2401 	char bf[128];
2402 
2403 	if (v == SEQ_START_TOKEN) {
2404 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2405 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2406 			   "\tWindow\tIRTT");
2407 		return 0;
2408 	}
2409 
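	/*
	 * Only leaves of the main trie are reported: the local table,
	 * internal tnodes and (below) broadcast/multicast entries are
	 * skipped so the output stays compatible with fib_hash.
	 */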
2410 	if (iter->trie == trie_local)
2411 		return 0;
2412 	if (IS_TNODE(l))
2413 		return 0;
2414 
2415 	for (i = 32; i >= 0; i--) {
2416 		struct leaf_info *li = find_leaf_info(l, i);
2417 		struct fib_alias *fa;
2418 		__be32 mask, prefix;
2419 
2420 		if (!li)
2421 			continue;
2422 
2423 		mask = inet_make_mask(li->plen);
2424 		prefix = htonl(l->key);
2425 
2426 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2427 			const struct fib_info *fi = fa->fa_info;
2428 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2429 
2430 			if (fa->fa_type == RTN_BROADCAST
2431 			    || fa->fa_type == RTN_MULTICAST)
2432 				continue;
2433 
2434 			if (fi)
2435 				snprintf(bf, sizeof(bf),
2436 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2437 					 fi->fib_dev ? fi->fib_dev->name : "*",
2438 					 prefix,
2439 					 fi->fib_nh->nh_gw, flags, 0, 0,
2440 					 fi->fib_priority,
2441 					 mask,
2442 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2443 					 fi->fib_window,
2444 					 fi->fib_rtt >> 3);
2445 			else
2446 				snprintf(bf, sizeof(bf),
2447 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2448 					 prefix, 0, flags, 0, 0, 0,
2449 					 mask, 0, 0, 0);
2450 
2451 			seq_printf(seq, "%-127s\n", bf);
2452 		}
2453 	}
2454 
2455 	return 0;
2456 }
2457 
2458 static struct seq_operations fib_route_seq_ops = {
2459 	.start  = fib_trie_seq_start,
2460 	.next   = fib_trie_seq_next,
2461 	.stop   = fib_trie_seq_stop,
2462 	.show   = fib_route_seq_show,
2463 };
2464 
2465 static int fib_route_seq_open(struct inode *inode, struct file *file)
2466 {
2467 	struct seq_file *seq;
2468 	int rc = -ENOMEM;
2469 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2470 
2471 	if (!s)
2472 		goto out;
2473 
2474 	rc = seq_open(file, &fib_route_seq_ops);
2475 	if (rc)
2476 		goto out_kfree;
2477 
2478 	seq	     = file->private_data;
2479 	seq->private = s;
2480 	memset(s, 0, sizeof(*s));
2481 out:
2482 	return rc;
2483 out_kfree:
2484 	kfree(s);
2485 	goto out;
2486 }
2487 
2488 static struct file_operations fib_route_fops = {
2489 	.owner  = THIS_MODULE,
2490 	.open   = fib_route_seq_open,
2491 	.read   = seq_read,
2492 	.llseek = seq_lseek,
2493 	.release = seq_release_private,
2494 };
2495 
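/* Create the /proc/net entries: fib_trie, fib_triestat and route. */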
2496 int __init fib_proc_init(void)
2497 {
2498 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2499 		goto out1;
2500 
2501 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2502 		goto out2;
2503 
2504 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2505 		goto out3;
2506 
2507 	return 0;
2508 
2509 out3:
2510 	proc_net_remove("fib_triestat");
2511 out2:
2512 	proc_net_remove("fib_trie");
2513 out1:
2514 	return -ENOMEM;
2515 }
2516 
2517 void __init fib_proc_exit(void)
2518 {
2519 	proc_net_remove("fib_trie");
2520 	proc_net_remove("fib_triestat");
2521 	proc_net_remove("route");
2522 }
2523 
2524 #endif /* CONFIG_PROC_FS */
2525