xref: /linux/net/ipv4/fib_trie.c (revision ba6e8564f459211117ce300eae2c7fdd23befe34)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work comes from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.407"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #undef CONFIG_IP_FIB_TRIE_STATS
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
89 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
90 
91 typedef unsigned int t_key;
92 
93 #define T_TNODE 0
94 #define T_LEAF  1
95 #define NODE_TYPE_MASK	0x1UL
96 #define NODE_PARENT(node) \
97 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
98 
99 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
100 
101 #define NODE_SET_PARENT(node, ptr)		\
102 	rcu_assign_pointer((node)->parent,	\
103 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
104 
105 #define IS_TNODE(n) (!(n->parent & T_LEAF))
106 #define IS_LEAF(n) (n->parent & T_LEAF)
107 
108 struct node {
109 	t_key key;
110 	unsigned long parent;
111 };
112 
113 struct leaf {
114 	t_key key;
115 	unsigned long parent;
116 	struct hlist_head list;
117 	struct rcu_head rcu;
118 };
119 
120 struct leaf_info {
121 	struct hlist_node hlist;
122 	struct rcu_head rcu;
123 	int plen;
124 	struct list_head falh;
125 };
126 
127 struct tnode {
128 	t_key key;
129 	unsigned long parent;
130 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
131 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
132 	unsigned short full_children;	/* KEYLENGTH bits needed */
133 	unsigned short empty_children;	/* KEYLENGTH bits needed */
134 	struct rcu_head rcu;
135 	struct node *child[0];
136 };
137 
138 #ifdef CONFIG_IP_FIB_TRIE_STATS
139 struct trie_use_stats {
140 	unsigned int gets;
141 	unsigned int backtrack;
142 	unsigned int semantic_match_passed;
143 	unsigned int semantic_match_miss;
144 	unsigned int null_node_hit;
145 	unsigned int resize_node_skipped;
146 };
147 #endif
148 
149 struct trie_stat {
150 	unsigned int totdepth;
151 	unsigned int maxdepth;
152 	unsigned int tnodes;
153 	unsigned int leaves;
154 	unsigned int nullpointers;
155 	unsigned int nodesizes[MAX_STAT_DEPTH];
156 };
157 
158 struct trie {
159 	struct node *trie;
160 #ifdef CONFIG_IP_FIB_TRIE_STATS
161 	struct trie_use_stats stats;
162 #endif
163 	int size;
164 	unsigned int revision;
165 };
166 
167 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
168 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
169 static struct node *resize(struct trie *t, struct tnode *tn);
170 static struct tnode *inflate(struct trie *t, struct tnode *tn);
171 static struct tnode *halve(struct trie *t, struct tnode *tn);
172 static void tnode_free(struct tnode *tn);
173 
174 static struct kmem_cache *fn_alias_kmem __read_mostly;
175 static struct trie *trie_local = NULL, *trie_main = NULL;
176 
177 
178 /* rcu_read_lock needs to be hold by caller from readside */
179 
180 static inline struct node *tnode_get_child(struct tnode *tn, int i)
181 {
182 	BUG_ON(i >= 1 << tn->bits);
183 
184 	return rcu_dereference(tn->child[i]);
185 }
186 
187 static inline int tnode_child_length(const struct tnode *tn)
188 {
189 	return 1 << tn->bits;
190 }
191 
192 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
193 {
194 	if (offset < KEYLENGTH)
195 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
196 	else
197 		return 0;
198 }
199 
200 static inline int tkey_equals(t_key a, t_key b)
201 {
202 	return a == b;
203 }
204 
205 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
206 {
207 	if (bits == 0 || offset >= KEYLENGTH)
208 		return 1;
209 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
210 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
211 }
212 
213 static inline int tkey_mismatch(t_key a, int offset, t_key b)
214 {
215 	t_key diff = a ^ b;
216 	int i = offset;
217 
218 	if (!diff)
219 		return 0;
220 	while ((diff << i) >> (KEYLENGTH-1) == 0)
221 		i++;
222 	return i;
223 }
224 
225 /*
226   To understand this stuff, an understanding of keys and all their bits is
227   necessary. Every node in the trie has a key associated with it, but not
228   all of the bits in that key are significant.
229 
230   Consider a node 'n' and its parent 'tp'.
231 
232   If n is a leaf, every bit in its key is significant. Its presence is
233   necessitated by path compression, since during a tree traversal (when
234   searching for a leaf - unless we are doing an insertion) we will completely
235   ignore all skipped bits we encounter. Thus we need to verify, at the end of
236   a potentially successful search, that we have indeed been walking the
237   correct key path.
238 
239   Note that we can never "miss" the correct key in the tree if present by
240   following the wrong path. Path compression ensures that segments of the key
241   that are the same for all keys with a given prefix are skipped, but the
242   skipped part *is* identical for each node in the subtrie below the skipped
243   bit! trie_insert() in this implementation takes care of that - note the
244   call to tkey_sub_equals() in trie_insert().
245 
246   if n is an internal node - a 'tnode' here, the various parts of its key
247   have many different meanings.
248 
249   Example:
250   _________________________________________________________________
251   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252   -----------------------------------------------------------------
253     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
254 
255   _________________________________________________________________
256   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257   -----------------------------------------------------------------
258    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
259 
260   tp->pos = 7
261   tp->bits = 3
262   n->pos = 15
263   n->bits = 4
264 
265   First, let's just ignore the bits that come before the parent tp, that is
266   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267   not use them for anything.
268 
269   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270   index into the parent's child array. That is, they will be used to find
271   'n' among tp's children.
272 
273   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274   for the node n.
275 
276   All the bits we have seen so far are significant to the node n. The rest
277   of the bits are really not needed or indeed known in n->key.
278 
279   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280   n's child array, and will of course be different for each child.
281 
282 
283   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284   at this point.
285 
286 */
287 
288 static inline void check_tnode(const struct tnode *tn)
289 {
290 	WARN_ON(tn && tn->pos+tn->bits > 32);
291 }
292 
293 static int halve_threshold = 25;
294 static int inflate_threshold = 50;
295 static int halve_threshold_root = 15;
296 static int inflate_threshold_root = 25;
297 
298 
299 static void __alias_free_mem(struct rcu_head *head)
300 {
301 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
302 	kmem_cache_free(fn_alias_kmem, fa);
303 }
304 
305 static inline void alias_free_mem_rcu(struct fib_alias *fa)
306 {
307 	call_rcu(&fa->rcu, __alias_free_mem);
308 }
309 
310 static void __leaf_free_rcu(struct rcu_head *head)
311 {
312 	kfree(container_of(head, struct leaf, rcu));
313 }
314 
315 static void __leaf_info_free_rcu(struct rcu_head *head)
316 {
317 	kfree(container_of(head, struct leaf_info, rcu));
318 }
319 
320 static inline void free_leaf_info(struct leaf_info *leaf)
321 {
322 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
323 }
324 
325 static struct tnode *tnode_alloc(unsigned int size)
326 {
327 	struct page *pages;
328 
329 	if (size <= PAGE_SIZE)
330 		return kcalloc(size, 1, GFP_KERNEL);
331 
332 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
333 	if (!pages)
334 		return NULL;
335 
336 	return page_address(pages);
337 }
338 
339 static void __tnode_free_rcu(struct rcu_head *head)
340 {
341 	struct tnode *tn = container_of(head, struct tnode, rcu);
342 	unsigned int size = sizeof(struct tnode) +
343 		(1 << tn->bits) * sizeof(struct node *);
344 
345 	if (size <= PAGE_SIZE)
346 		kfree(tn);
347 	else
348 		free_pages((unsigned long)tn, get_order(size));
349 }
350 
351 static inline void tnode_free(struct tnode *tn)
352 {
353 	if(IS_LEAF(tn)) {
354 		struct leaf *l = (struct leaf *) tn;
355 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
356 	}
357 	else
358 		call_rcu(&tn->rcu, __tnode_free_rcu);
359 }
360 
361 static struct leaf *leaf_new(void)
362 {
363 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
364 	if (l) {
365 		l->parent = T_LEAF;
366 		INIT_HLIST_HEAD(&l->list);
367 	}
368 	return l;
369 }
370 
371 static struct leaf_info *leaf_info_new(int plen)
372 {
373 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
374 	if (li) {
375 		li->plen = plen;
376 		INIT_LIST_HEAD(&li->falh);
377 	}
378 	return li;
379 }
380 
381 static struct tnode* tnode_new(t_key key, int pos, int bits)
382 {
383 	int nchildren = 1<<bits;
384 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
385 	struct tnode *tn = tnode_alloc(sz);
386 
387 	if (tn) {
388 		memset(tn, 0, sz);
389 		tn->parent = T_TNODE;
390 		tn->pos = pos;
391 		tn->bits = bits;
392 		tn->key = key;
393 		tn->full_children = 0;
394 		tn->empty_children = 1<<bits;
395 	}
396 
397 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
398 		 (unsigned int) (sizeof(struct node) * 1<<bits));
399 	return tn;
400 }
401 
402 /*
403  * Check whether a tnode 'n' is "full", i.e. it is an internal node
404  * and no bits are skipped. See discussion in dyntree paper p. 6
405  */
406 
407 static inline int tnode_full(const struct tnode *tn, const struct node *n)
408 {
409 	if (n == NULL || IS_LEAF(n))
410 		return 0;
411 
412 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
413 }
414 
415 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
416 {
417 	tnode_put_child_reorg(tn, i, n, -1);
418 }
419 
420  /*
421   * Add a child at position i overwriting the old value.
422   * Update the value of full_children and empty_children.
423   */
424 
425 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
426 {
427 	struct node *chi = tn->child[i];
428 	int isfull;
429 
430 	BUG_ON(i >= 1<<tn->bits);
431 
432 
433 	/* update emptyChildren */
434 	if (n == NULL && chi != NULL)
435 		tn->empty_children++;
436 	else if (n != NULL && chi == NULL)
437 		tn->empty_children--;
438 
439 	/* update fullChildren */
440 	if (wasfull == -1)
441 		wasfull = tnode_full(tn, chi);
442 
443 	isfull = tnode_full(tn, n);
444 	if (wasfull && !isfull)
445 		tn->full_children--;
446 	else if (!wasfull && isfull)
447 		tn->full_children++;
448 
449 	if (n)
450 		NODE_SET_PARENT(n, tn);
451 
452 	rcu_assign_pointer(tn->child[i], n);
453 }
454 
455 static struct node *resize(struct trie *t, struct tnode *tn)
456 {
457 	int i;
458 	int err = 0;
459 	struct tnode *old_tn;
460 	int inflate_threshold_use;
461 	int halve_threshold_use;
462 
463 	if (!tn)
464 		return NULL;
465 
466 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 		 tn, inflate_threshold, halve_threshold);
468 
469 	/* No children */
470 	if (tn->empty_children == tnode_child_length(tn)) {
471 		tnode_free(tn);
472 		return NULL;
473 	}
474 	/* One child */
475 	if (tn->empty_children == tnode_child_length(tn) - 1)
476 		for (i = 0; i < tnode_child_length(tn); i++) {
477 			struct node *n;
478 
479 			n = tn->child[i];
480 			if (!n)
481 				continue;
482 
483 			/* compress one level */
484 			NODE_SET_PARENT(n, NULL);
485 			tnode_free(tn);
486 			return n;
487 		}
488 	/*
489 	 * Double as long as the resulting node has a number of
490 	 * nonempty nodes that are above the threshold.
491 	 */
492 
493 	/*
494 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
495 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
496 	 * Telecommunications, page 6:
497 	 * "A node is doubled if the ratio of non-empty children to all
498 	 * children in the *doubled* node is at least 'high'."
499 	 *
500 	 * 'high' in this instance is the variable 'inflate_threshold'. It
501 	 * is expressed as a percentage, so we multiply it with
502 	 * tnode_child_length() and instead of multiplying by 2 (since the
503 	 * child array will be doubled by inflate()) and multiplying
504 	 * the left-hand side by 100 (to handle the percentage thing) we
505 	 * multiply the left-hand side by 50.
506 	 *
507 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
508 	 * - tn->empty_children is of course the number of non-null children
509 	 * in the current node. tn->full_children is the number of "full"
510 	 * children, that is non-null tnodes with a skip value of 0.
511 	 * All of those will be doubled in the resulting inflated tnode, so
512 	 * we just count them one extra time here.
513 	 *
514 	 * A clearer way to write this would be:
515 	 *
516 	 * to_be_doubled = tn->full_children;
517 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
518 	 *     tn->full_children;
519 	 *
520 	 * new_child_length = tnode_child_length(tn) * 2;
521 	 *
522 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
523 	 *      new_child_length;
524 	 * if (new_fill_factor >= inflate_threshold)
525 	 *
526 	 * ...and so on, tho it would mess up the while () loop.
527 	 *
528 	 * anyway,
529 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
530 	 *      inflate_threshold
531 	 *
532 	 * avoid a division:
533 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
534 	 *      inflate_threshold * new_child_length
535 	 *
536 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
537 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
538 	 *    tn->full_children) >= inflate_threshold * new_child_length
539 	 *
540 	 * expand new_child_length:
541 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 	 *    tn->full_children) >=
543 	 *      inflate_threshold * tnode_child_length(tn) * 2
544 	 *
545 	 * shorten again:
546 	 * 50 * (tn->full_children + tnode_child_length(tn) -
547 	 *    tn->empty_children) >= inflate_threshold *
548 	 *    tnode_child_length(tn)
549 	 *
550 	 */
551 
552 	check_tnode(tn);
553 
554 	/* Keep root node larger  */
555 
556 	if(!tn->parent)
557 		inflate_threshold_use = inflate_threshold_root;
558 	else
559 		inflate_threshold_use = inflate_threshold;
560 
561 	err = 0;
562 	while ((tn->full_children > 0 &&
563 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
564 				inflate_threshold_use * tnode_child_length(tn))) {
565 
566 		old_tn = tn;
567 		tn = inflate(t, tn);
568 		if (IS_ERR(tn)) {
569 			tn = old_tn;
570 #ifdef CONFIG_IP_FIB_TRIE_STATS
571 			t->stats.resize_node_skipped++;
572 #endif
573 			break;
574 		}
575 	}
576 
577 	check_tnode(tn);
578 
579 	/*
580 	 * Halve as long as the number of empty children in this
581 	 * node is above threshold.
582 	 */
583 
584 
585 	/* Keep root node larger  */
586 
587 	if(!tn->parent)
588 		halve_threshold_use = halve_threshold_root;
589 	else
590 		halve_threshold_use = halve_threshold;
591 
592 	err = 0;
593 	while (tn->bits > 1 &&
594 	       100 * (tnode_child_length(tn) - tn->empty_children) <
595 	       halve_threshold_use * tnode_child_length(tn)) {
596 
597 		old_tn = tn;
598 		tn = halve(t, tn);
599 		if (IS_ERR(tn)) {
600 			tn = old_tn;
601 #ifdef CONFIG_IP_FIB_TRIE_STATS
602 			t->stats.resize_node_skipped++;
603 #endif
604 			break;
605 		}
606 	}
607 
608 
609 	/* Only one child remains */
610 	if (tn->empty_children == tnode_child_length(tn) - 1)
611 		for (i = 0; i < tnode_child_length(tn); i++) {
612 			struct node *n;
613 
614 			n = tn->child[i];
615 			if (!n)
616 				continue;
617 
618 			/* compress one level */
619 
620 			NODE_SET_PARENT(n, NULL);
621 			tnode_free(tn);
622 			return n;
623 		}
624 
625 	return (struct node *) tn;
626 }
627 
628 static struct tnode *inflate(struct trie *t, struct tnode *tn)
629 {
630 	struct tnode *inode;
631 	struct tnode *oldtnode = tn;
632 	int olen = tnode_child_length(tn);
633 	int i;
634 
635 	pr_debug("In inflate\n");
636 
637 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
638 
639 	if (!tn)
640 		return ERR_PTR(-ENOMEM);
641 
642 	/*
643 	 * Preallocate and store tnodes before the actual work so we
644 	 * don't get into an inconsistent state if memory allocation
645 	 * fails. In case of failure we return the oldnode and  inflate
646 	 * of tnode is ignored.
647 	 */
648 
649 	for (i = 0; i < olen; i++) {
650 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
651 
652 		if (inode &&
653 		    IS_TNODE(inode) &&
654 		    inode->pos == oldtnode->pos + oldtnode->bits &&
655 		    inode->bits > 1) {
656 			struct tnode *left, *right;
657 			t_key m = TKEY_GET_MASK(inode->pos, 1);
658 
659 			left = tnode_new(inode->key&(~m), inode->pos + 1,
660 					 inode->bits - 1);
661 			if (!left)
662 				goto nomem;
663 
664 			right = tnode_new(inode->key|m, inode->pos + 1,
665 					  inode->bits - 1);
666 
667 			if (!right) {
668 				tnode_free(left);
669 				goto nomem;
670 			}
671 
672 			put_child(t, tn, 2*i, (struct node *) left);
673 			put_child(t, tn, 2*i+1, (struct node *) right);
674 		}
675 	}
676 
677 	for (i = 0; i < olen; i++) {
678 		struct node *node = tnode_get_child(oldtnode, i);
679 		struct tnode *left, *right;
680 		int size, j;
681 
682 		/* An empty child */
683 		if (node == NULL)
684 			continue;
685 
686 		/* A leaf or an internal node with skipped bits */
687 
688 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
689 		   tn->pos + tn->bits - 1) {
690 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
691 					     1) == 0)
692 				put_child(t, tn, 2*i, node);
693 			else
694 				put_child(t, tn, 2*i+1, node);
695 			continue;
696 		}
697 
698 		/* An internal node with two children */
699 		inode = (struct tnode *) node;
700 
701 		if (inode->bits == 1) {
702 			put_child(t, tn, 2*i, inode->child[0]);
703 			put_child(t, tn, 2*i+1, inode->child[1]);
704 
705 			tnode_free(inode);
706 			continue;
707 		}
708 
709 		/* An internal node with more than two children */
710 
711 		/* We will replace this node 'inode' with two new
712 		 * ones, 'left' and 'right', each with half of the
713 		 * original children. The two new nodes will have
714 		 * a position one bit further down the key and this
715 		 * means that the "significant" part of their keys
716 		 * (see the discussion near the top of this file)
717 		 * will differ by one bit, which will be "0" in
718 		 * left's key and "1" in right's key. Since we are
719 		 * moving the key position by one step, the bit that
720 		 * we are moving away from - the bit at position
721 		 * (inode->pos) - is the one that will differ between
722 		 * left and right. So... we synthesize that bit in the
723 		 * two  new keys.
724 		 * The mask 'm' below will be a single "one" bit at
725 		 * the position (inode->pos)
726 		 */
727 
728 		/* Use the old key, but set the new significant
729 		 *   bit to zero.
730 		 */
731 
732 		left = (struct tnode *) tnode_get_child(tn, 2*i);
733 		put_child(t, tn, 2*i, NULL);
734 
735 		BUG_ON(!left);
736 
737 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
738 		put_child(t, tn, 2*i+1, NULL);
739 
740 		BUG_ON(!right);
741 
742 		size = tnode_child_length(left);
743 		for (j = 0; j < size; j++) {
744 			put_child(t, left, j, inode->child[j]);
745 			put_child(t, right, j, inode->child[j + size]);
746 		}
747 		put_child(t, tn, 2*i, resize(t, left));
748 		put_child(t, tn, 2*i+1, resize(t, right));
749 
750 		tnode_free(inode);
751 	}
752 	tnode_free(oldtnode);
753 	return tn;
754 nomem:
755 	{
756 		int size = tnode_child_length(tn);
757 		int j;
758 
759 		for (j = 0; j < size; j++)
760 			if (tn->child[j])
761 				tnode_free((struct tnode *)tn->child[j]);
762 
763 		tnode_free(tn);
764 
765 		return ERR_PTR(-ENOMEM);
766 	}
767 }
768 
769 static struct tnode *halve(struct trie *t, struct tnode *tn)
770 {
771 	struct tnode *oldtnode = tn;
772 	struct node *left, *right;
773 	int i;
774 	int olen = tnode_child_length(tn);
775 
776 	pr_debug("In halve\n");
777 
778 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
779 
780 	if (!tn)
781 		return ERR_PTR(-ENOMEM);
782 
783 	/*
784 	 * Preallocate and store tnodes before the actual work so we
785 	 * don't get into an inconsistent state if memory allocation
786 	 * fails. In case of failure we return the oldnode and halve
787 	 * of tnode is ignored.
788 	 */
789 
790 	for (i = 0; i < olen; i += 2) {
791 		left = tnode_get_child(oldtnode, i);
792 		right = tnode_get_child(oldtnode, i+1);
793 
794 		/* Two nonempty children */
795 		if (left && right) {
796 			struct tnode *newn;
797 
798 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
799 
800 			if (!newn)
801 				goto nomem;
802 
803 			put_child(t, tn, i/2, (struct node *)newn);
804 		}
805 
806 	}
807 
808 	for (i = 0; i < olen; i += 2) {
809 		struct tnode *newBinNode;
810 
811 		left = tnode_get_child(oldtnode, i);
812 		right = tnode_get_child(oldtnode, i+1);
813 
814 		/* At least one of the children is empty */
815 		if (left == NULL) {
816 			if (right == NULL)    /* Both are empty */
817 				continue;
818 			put_child(t, tn, i/2, right);
819 			continue;
820 		}
821 
822 		if (right == NULL) {
823 			put_child(t, tn, i/2, left);
824 			continue;
825 		}
826 
827 		/* Two nonempty children */
828 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
829 		put_child(t, tn, i/2, NULL);
830 		put_child(t, newBinNode, 0, left);
831 		put_child(t, newBinNode, 1, right);
832 		put_child(t, tn, i/2, resize(t, newBinNode));
833 	}
834 	tnode_free(oldtnode);
835 	return tn;
836 nomem:
837 	{
838 		int size = tnode_child_length(tn);
839 		int j;
840 
841 		for (j = 0; j < size; j++)
842 			if (tn->child[j])
843 				tnode_free((struct tnode *)tn->child[j]);
844 
845 		tnode_free(tn);
846 
847 		return ERR_PTR(-ENOMEM);
848 	}
849 }
850 
851 static void trie_init(struct trie *t)
852 {
853 	if (!t)
854 		return;
855 
856 	t->size = 0;
857 	rcu_assign_pointer(t->trie, NULL);
858 	t->revision = 0;
859 #ifdef CONFIG_IP_FIB_TRIE_STATS
860 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
861 #endif
862 }
863 
864 /* readside must use rcu_read_lock currently dump routines
865  via get_fa_head and dump */
866 
867 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
868 {
869 	struct hlist_head *head = &l->list;
870 	struct hlist_node *node;
871 	struct leaf_info *li;
872 
873 	hlist_for_each_entry_rcu(li, node, head, hlist)
874 		if (li->plen == plen)
875 			return li;
876 
877 	return NULL;
878 }
879 
880 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
881 {
882 	struct leaf_info *li = find_leaf_info(l, plen);
883 
884 	if (!li)
885 		return NULL;
886 
887 	return &li->falh;
888 }
889 
890 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
891 {
892 	struct leaf_info *li = NULL, *last = NULL;
893 	struct hlist_node *node;
894 
895 	if (hlist_empty(head)) {
896 		hlist_add_head_rcu(&new->hlist, head);
897 	} else {
898 		hlist_for_each_entry(li, node, head, hlist) {
899 			if (new->plen > li->plen)
900 				break;
901 
902 			last = li;
903 		}
904 		if (last)
905 			hlist_add_after_rcu(&last->hlist, &new->hlist);
906 		else
907 			hlist_add_before_rcu(&new->hlist, &li->hlist);
908 	}
909 }
910 
911 /* rcu_read_lock needs to be hold by caller from readside */
912 
913 static struct leaf *
914 fib_find_node(struct trie *t, u32 key)
915 {
916 	int pos;
917 	struct tnode *tn;
918 	struct node *n;
919 
920 	pos = 0;
921 	n = rcu_dereference(t->trie);
922 
923 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
924 		tn = (struct tnode *) n;
925 
926 		check_tnode(tn);
927 
928 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
929 			pos = tn->pos + tn->bits;
930 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
931 		} else
932 			break;
933 	}
934 	/* Case we have found a leaf. Compare prefixes */
935 
936 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
937 		return (struct leaf *)n;
938 
939 	return NULL;
940 }
941 
942 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
943 {
944 	int wasfull;
945 	t_key cindex, key;
946 	struct tnode *tp = NULL;
947 
948 	key = tn->key;
949 
950 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
951 
952 		tp = NODE_PARENT(tn);
953 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
954 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
955 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
956 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
957 
958 		if (!NODE_PARENT(tn))
959 			break;
960 
961 		tn = NODE_PARENT(tn);
962 	}
963 	/* Handle last (top) tnode */
964 	if (IS_TNODE(tn))
965 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
966 
967 	return (struct node*) tn;
968 }
969 
970 /* only used from updater-side */
971 
972 static  struct list_head *
973 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
974 {
975 	int pos, newpos;
976 	struct tnode *tp = NULL, *tn = NULL;
977 	struct node *n;
978 	struct leaf *l;
979 	int missbit;
980 	struct list_head *fa_head = NULL;
981 	struct leaf_info *li;
982 	t_key cindex;
983 
984 	pos = 0;
985 	n = t->trie;
986 
987 	/* If we point to NULL, stop. Either the tree is empty and we should
988 	 * just put a new leaf in if, or we have reached an empty child slot,
989 	 * and we should just put our new leaf in that.
990 	 * If we point to a T_TNODE, check if it matches our key. Note that
991 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
992 	 * not be the parent's 'pos'+'bits'!
993 	 *
994 	 * If it does match the current key, get pos/bits from it, extract
995 	 * the index from our key, push the T_TNODE and walk the tree.
996 	 *
997 	 * If it doesn't, we have to replace it with a new T_TNODE.
998 	 *
999 	 * If we point to a T_LEAF, it might or might not have the same key
1000 	 * as we do. If it does, just change the value, update the T_LEAF's
1001 	 * value, and return it.
1002 	 * If it doesn't, we need to replace it with a T_TNODE.
1003 	 */
1004 
1005 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1006 		tn = (struct tnode *) n;
1007 
1008 		check_tnode(tn);
1009 
1010 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1011 			tp = tn;
1012 			pos = tn->pos + tn->bits;
1013 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1014 
1015 			BUG_ON(n && NODE_PARENT(n) != tn);
1016 		} else
1017 			break;
1018 	}
1019 
1020 	/*
1021 	 * n  ----> NULL, LEAF or TNODE
1022 	 *
1023 	 * tp is n's (parent) ----> NULL or TNODE
1024 	 */
1025 
1026 	BUG_ON(tp && IS_LEAF(tp));
1027 
1028 	/* Case 1: n is a leaf. Compare prefixes */
1029 
1030 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1031 		struct leaf *l = (struct leaf *) n;
1032 
1033 		li = leaf_info_new(plen);
1034 
1035 		if (!li) {
1036 			*err = -ENOMEM;
1037 			goto err;
1038 		}
1039 
1040 		fa_head = &li->falh;
1041 		insert_leaf_info(&l->list, li);
1042 		goto done;
1043 	}
1044 	t->size++;
1045 	l = leaf_new();
1046 
1047 	if (!l) {
1048 		*err = -ENOMEM;
1049 		goto err;
1050 	}
1051 
1052 	l->key = key;
1053 	li = leaf_info_new(plen);
1054 
1055 	if (!li) {
1056 		tnode_free((struct tnode *) l);
1057 		*err = -ENOMEM;
1058 		goto err;
1059 	}
1060 
1061 	fa_head = &li->falh;
1062 	insert_leaf_info(&l->list, li);
1063 
1064 	if (t->trie && n == NULL) {
1065 		/* Case 2: n is NULL, and will just insert a new leaf */
1066 
1067 		NODE_SET_PARENT(l, tp);
1068 
1069 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1070 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1071 	} else {
1072 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1073 		/*
1074 		 *  Add a new tnode here
1075 		 *  first tnode need some special handling
1076 		 */
1077 
1078 		if (tp)
1079 			pos = tp->pos+tp->bits;
1080 		else
1081 			pos = 0;
1082 
1083 		if (n) {
1084 			newpos = tkey_mismatch(key, pos, n->key);
1085 			tn = tnode_new(n->key, newpos, 1);
1086 		} else {
1087 			newpos = 0;
1088 			tn = tnode_new(key, newpos, 1); /* First tnode */
1089 		}
1090 
1091 		if (!tn) {
1092 			free_leaf_info(li);
1093 			tnode_free((struct tnode *) l);
1094 			*err = -ENOMEM;
1095 			goto err;
1096 		}
1097 
1098 		NODE_SET_PARENT(tn, tp);
1099 
1100 		missbit = tkey_extract_bits(key, newpos, 1);
1101 		put_child(t, tn, missbit, (struct node *)l);
1102 		put_child(t, tn, 1-missbit, n);
1103 
1104 		if (tp) {
1105 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1106 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1107 		} else {
1108 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1109 			tp = tn;
1110 		}
1111 	}
1112 
1113 	if (tp && tp->pos + tp->bits > 32)
1114 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1115 		       tp, tp->pos, tp->bits, key, plen);
1116 
1117 	/* Rebalance the trie */
1118 
1119 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1120 done:
1121 	t->revision++;
1122 err:
1123 	return fa_head;
1124 }
1125 
1126 /*
1127  * Caller must hold RTNL.
1128  */
1129 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1130 {
1131 	struct trie *t = (struct trie *) tb->tb_data;
1132 	struct fib_alias *fa, *new_fa;
1133 	struct list_head *fa_head = NULL;
1134 	struct fib_info *fi;
1135 	int plen = cfg->fc_dst_len;
1136 	u8 tos = cfg->fc_tos;
1137 	u32 key, mask;
1138 	int err;
1139 	struct leaf *l;
1140 
1141 	if (plen > 32)
1142 		return -EINVAL;
1143 
1144 	key = ntohl(cfg->fc_dst);
1145 
1146 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1147 
1148 	mask = ntohl(inet_make_mask(plen));
1149 
1150 	if (key & ~mask)
1151 		return -EINVAL;
1152 
1153 	key = key & mask;
1154 
1155 	fi = fib_create_info(cfg);
1156 	if (IS_ERR(fi)) {
1157 		err = PTR_ERR(fi);
1158 		goto err;
1159 	}
1160 
1161 	l = fib_find_node(t, key);
1162 	fa = NULL;
1163 
1164 	if (l) {
1165 		fa_head = get_fa_head(l, plen);
1166 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1167 	}
1168 
1169 	/* Now fa, if non-NULL, points to the first fib alias
1170 	 * with the same keys [prefix,tos,priority], if such key already
1171 	 * exists or to the node before which we will insert new one.
1172 	 *
1173 	 * If fa is NULL, we will need to allocate a new one and
1174 	 * insert to the head of f.
1175 	 *
1176 	 * If f is NULL, no fib node matched the destination key
1177 	 * and we need to allocate a new one of those as well.
1178 	 */
1179 
1180 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1181 		struct fib_alias *fa_orig;
1182 
1183 		err = -EEXIST;
1184 		if (cfg->fc_nlflags & NLM_F_EXCL)
1185 			goto out;
1186 
1187 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1188 			struct fib_info *fi_drop;
1189 			u8 state;
1190 
1191 			err = -ENOBUFS;
1192 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1193 			if (new_fa == NULL)
1194 				goto out;
1195 
1196 			fi_drop = fa->fa_info;
1197 			new_fa->fa_tos = fa->fa_tos;
1198 			new_fa->fa_info = fi;
1199 			new_fa->fa_type = cfg->fc_type;
1200 			new_fa->fa_scope = cfg->fc_scope;
1201 			state = fa->fa_state;
1202 			new_fa->fa_state &= ~FA_S_ACCESSED;
1203 
1204 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1205 			alias_free_mem_rcu(fa);
1206 
1207 			fib_release_info(fi_drop);
1208 			if (state & FA_S_ACCESSED)
1209 				rt_cache_flush(-1);
1210 
1211 			goto succeeded;
1212 		}
1213 		/* Error if we find a perfect match which
1214 		 * uses the same scope, type, and nexthop
1215 		 * information.
1216 		 */
1217 		fa_orig = fa;
1218 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1219 			if (fa->fa_tos != tos)
1220 				break;
1221 			if (fa->fa_info->fib_priority != fi->fib_priority)
1222 				break;
1223 			if (fa->fa_type == cfg->fc_type &&
1224 			    fa->fa_scope == cfg->fc_scope &&
1225 			    fa->fa_info == fi) {
1226 				goto out;
1227 			}
1228 		}
1229 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1230 			fa = fa_orig;
1231 	}
1232 	err = -ENOENT;
1233 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1234 		goto out;
1235 
1236 	err = -ENOBUFS;
1237 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1238 	if (new_fa == NULL)
1239 		goto out;
1240 
1241 	new_fa->fa_info = fi;
1242 	new_fa->fa_tos = tos;
1243 	new_fa->fa_type = cfg->fc_type;
1244 	new_fa->fa_scope = cfg->fc_scope;
1245 	new_fa->fa_state = 0;
1246 	/*
1247 	 * Insert new entry to the list.
1248 	 */
1249 
1250 	if (!fa_head) {
1251 		err = 0;
1252 		fa_head = fib_insert_node(t, &err, key, plen);
1253 		if (err)
1254 			goto out_free_new_fa;
1255 	}
1256 
1257 	list_add_tail_rcu(&new_fa->fa_list,
1258 			  (fa ? &fa->fa_list : fa_head));
1259 
1260 	rt_cache_flush(-1);
1261 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1262 		  &cfg->fc_nlinfo);
1263 succeeded:
1264 	return 0;
1265 
1266 out_free_new_fa:
1267 	kmem_cache_free(fn_alias_kmem, new_fa);
1268 out:
1269 	fib_release_info(fi);
1270 err:
1271 	return err;
1272 }
1273 
1274 
1275 /* should be called with rcu_read_lock */
1276 static inline int check_leaf(struct trie *t, struct leaf *l,
1277 			     t_key key, int *plen, const struct flowi *flp,
1278 			     struct fib_result *res)
1279 {
1280 	int err, i;
1281 	__be32 mask;
1282 	struct leaf_info *li;
1283 	struct hlist_head *hhead = &l->list;
1284 	struct hlist_node *node;
1285 
1286 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1287 		i = li->plen;
1288 		mask = inet_make_mask(i);
1289 		if (l->key != (key & ntohl(mask)))
1290 			continue;
1291 
1292 		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1293 			*plen = i;
1294 #ifdef CONFIG_IP_FIB_TRIE_STATS
1295 			t->stats.semantic_match_passed++;
1296 #endif
1297 			return err;
1298 		}
1299 #ifdef CONFIG_IP_FIB_TRIE_STATS
1300 		t->stats.semantic_match_miss++;
1301 #endif
1302 	}
1303 	return 1;
1304 }
1305 
1306 static int
1307 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1308 {
1309 	struct trie *t = (struct trie *) tb->tb_data;
1310 	int plen, ret = 0;
1311 	struct node *n;
1312 	struct tnode *pn;
1313 	int pos, bits;
1314 	t_key key = ntohl(flp->fl4_dst);
1315 	int chopped_off;
1316 	t_key cindex = 0;
1317 	int current_prefix_length = KEYLENGTH;
1318 	struct tnode *cn;
1319 	t_key node_prefix, key_prefix, pref_mismatch;
1320 	int mp;
1321 
1322 	rcu_read_lock();
1323 
1324 	n = rcu_dereference(t->trie);
1325 	if (!n)
1326 		goto failed;
1327 
1328 #ifdef CONFIG_IP_FIB_TRIE_STATS
1329 	t->stats.gets++;
1330 #endif
1331 
1332 	/* Just a leaf? */
1333 	if (IS_LEAF(n)) {
1334 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1335 			goto found;
1336 		goto failed;
1337 	}
1338 	pn = (struct tnode *) n;
1339 	chopped_off = 0;
1340 
1341 	while (pn) {
1342 		pos = pn->pos;
1343 		bits = pn->bits;
1344 
1345 		if (!chopped_off)
1346 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1347 
1348 		n = tnode_get_child(pn, cindex);
1349 
1350 		if (n == NULL) {
1351 #ifdef CONFIG_IP_FIB_TRIE_STATS
1352 			t->stats.null_node_hit++;
1353 #endif
1354 			goto backtrace;
1355 		}
1356 
1357 		if (IS_LEAF(n)) {
1358 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1359 				goto found;
1360 			else
1361 				goto backtrace;
1362 		}
1363 
1364 #define HL_OPTIMIZE
1365 #ifdef HL_OPTIMIZE
1366 		cn = (struct tnode *)n;
1367 
1368 		/*
1369 		 * It's a tnode, and we can do some extra checks here if we
1370 		 * like, to avoid descending into a dead-end branch.
1371 		 * This tnode is in the parent's child array at index
1372 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1373 		 * chopped off, so in reality the index may be just a
1374 		 * subprefix, padded with zero at the end.
1375 		 * We can also take a look at any skipped bits in this
1376 		 * tnode - everything up to p_pos is supposed to be ok,
1377 		 * and the non-chopped bits of the index (se previous
1378 		 * paragraph) are also guaranteed ok, but the rest is
1379 		 * considered unknown.
1380 		 *
1381 		 * The skipped bits are key[pos+bits..cn->pos].
1382 		 */
1383 
1384 		/* If current_prefix_length < pos+bits, we are already doing
1385 		 * actual prefix  matching, which means everything from
1386 		 * pos+(bits-chopped_off) onward must be zero along some
1387 		 * branch of this subtree - otherwise there is *no* valid
1388 		 * prefix present. Here we can only check the skipped
1389 		 * bits. Remember, since we have already indexed into the
1390 		 * parent's child array, we know that the bits we chopped of
1391 		 * *are* zero.
1392 		 */
1393 
1394 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1395 
1396 		if (current_prefix_length < pos+bits) {
1397 			if (tkey_extract_bits(cn->key, current_prefix_length,
1398 						cn->pos - current_prefix_length) != 0 ||
1399 			    !(cn->child[0]))
1400 				goto backtrace;
1401 		}
1402 
1403 		/*
1404 		 * If chopped_off=0, the index is fully validated and we
1405 		 * only need to look at the skipped bits for this, the new,
1406 		 * tnode. What we actually want to do is to find out if
1407 		 * these skipped bits match our key perfectly, or if we will
1408 		 * have to count on finding a matching prefix further down,
1409 		 * because if we do, we would like to have some way of
1410 		 * verifying the existence of such a prefix at this point.
1411 		 */
1412 
1413 		/* The only thing we can do at this point is to verify that
1414 		 * any such matching prefix can indeed be a prefix to our
1415 		 * key, and if the bits in the node we are inspecting that
1416 		 * do not match our key are not ZERO, this cannot be true.
1417 		 * Thus, find out where there is a mismatch (before cn->pos)
1418 		 * and verify that all the mismatching bits are zero in the
1419 		 * new tnode's key.
1420 		 */
1421 
1422 		/* Note: We aren't very concerned about the piece of the key
1423 		 * that precede pn->pos+pn->bits, since these have already been
1424 		 * checked. The bits after cn->pos aren't checked since these are
1425 		 * by definition "unknown" at this point. Thus, what we want to
1426 		 * see is if we are about to enter the "prefix matching" state,
1427 		 * and in that case verify that the skipped bits that will prevail
1428 		 * throughout this subtree are zero, as they have to be if we are
1429 		 * to find a matching prefix.
1430 		 */
1431 
1432 		node_prefix = MASK_PFX(cn->key, cn->pos);
1433 		key_prefix = MASK_PFX(key, cn->pos);
1434 		pref_mismatch = key_prefix^node_prefix;
1435 		mp = 0;
1436 
1437 		/* In short: If skipped bits in this node do not match the search
1438 		 * key, enter the "prefix matching" state.directly.
1439 		 */
1440 		if (pref_mismatch) {
1441 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1442 				mp++;
1443 				pref_mismatch = pref_mismatch <<1;
1444 			}
1445 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1446 
1447 			if (key_prefix != 0)
1448 				goto backtrace;
1449 
1450 			if (current_prefix_length >= cn->pos)
1451 				current_prefix_length = mp;
1452 		}
1453 #endif
1454 		pn = (struct tnode *)n; /* Descend */
1455 		chopped_off = 0;
1456 		continue;
1457 
1458 backtrace:
1459 		chopped_off++;
1460 
1461 		/* As zero don't change the child key (cindex) */
1462 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1463 			chopped_off++;
1464 
1465 		/* Decrease current_... with bits chopped off */
1466 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1467 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1468 
1469 		/*
1470 		 * Either we do the actual chop off according or if we have
1471 		 * chopped off all bits in this tnode walk up to our parent.
1472 		 */
1473 
1474 		if (chopped_off <= pn->bits) {
1475 			cindex &= ~(1 << (chopped_off-1));
1476 		} else {
1477 			if (NODE_PARENT(pn) == NULL)
1478 				goto failed;
1479 
1480 			/* Get Child's index */
1481 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1482 			pn = NODE_PARENT(pn);
1483 			chopped_off = 0;
1484 
1485 #ifdef CONFIG_IP_FIB_TRIE_STATS
1486 			t->stats.backtrack++;
1487 #endif
1488 			goto backtrace;
1489 		}
1490 	}
1491 failed:
1492 	ret = 1;
1493 found:
1494 	rcu_read_unlock();
1495 	return ret;
1496 }
1497 
1498 /* only called from updater side */
1499 static int trie_leaf_remove(struct trie *t, t_key key)
1500 {
1501 	t_key cindex;
1502 	struct tnode *tp = NULL;
1503 	struct node *n = t->trie;
1504 	struct leaf *l;
1505 
1506 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1507 
1508 	/* Note that in the case skipped bits, those bits are *not* checked!
1509 	 * When we finish this, we will have NULL or a T_LEAF, and the
1510 	 * T_LEAF may or may not match our key.
1511 	 */
1512 
1513 	while (n != NULL && IS_TNODE(n)) {
1514 		struct tnode *tn = (struct tnode *) n;
1515 		check_tnode(tn);
1516 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1517 
1518 		BUG_ON(n && NODE_PARENT(n) != tn);
1519 	}
1520 	l = (struct leaf *) n;
1521 
1522 	if (!n || !tkey_equals(l->key, key))
1523 		return 0;
1524 
1525 	/*
1526 	 * Key found.
1527 	 * Remove the leaf and rebalance the tree
1528 	 */
1529 
1530 	t->revision++;
1531 	t->size--;
1532 
1533 	tp = NODE_PARENT(n);
1534 	tnode_free((struct tnode *) n);
1535 
1536 	if (tp) {
1537 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1538 		put_child(t, (struct tnode *)tp, cindex, NULL);
1539 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1540 	} else
1541 		rcu_assign_pointer(t->trie, NULL);
1542 
1543 	return 1;
1544 }
1545 
1546 /*
1547  * Caller must hold RTNL.
1548  */
1549 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1550 {
1551 	struct trie *t = (struct trie *) tb->tb_data;
1552 	u32 key, mask;
1553 	int plen = cfg->fc_dst_len;
1554 	u8 tos = cfg->fc_tos;
1555 	struct fib_alias *fa, *fa_to_delete;
1556 	struct list_head *fa_head;
1557 	struct leaf *l;
1558 	struct leaf_info *li;
1559 
1560 	if (plen > 32)
1561 		return -EINVAL;
1562 
1563 	key = ntohl(cfg->fc_dst);
1564 	mask = ntohl(inet_make_mask(plen));
1565 
1566 	if (key & ~mask)
1567 		return -EINVAL;
1568 
1569 	key = key & mask;
1570 	l = fib_find_node(t, key);
1571 
1572 	if (!l)
1573 		return -ESRCH;
1574 
1575 	fa_head = get_fa_head(l, plen);
1576 	fa = fib_find_alias(fa_head, tos, 0);
1577 
1578 	if (!fa)
1579 		return -ESRCH;
1580 
1581 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1582 
1583 	fa_to_delete = NULL;
1584 	fa_head = fa->fa_list.prev;
1585 
1586 	list_for_each_entry(fa, fa_head, fa_list) {
1587 		struct fib_info *fi = fa->fa_info;
1588 
1589 		if (fa->fa_tos != tos)
1590 			break;
1591 
1592 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1593 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1594 		     fa->fa_scope == cfg->fc_scope) &&
1595 		    (!cfg->fc_protocol ||
1596 		     fi->fib_protocol == cfg->fc_protocol) &&
1597 		    fib_nh_match(cfg, fi) == 0) {
1598 			fa_to_delete = fa;
1599 			break;
1600 		}
1601 	}
1602 
1603 	if (!fa_to_delete)
1604 		return -ESRCH;
1605 
1606 	fa = fa_to_delete;
1607 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1608 		  &cfg->fc_nlinfo);
1609 
1610 	l = fib_find_node(t, key);
1611 	li = find_leaf_info(l, plen);
1612 
1613 	list_del_rcu(&fa->fa_list);
1614 
1615 	if (list_empty(fa_head)) {
1616 		hlist_del_rcu(&li->hlist);
1617 		free_leaf_info(li);
1618 	}
1619 
1620 	if (hlist_empty(&l->list))
1621 		trie_leaf_remove(t, key);
1622 
1623 	if (fa->fa_state & FA_S_ACCESSED)
1624 		rt_cache_flush(-1);
1625 
1626 	fib_release_info(fa->fa_info);
1627 	alias_free_mem_rcu(fa);
1628 	return 0;
1629 }
1630 
1631 static int trie_flush_list(struct trie *t, struct list_head *head)
1632 {
1633 	struct fib_alias *fa, *fa_node;
1634 	int found = 0;
1635 
1636 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1637 		struct fib_info *fi = fa->fa_info;
1638 
1639 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1640 			list_del_rcu(&fa->fa_list);
1641 			fib_release_info(fa->fa_info);
1642 			alias_free_mem_rcu(fa);
1643 			found++;
1644 		}
1645 	}
1646 	return found;
1647 }
1648 
1649 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1650 {
1651 	int found = 0;
1652 	struct hlist_head *lih = &l->list;
1653 	struct hlist_node *node, *tmp;
1654 	struct leaf_info *li = NULL;
1655 
1656 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1657 		found += trie_flush_list(t, &li->falh);
1658 
1659 		if (list_empty(&li->falh)) {
1660 			hlist_del_rcu(&li->hlist);
1661 			free_leaf_info(li);
1662 		}
1663 	}
1664 	return found;
1665 }
1666 
1667 /* rcu_read_lock needs to be hold by caller from readside */
1668 
1669 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1670 {
1671 	struct node *c = (struct node *) thisleaf;
1672 	struct tnode *p;
1673 	int idx;
1674 	struct node *trie = rcu_dereference(t->trie);
1675 
1676 	if (c == NULL) {
1677 		if (trie == NULL)
1678 			return NULL;
1679 
1680 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1681 			return (struct leaf *) trie;
1682 
1683 		p = (struct tnode*) trie;  /* Start */
1684 	} else
1685 		p = (struct tnode *) NODE_PARENT(c);
1686 
1687 	while (p) {
1688 		int pos, last;
1689 
1690 		/*  Find the next child of the parent */
1691 		if (c)
1692 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1693 		else
1694 			pos = 0;
1695 
1696 		last = 1 << p->bits;
1697 		for (idx = pos; idx < last ; idx++) {
1698 			c = rcu_dereference(p->child[idx]);
1699 
1700 			if (!c)
1701 				continue;
1702 
1703 			/* Decend if tnode */
1704 			while (IS_TNODE(c)) {
1705 				p = (struct tnode *) c;
1706 				idx = 0;
1707 
1708 				/* Rightmost non-NULL branch */
1709 				if (p && IS_TNODE(p))
1710 					while (!(c = rcu_dereference(p->child[idx]))
1711 					       && idx < (1<<p->bits)) idx++;
1712 
1713 				/* Done with this tnode? */
1714 				if (idx >= (1 << p->bits) || !c)
1715 					goto up;
1716 			}
1717 			return (struct leaf *) c;
1718 		}
1719 up:
1720 		/* No more children go up one step  */
1721 		c = (struct node *) p;
1722 		p = (struct tnode *) NODE_PARENT(p);
1723 	}
1724 	return NULL; /* Ready. Root of trie */
1725 }
1726 
1727 /*
1728  * Caller must hold RTNL.
1729  */
1730 static int fn_trie_flush(struct fib_table *tb)
1731 {
1732 	struct trie *t = (struct trie *) tb->tb_data;
1733 	struct leaf *ll = NULL, *l = NULL;
1734 	int found = 0, h;
1735 
1736 	t->revision++;
1737 
1738 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1739 		found += trie_flush_leaf(t, l);
1740 
1741 		if (ll && hlist_empty(&ll->list))
1742 			trie_leaf_remove(t, ll->key);
1743 		ll = l;
1744 	}
1745 
1746 	if (ll && hlist_empty(&ll->list))
1747 		trie_leaf_remove(t, ll->key);
1748 
1749 	pr_debug("trie_flush found=%d\n", found);
1750 	return found;
1751 }
1752 
1753 static int trie_last_dflt = -1;
1754 
1755 static void
1756 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1757 {
1758 	struct trie *t = (struct trie *) tb->tb_data;
1759 	int order, last_idx;
1760 	struct fib_info *fi = NULL;
1761 	struct fib_info *last_resort;
1762 	struct fib_alias *fa = NULL;
1763 	struct list_head *fa_head;
1764 	struct leaf *l;
1765 
1766 	last_idx = -1;
1767 	last_resort = NULL;
1768 	order = -1;
1769 
1770 	rcu_read_lock();
1771 
1772 	l = fib_find_node(t, 0);
1773 	if (!l)
1774 		goto out;
1775 
1776 	fa_head = get_fa_head(l, 0);
1777 	if (!fa_head)
1778 		goto out;
1779 
1780 	if (list_empty(fa_head))
1781 		goto out;
1782 
1783 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1784 		struct fib_info *next_fi = fa->fa_info;
1785 
1786 		if (fa->fa_scope != res->scope ||
1787 		    fa->fa_type != RTN_UNICAST)
1788 			continue;
1789 
1790 		if (next_fi->fib_priority > res->fi->fib_priority)
1791 			break;
1792 		if (!next_fi->fib_nh[0].nh_gw ||
1793 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1794 			continue;
1795 		fa->fa_state |= FA_S_ACCESSED;
1796 
1797 		if (fi == NULL) {
1798 			if (next_fi != res->fi)
1799 				break;
1800 		} else if (!fib_detect_death(fi, order, &last_resort,
1801 					     &last_idx, &trie_last_dflt)) {
1802 			if (res->fi)
1803 				fib_info_put(res->fi);
1804 			res->fi = fi;
1805 			atomic_inc(&fi->fib_clntref);
1806 			trie_last_dflt = order;
1807 			goto out;
1808 		}
1809 		fi = next_fi;
1810 		order++;
1811 	}
1812 	if (order <= 0 || fi == NULL) {
1813 		trie_last_dflt = -1;
1814 		goto out;
1815 	}
1816 
1817 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1818 		if (res->fi)
1819 			fib_info_put(res->fi);
1820 		res->fi = fi;
1821 		atomic_inc(&fi->fib_clntref);
1822 		trie_last_dflt = order;
1823 		goto out;
1824 	}
1825 	if (last_idx >= 0) {
1826 		if (res->fi)
1827 			fib_info_put(res->fi);
1828 		res->fi = last_resort;
1829 		if (last_resort)
1830 			atomic_inc(&last_resort->fib_clntref);
1831 	}
1832 	trie_last_dflt = last_idx;
1833  out:;
1834 	rcu_read_unlock();
1835 }
1836 
1837 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1838 			   struct sk_buff *skb, struct netlink_callback *cb)
1839 {
1840 	int i, s_i;
1841 	struct fib_alias *fa;
1842 
1843 	__be32 xkey = htonl(key);
1844 
1845 	s_i = cb->args[4];
1846 	i = 0;
1847 
1848 	/* rcu_read_lock is hold by caller */
1849 
1850 	list_for_each_entry_rcu(fa, fah, fa_list) {
1851 		if (i < s_i) {
1852 			i++;
1853 			continue;
1854 		}
1855 		BUG_ON(!fa->fa_info);
1856 
1857 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1858 				  cb->nlh->nlmsg_seq,
1859 				  RTM_NEWROUTE,
1860 				  tb->tb_id,
1861 				  fa->fa_type,
1862 				  fa->fa_scope,
1863 				  xkey,
1864 				  plen,
1865 				  fa->fa_tos,
1866 				  fa->fa_info, 0) < 0) {
1867 			cb->args[4] = i;
1868 			return -1;
1869 		}
1870 		i++;
1871 	}
1872 	cb->args[4] = i;
1873 	return skb->len;
1874 }
1875 
1876 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1877 			     struct netlink_callback *cb)
1878 {
1879 	int h, s_h;
1880 	struct list_head *fa_head;
1881 	struct leaf *l = NULL;
1882 
1883 	s_h = cb->args[3];
1884 
1885 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1886 		if (h < s_h)
1887 			continue;
1888 		if (h > s_h)
1889 			memset(&cb->args[4], 0,
1890 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1891 
1892 		fa_head = get_fa_head(l, plen);
1893 
1894 		if (!fa_head)
1895 			continue;
1896 
1897 		if (list_empty(fa_head))
1898 			continue;
1899 
1900 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1901 			cb->args[3] = h;
1902 			return -1;
1903 		}
1904 	}
1905 	cb->args[3] = h;
1906 	return skb->len;
1907 }
1908 
1909 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1910 {
1911 	int m, s_m;
1912 	struct trie *t = (struct trie *) tb->tb_data;
1913 
1914 	s_m = cb->args[2];
1915 
1916 	rcu_read_lock();
1917 	for (m = 0; m <= 32; m++) {
1918 		if (m < s_m)
1919 			continue;
1920 		if (m > s_m)
1921 			memset(&cb->args[3], 0,
1922 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1923 
1924 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1925 			cb->args[2] = m;
1926 			goto out;
1927 		}
1928 	}
1929 	rcu_read_unlock();
1930 	cb->args[2] = m;
1931 	return skb->len;
1932 out:
1933 	rcu_read_unlock();
1934 	return -1;
1935 }
1936 
1937 /* Fix more generic FIB names for init later */
1938 
1939 #ifdef CONFIG_IP_MULTIPLE_TABLES
1940 struct fib_table * fib_hash_init(u32 id)
1941 #else
1942 struct fib_table * __init fib_hash_init(u32 id)
1943 #endif
1944 {
1945 	struct fib_table *tb;
1946 	struct trie *t;
1947 
1948 	if (fn_alias_kmem == NULL)
1949 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1950 						  sizeof(struct fib_alias),
1951 						  0, SLAB_HWCACHE_ALIGN,
1952 						  NULL, NULL);
1953 
1954 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1955 		     GFP_KERNEL);
1956 	if (tb == NULL)
1957 		return NULL;
1958 
1959 	tb->tb_id = id;
1960 	tb->tb_lookup = fn_trie_lookup;
1961 	tb->tb_insert = fn_trie_insert;
1962 	tb->tb_delete = fn_trie_delete;
1963 	tb->tb_flush = fn_trie_flush;
1964 	tb->tb_select_default = fn_trie_select_default;
1965 	tb->tb_dump = fn_trie_dump;
1966 	memset(tb->tb_data, 0, sizeof(struct trie));
1967 
1968 	t = (struct trie *) tb->tb_data;
1969 
1970 	trie_init(t);
1971 
1972 	if (id == RT_TABLE_LOCAL)
1973 		trie_local = t;
1974 	else if (id == RT_TABLE_MAIN)
1975 		trie_main = t;
1976 
1977 	if (id == RT_TABLE_LOCAL)
1978 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1979 
1980 	return tb;
1981 }
1982 
1983 #ifdef CONFIG_PROC_FS
1984 /* Depth first Trie walk iterator */
1985 struct fib_trie_iter {
1986 	struct tnode *tnode;
1987 	struct trie *trie;
1988 	unsigned index;
1989 	unsigned depth;
1990 };
1991 
1992 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1993 {
1994 	struct tnode *tn = iter->tnode;
1995 	unsigned cindex = iter->index;
1996 	struct tnode *p;
1997 
1998 	/* A single entry routing table */
1999 	if (!tn)
2000 		return NULL;
2001 
2002 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2003 		 iter->tnode, iter->index, iter->depth);
2004 rescan:
2005 	while (cindex < (1<<tn->bits)) {
2006 		struct node *n = tnode_get_child(tn, cindex);
2007 
2008 		if (n) {
2009 			if (IS_LEAF(n)) {
2010 				iter->tnode = tn;
2011 				iter->index = cindex + 1;
2012 			} else {
2013 				/* push down one level */
2014 				iter->tnode = (struct tnode *) n;
2015 				iter->index = 0;
2016 				++iter->depth;
2017 			}
2018 			return n;
2019 		}
2020 
2021 		++cindex;
2022 	}
2023 
2024 	/* Current node exhausted, pop back up */
2025 	p = NODE_PARENT(tn);
2026 	if (p) {
2027 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2028 		tn = p;
2029 		--iter->depth;
2030 		goto rescan;
2031 	}
2032 
2033 	/* got root? */
2034 	return NULL;
2035 }
2036 
2037 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2038 				       struct trie *t)
2039 {
2040 	struct node *n ;
2041 
2042 	if(!t)
2043 		return NULL;
2044 
2045 	n = rcu_dereference(t->trie);
2046 
2047 	if(!iter)
2048 		return NULL;
2049 
2050 	if (n) {
2051 		if (IS_TNODE(n)) {
2052 			iter->tnode = (struct tnode *) n;
2053 			iter->trie = t;
2054 			iter->index = 0;
2055 			iter->depth = 1;
2056 		} else {
2057 			iter->tnode = NULL;
2058 			iter->trie  = t;
2059 			iter->index = 0;
2060 			iter->depth = 0;
2061 		}
2062 		return n;
2063 	}
2064 	return NULL;
2065 }
2066 
2067 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068 {
2069 	struct node *n;
2070 	struct fib_trie_iter iter;
2071 
2072 	memset(s, 0, sizeof(*s));
2073 
2074 	rcu_read_lock();
2075 	for (n = fib_trie_get_first(&iter, t); n;
2076 	     n = fib_trie_get_next(&iter)) {
2077 		if (IS_LEAF(n)) {
2078 			s->leaves++;
2079 			s->totdepth += iter.depth;
2080 			if (iter.depth > s->maxdepth)
2081 				s->maxdepth = iter.depth;
2082 		} else {
2083 			const struct tnode *tn = (const struct tnode *) n;
2084 			int i;
2085 
2086 			s->tnodes++;
2087 			if(tn->bits < MAX_STAT_DEPTH)
2088 				s->nodesizes[tn->bits]++;
2089 
2090 			for (i = 0; i < (1<<tn->bits); i++)
2091 				if (!tn->child[i])
2092 					s->nullpointers++;
2093 		}
2094 	}
2095 	rcu_read_unlock();
2096 }
2097 
2098 /*
2099  *	This outputs /proc/net/fib_triestats
2100  */
2101 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2102 {
2103 	unsigned i, max, pointers, bytes, avdepth;
2104 
2105 	if (stat->leaves)
2106 		avdepth = stat->totdepth*100 / stat->leaves;
2107 	else
2108 		avdepth = 0;
2109 
2110 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2111 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2112 
2113 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2114 
2115 	bytes = sizeof(struct leaf) * stat->leaves;
2116 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2117 	bytes += sizeof(struct tnode) * stat->tnodes;
2118 
2119 	max = MAX_STAT_DEPTH;
2120 	while (max > 0 && stat->nodesizes[max-1] == 0)
2121 		max--;
2122 
2123 	pointers = 0;
2124 	for (i = 1; i <= max; i++)
2125 		if (stat->nodesizes[i] != 0) {
2126 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2127 			pointers += (1<<i) * stat->nodesizes[i];
2128 		}
2129 	seq_putc(seq, '\n');
2130 	seq_printf(seq, "\tPointers: %d\n", pointers);
2131 
2132 	bytes += sizeof(struct node *) * pointers;
2133 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2134 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2135 
2136 #ifdef CONFIG_IP_FIB_TRIE_STATS
2137 	seq_printf(seq, "Counters:\n---------\n");
2138 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2139 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2140 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2141 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2142 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2143 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2144 #ifdef CLEAR_STATS
2145 	memset(&(t->stats), 0, sizeof(t->stats));
2146 #endif
2147 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2148 }
2149 
2150 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2151 {
2152 	struct trie_stat *stat;
2153 
2154 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2155 	if (!stat)
2156 		return -ENOMEM;
2157 
2158 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2159 		   sizeof(struct leaf), sizeof(struct tnode));
2160 
2161 	if (trie_local) {
2162 		seq_printf(seq, "Local:\n");
2163 		trie_collect_stats(trie_local, stat);
2164 		trie_show_stats(seq, stat);
2165 	}
2166 
2167 	if (trie_main) {
2168 		seq_printf(seq, "Main:\n");
2169 		trie_collect_stats(trie_main, stat);
2170 		trie_show_stats(seq, stat);
2171 	}
2172 	kfree(stat);
2173 
2174 	return 0;
2175 }
2176 
2177 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2178 {
2179 	return single_open(file, fib_triestat_seq_show, NULL);
2180 }
2181 
2182 static const struct file_operations fib_triestat_fops = {
2183 	.owner	= THIS_MODULE,
2184 	.open	= fib_triestat_seq_open,
2185 	.read	= seq_read,
2186 	.llseek	= seq_lseek,
2187 	.release = single_release,
2188 };
2189 
2190 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2191 				      loff_t pos)
2192 {
2193 	loff_t idx = 0;
2194 	struct node *n;
2195 
2196 	for (n = fib_trie_get_first(iter, trie_local);
2197 	     n; ++idx, n = fib_trie_get_next(iter)) {
2198 		if (pos == idx)
2199 			return n;
2200 	}
2201 
2202 	for (n = fib_trie_get_first(iter, trie_main);
2203 	     n; ++idx, n = fib_trie_get_next(iter)) {
2204 		if (pos == idx)
2205 			return n;
2206 	}
2207 	return NULL;
2208 }
2209 
2210 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2211 {
2212 	rcu_read_lock();
2213 	if (*pos == 0)
2214 		return SEQ_START_TOKEN;
2215 	return fib_trie_get_idx(seq->private, *pos - 1);
2216 }
2217 
2218 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2219 {
2220 	struct fib_trie_iter *iter = seq->private;
2221 	void *l = v;
2222 
2223 	++*pos;
2224 	if (v == SEQ_START_TOKEN)
2225 		return fib_trie_get_idx(iter, 0);
2226 
2227 	v = fib_trie_get_next(iter);
2228 	BUG_ON(v == l);
2229 	if (v)
2230 		return v;
2231 
2232 	/* continue scan in next trie */
2233 	if (iter->trie == trie_local)
2234 		return fib_trie_get_first(iter, trie_main);
2235 
2236 	return NULL;
2237 }
2238 
2239 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2240 {
2241 	rcu_read_unlock();
2242 }
2243 
2244 static void seq_indent(struct seq_file *seq, int n)
2245 {
2246 	while (n-- > 0) seq_puts(seq, "   ");
2247 }
2248 
2249 static inline const char *rtn_scope(enum rt_scope_t s)
2250 {
2251 	static char buf[32];
2252 
2253 	switch(s) {
2254 	case RT_SCOPE_UNIVERSE: return "universe";
2255 	case RT_SCOPE_SITE:	return "site";
2256 	case RT_SCOPE_LINK:	return "link";
2257 	case RT_SCOPE_HOST:	return "host";
2258 	case RT_SCOPE_NOWHERE:	return "nowhere";
2259 	default:
2260 		snprintf(buf, sizeof(buf), "scope=%d", s);
2261 		return buf;
2262 	}
2263 }
2264 
2265 static const char *rtn_type_names[__RTN_MAX] = {
2266 	[RTN_UNSPEC] = "UNSPEC",
2267 	[RTN_UNICAST] = "UNICAST",
2268 	[RTN_LOCAL] = "LOCAL",
2269 	[RTN_BROADCAST] = "BROADCAST",
2270 	[RTN_ANYCAST] = "ANYCAST",
2271 	[RTN_MULTICAST] = "MULTICAST",
2272 	[RTN_BLACKHOLE] = "BLACKHOLE",
2273 	[RTN_UNREACHABLE] = "UNREACHABLE",
2274 	[RTN_PROHIBIT] = "PROHIBIT",
2275 	[RTN_THROW] = "THROW",
2276 	[RTN_NAT] = "NAT",
2277 	[RTN_XRESOLVE] = "XRESOLVE",
2278 };
2279 
2280 static inline const char *rtn_type(unsigned t)
2281 {
2282 	static char buf[32];
2283 
2284 	if (t < __RTN_MAX && rtn_type_names[t])
2285 		return rtn_type_names[t];
2286 	snprintf(buf, sizeof(buf), "type %d", t);
2287 	return buf;
2288 }
2289 
2290 /* Pretty print the trie */
2291 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2292 {
2293 	const struct fib_trie_iter *iter = seq->private;
2294 	struct node *n = v;
2295 
2296 	if (v == SEQ_START_TOKEN)
2297 		return 0;
2298 
2299 	if (!NODE_PARENT(n)) {
2300 		if (iter->trie == trie_local)
2301 			seq_puts(seq, "<local>:\n");
2302 		else
2303 			seq_puts(seq, "<main>:\n");
2304 	}
2305 
2306 	if (IS_TNODE(n)) {
2307 		struct tnode *tn = (struct tnode *) n;
2308 		__be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2309 
2310 		seq_indent(seq, iter->depth-1);
2311 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2312 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2313 			   tn->empty_children);
2314 
2315 	} else {
2316 		struct leaf *l = (struct leaf *) n;
2317 		int i;
2318 		__be32 val = htonl(l->key);
2319 
2320 		seq_indent(seq, iter->depth);
2321 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2322 		for (i = 32; i >= 0; i--) {
2323 			struct leaf_info *li = find_leaf_info(l, i);
2324 			if (li) {
2325 				struct fib_alias *fa;
2326 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2327 					seq_indent(seq, iter->depth+1);
2328 					seq_printf(seq, "  /%d %s %s", i,
2329 						   rtn_scope(fa->fa_scope),
2330 						   rtn_type(fa->fa_type));
2331 					if (fa->fa_tos)
2332 						seq_printf(seq, "tos =%d\n",
2333 							   fa->fa_tos);
2334 					seq_putc(seq, '\n');
2335 				}
2336 			}
2337 		}
2338 	}
2339 
2340 	return 0;
2341 }
2342 
2343 static struct seq_operations fib_trie_seq_ops = {
2344 	.start  = fib_trie_seq_start,
2345 	.next   = fib_trie_seq_next,
2346 	.stop   = fib_trie_seq_stop,
2347 	.show   = fib_trie_seq_show,
2348 };
2349 
2350 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2351 {
2352 	struct seq_file *seq;
2353 	int rc = -ENOMEM;
2354 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2355 
2356 	if (!s)
2357 		goto out;
2358 
2359 	rc = seq_open(file, &fib_trie_seq_ops);
2360 	if (rc)
2361 		goto out_kfree;
2362 
2363 	seq	     = file->private_data;
2364 	seq->private = s;
2365 	memset(s, 0, sizeof(*s));
2366 out:
2367 	return rc;
2368 out_kfree:
2369 	kfree(s);
2370 	goto out;
2371 }
2372 
2373 static const struct file_operations fib_trie_fops = {
2374 	.owner  = THIS_MODULE,
2375 	.open   = fib_trie_seq_open,
2376 	.read   = seq_read,
2377 	.llseek = seq_lseek,
2378 	.release = seq_release_private,
2379 };
2380 
2381 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2382 {
2383 	static unsigned type2flags[RTN_MAX + 1] = {
2384 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2385 	};
2386 	unsigned flags = type2flags[type];
2387 
2388 	if (fi && fi->fib_nh->nh_gw)
2389 		flags |= RTF_GATEWAY;
2390 	if (mask == htonl(0xFFFFFFFF))
2391 		flags |= RTF_HOST;
2392 	flags |= RTF_UP;
2393 	return flags;
2394 }
2395 
2396 /*
2397  *	This outputs /proc/net/route.
2398  *	The format of the file is not supposed to be changed
2399  * 	and needs to be same as fib_hash output to avoid breaking
2400  *	legacy utilities
2401  */
2402 static int fib_route_seq_show(struct seq_file *seq, void *v)
2403 {
2404 	const struct fib_trie_iter *iter = seq->private;
2405 	struct leaf *l = v;
2406 	int i;
2407 	char bf[128];
2408 
2409 	if (v == SEQ_START_TOKEN) {
2410 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2411 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2412 			   "\tWindow\tIRTT");
2413 		return 0;
2414 	}
2415 
2416 	if (iter->trie == trie_local)
2417 		return 0;
2418 	if (IS_TNODE(l))
2419 		return 0;
2420 
2421 	for (i=32; i>=0; i--) {
2422 		struct leaf_info *li = find_leaf_info(l, i);
2423 		struct fib_alias *fa;
2424 		__be32 mask, prefix;
2425 
2426 		if (!li)
2427 			continue;
2428 
2429 		mask = inet_make_mask(li->plen);
2430 		prefix = htonl(l->key);
2431 
2432 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2433 			const struct fib_info *fi = fa->fa_info;
2434 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2435 
2436 			if (fa->fa_type == RTN_BROADCAST
2437 			    || fa->fa_type == RTN_MULTICAST)
2438 				continue;
2439 
2440 			if (fi)
2441 				snprintf(bf, sizeof(bf),
2442 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2443 					 fi->fib_dev ? fi->fib_dev->name : "*",
2444 					 prefix,
2445 					 fi->fib_nh->nh_gw, flags, 0, 0,
2446 					 fi->fib_priority,
2447 					 mask,
2448 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2449 					 fi->fib_window,
2450 					 fi->fib_rtt >> 3);
2451 			else
2452 				snprintf(bf, sizeof(bf),
2453 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2454 					 prefix, 0, flags, 0, 0, 0,
2455 					 mask, 0, 0, 0);
2456 
2457 			seq_printf(seq, "%-127s\n", bf);
2458 		}
2459 	}
2460 
2461 	return 0;
2462 }
2463 
2464 static struct seq_operations fib_route_seq_ops = {
2465 	.start  = fib_trie_seq_start,
2466 	.next   = fib_trie_seq_next,
2467 	.stop   = fib_trie_seq_stop,
2468 	.show   = fib_route_seq_show,
2469 };
2470 
2471 static int fib_route_seq_open(struct inode *inode, struct file *file)
2472 {
2473 	struct seq_file *seq;
2474 	int rc = -ENOMEM;
2475 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2476 
2477 	if (!s)
2478 		goto out;
2479 
2480 	rc = seq_open(file, &fib_route_seq_ops);
2481 	if (rc)
2482 		goto out_kfree;
2483 
2484 	seq	     = file->private_data;
2485 	seq->private = s;
2486 	memset(s, 0, sizeof(*s));
2487 out:
2488 	return rc;
2489 out_kfree:
2490 	kfree(s);
2491 	goto out;
2492 }
2493 
2494 static const struct file_operations fib_route_fops = {
2495 	.owner  = THIS_MODULE,
2496 	.open   = fib_route_seq_open,
2497 	.read   = seq_read,
2498 	.llseek = seq_lseek,
2499 	.release = seq_release_private,
2500 };
2501 
2502 int __init fib_proc_init(void)
2503 {
2504 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2505 		goto out1;
2506 
2507 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2508 		goto out2;
2509 
2510 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2511 		goto out3;
2512 
2513 	return 0;
2514 
2515 out3:
2516 	proc_net_remove("fib_triestat");
2517 out2:
2518 	proc_net_remove("fib_trie");
2519 out1:
2520 	return -ENOMEM;
2521 }
2522 
2523 void __init fib_proc_exit(void)
2524 {
2525 	proc_net_remove("fib_trie");
2526 	proc_net_remove("fib_triestat");
2527 	proc_net_remove("route");
2528 }
2529 
2530 #endif /* CONFIG_PROC_FS */
2531