xref: /linux/net/ipv4/fib_trie.c (revision 13abf8130139c2ccd4962a7e5a8902be5e6cb5a7)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  */
45 
46 #define VERSION "0.402"
47 
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/rcupdate.h>
66 #include <linux/skbuff.h>
67 #include <linux/netlink.h>
68 #include <linux/init.h>
69 #include <linux/list.h>
70 #include <net/ip.h>
71 #include <net/protocol.h>
72 #include <net/route.h>
73 #include <net/tcp.h>
74 #include <net/sock.h>
75 #include <net/ip_fib.h>
76 #include "fib_lookup.h"
77 
78 #undef CONFIG_IP_FIB_TRIE_STATS
79 #define MAX_CHILDS 16384
80 
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
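/*
 * For example, with a 32-bit t_key (KEYLENGTH == 32):
 *   MASK_PFX(0xC0A80155, 24) == 0xC0A80100  - keep the top 24 bits
 *   TKEY_GET_MASK(7, 3)      == 0x01C00000  - a 3-bit mask starting
 *                                             7 bits below the MSB
 */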
84 
85 typedef unsigned int t_key;
86 
87 #define T_TNODE 0
88 #define T_LEAF  1
89 #define NODE_TYPE_MASK	0x1UL
90 #define NODE_PARENT(node) \
91 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
92 
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 
95 #define NODE_SET_PARENT(node, ptr)		\
96 	rcu_assign_pointer((node)->parent,	\
97 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
98 
99 #define IS_TNODE(n) (!(n->parent & T_LEAF))
100 #define IS_LEAF(n) (n->parent & T_LEAF)
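/*
 * The node type (T_TNODE/T_LEAF) lives in the least significant bit of
 * the 'parent' word; the real parent pointer is always at least
 * word-aligned, so that bit is otherwise zero. NODE_PARENT() masks the
 * type bit off, and NODE_SET_PARENT() preserves it across updates.
 */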
101 
102 struct node {
103 	t_key key;
104 	unsigned long parent;
105 };
106 
107 struct leaf {
108 	t_key key;
109 	unsigned long parent;
110 	struct hlist_head list;
111 	struct rcu_head rcu;
112 };
113 
114 struct leaf_info {
115 	struct hlist_node hlist;
116 	struct rcu_head rcu;
117 	int plen;
118 	struct list_head falh;
119 };
120 
121 struct tnode {
122 	t_key key;
123 	unsigned long parent;
124 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
125 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
126 	unsigned short full_children;	/* KEYLENGTH bits needed */
127 	unsigned short empty_children;	/* KEYLENGTH bits needed */
128 	struct rcu_head rcu;
129 	struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int nodesizes[MAX_CHILDS];
150 };
151 
152 struct trie {
153 	struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 	struct trie_use_stats stats;
156 #endif
157 	int size;
158 	unsigned int revision;
159 };
160 
161 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
162 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 static void tnode_free(struct tnode *tn);
167 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
168 
169 static kmem_cache_t *fn_alias_kmem __read_mostly;
170 static struct trie *trie_local = NULL, *trie_main = NULL;
171 
172 
173 /* rcu_read_lock needs to be held by the caller on the read side */
174 
175 static inline struct node *tnode_get_child(struct tnode *tn, int i)
176 {
177 	BUG_ON(i >= 1 << tn->bits);
178 
179 	return rcu_dereference(tn->child[i]);
180 }
181 
182 static inline int tnode_child_length(const struct tnode *tn)
183 {
184 	return 1 << tn->bits;
185 }
186 
187 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
188 {
189 	if (offset < KEYLENGTH)
190 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
191 	else
192 		return 0;
193 }
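/*
 * E.g. tkey_extract_bits(0xA1B2C3D4, 8, 4) == 0xB: the four bits that
 * sit eight bits below the most significant bit of the key.
 */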
194 
195 static inline int tkey_equals(t_key a, t_key b)
196 {
197 	return a == b;
198 }
199 
200 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
201 {
202 	if (bits == 0 || offset >= KEYLENGTH)
203 		return 1;
204 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
205 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
206 }
207 
208 static inline int tkey_mismatch(t_key a, int offset, t_key b)
209 {
210 	t_key diff = a ^ b;
211 	int i = offset;
212 
213 	if (!diff)
214 		return 0;
215 	while ((diff << i) >> (KEYLENGTH-1) == 0)
216 		i++;
217 	return i;
218 }
219 
220 /*
221   To understand this stuff, an understanding of keys and all their bits is
222   necessary. Every node in the trie has a key associated with it, but not
223   all of the bits in that key are significant.
224 
225   Consider a node 'n' and its parent 'tp'.
226 
227   If n is a leaf, every bit in its key is significant. Its presence is
228   necessitated by path compression, since during a tree traversal (when
229   searching for a leaf - unless we are doing an insertion) we will completely
230   ignore all skipped bits we encounter. Thus we need to verify, at the end of
231   a potentially successful search, that we have indeed been walking the
232   correct key path.
233 
234   Note that, if the correct key is present in the tree, we can never "miss"
235   it by following the wrong path. Path compression ensures that segments of the key
236   that are the same for all keys with a given prefix are skipped, but the
237   skipped part *is* identical for each node in the subtrie below the skipped
238   bit! trie_insert() in this implementation takes care of that - note the
239   call to tkey_sub_equals() in trie_insert().
240 
241   If n is an internal node - a 'tnode' here - the various parts of its key
242   have many different meanings.
243 
244   Example:
245   _________________________________________________________________
246   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
247   -----------------------------------------------------------------
248     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
249 
250   _________________________________________________________________
251   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
252   -----------------------------------------------------------------
253    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
254 
255   tp->pos = 7
256   tp->bits = 3
257   n->pos = 15
258   n->bits = 4
259 
260   First, let's just ignore the bits that come before the parent tp, that is
261   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
262   not use them for anything.
263 
264   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
265   index into the parent's child array. That is, they will be used to find
266   'n' among tp's children.
267 
268   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
269   for the node n.
270 
271   All the bits we have seen so far are significant to the node n. The rest
272   of the bits are really not needed or indeed known in n->key.
273 
274   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
275   n's child array, and will of course be different for each child.
276 
277 
278   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
279   at this point.
280 
281 */
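/*
  A concrete reading of the example above: with tp->pos == 7 and
  tp->bits == 3, tkey_extract_bits(n->key, 7, 3) yields the "N" bits,
  i.e. n's index in tp's child array (8 slots). With n->pos == 15 and
  n->bits == 4, tkey_extract_bits(key, 15, 4) yields the "C" bits that
  select one of n's 16 children. The "S" bits (10..14) are the skipped
  bits that tkey_sub_equals() re-checks while walking down.
*/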
282 
283 static inline void check_tnode(const struct tnode *tn)
284 {
285 	WARN_ON(tn && tn->pos+tn->bits > 32);
286 }
287 
288 static int halve_threshold = 25;
289 static int inflate_threshold = 50;
290 
291 
292 static void __alias_free_mem(struct rcu_head *head)
293 {
294 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
295 	kmem_cache_free(fn_alias_kmem, fa);
296 }
297 
298 static inline void alias_free_mem_rcu(struct fib_alias *fa)
299 {
300 	call_rcu(&fa->rcu, __alias_free_mem);
301 }
302 
303 static void __leaf_free_rcu(struct rcu_head *head)
304 {
305 	kfree(container_of(head, struct leaf, rcu));
306 }
307 
308 static inline void free_leaf(struct leaf *leaf)
309 {
310 	call_rcu(&leaf->rcu, __leaf_free_rcu);
311 }
312 
313 static void __leaf_info_free_rcu(struct rcu_head *head)
314 {
315 	kfree(container_of(head, struct leaf_info, rcu));
316 }
317 
318 static inline void free_leaf_info(struct leaf_info *leaf)
319 {
320 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
321 }
322 
323 static struct tnode *tnode_alloc(unsigned int size)
324 {
325 	struct page *pages;
326 
327 	if (size <= PAGE_SIZE)
328 		return kcalloc(size, 1, GFP_KERNEL);
329 
330 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
331 	if (!pages)
332 		return NULL;
333 
334 	return page_address(pages);
335 }
336 
337 static void __tnode_free_rcu(struct rcu_head *head)
338 {
339 	struct tnode *tn = container_of(head, struct tnode, rcu);
340 	unsigned int size = sizeof(struct tnode) +
341 		(1 << tn->bits) * sizeof(struct node *);
342 
343 	if (size <= PAGE_SIZE)
344 		kfree(tn);
345 	else
346 		free_pages((unsigned long)tn, get_order(size));
347 }
348 
349 static inline void tnode_free(struct tnode *tn)
350 {
351 	call_rcu(&tn->rcu, __tnode_free_rcu);
352 }
353 
354 static struct leaf *leaf_new(void)
355 {
356 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
357 	if (l) {
358 		l->parent = T_LEAF;
359 		INIT_HLIST_HEAD(&l->list);
360 	}
361 	return l;
362 }
363 
364 static struct leaf_info *leaf_info_new(int plen)
365 {
366 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
367 	if (li) {
368 		li->plen = plen;
369 		INIT_LIST_HEAD(&li->falh);
370 	}
371 	return li;
372 }
373 
374 static struct tnode* tnode_new(t_key key, int pos, int bits)
375 {
376 	int nchildren = 1<<bits;
377 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
378 	struct tnode *tn = tnode_alloc(sz);
379 
380 	if (tn) {
381 		memset(tn, 0, sz);
382 		tn->parent = T_TNODE;
383 		tn->pos = pos;
384 		tn->bits = bits;
385 		tn->key = key;
386 		tn->full_children = 0;
387 		tn->empty_children = 1<<bits;
388 	}
389 
390 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
391 		 (unsigned int) (sizeof(struct node) * 1<<bits));
392 	return tn;
393 }
394 
395 /*
396  * Check whether a tnode 'n' is "full", i.e. it is an internal node
397  * and no bits are skipped. See discussion in dyntree paper p. 6
398  */
399 
400 static inline int tnode_full(const struct tnode *tn, const struct node *n)
401 {
402 	if (n == NULL || IS_LEAF(n))
403 		return 0;
404 
405 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
406 }
407 
408 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
409 {
410 	tnode_put_child_reorg(tn, i, n, -1);
411 }
412 
413  /*
414   * Add a child at position i overwriting the old value.
415   * Update the value of full_children and empty_children.
416   */
417 
418 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
419 {
420 	struct node *chi = tn->child[i];
421 	int isfull;
422 
423 	BUG_ON(i >= 1<<tn->bits);
424 
425 
426 	/* update emptyChildren */
427 	if (n == NULL && chi != NULL)
428 		tn->empty_children++;
429 	else if (n != NULL && chi == NULL)
430 		tn->empty_children--;
431 
432 	/* update fullChildren */
433 	if (wasfull == -1)
434 		wasfull = tnode_full(tn, chi);
435 
436 	isfull = tnode_full(tn, n);
437 	if (wasfull && !isfull)
438 		tn->full_children--;
439 	else if (!wasfull && isfull)
440 		tn->full_children++;
441 
442 	if (n)
443 		NODE_SET_PARENT(n, tn);
444 
445 	rcu_assign_pointer(tn->child[i], n);
446 }
447 
448 static struct node *resize(struct trie *t, struct tnode *tn)
449 {
450 	int i;
451 	int err = 0;
452 	struct tnode *old_tn;
453 
454 	if (!tn)
455 		return NULL;
456 
457 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
458 		 tn, inflate_threshold, halve_threshold);
459 
460 	/* No children */
461 	if (tn->empty_children == tnode_child_length(tn)) {
462 		tnode_free(tn);
463 		return NULL;
464 	}
465 	/* One child */
466 	if (tn->empty_children == tnode_child_length(tn) - 1)
467 		for (i = 0; i < tnode_child_length(tn); i++) {
468 			struct node *n;
469 
470 			n = tn->child[i];
471 			if (!n)
472 				continue;
473 
474 			/* compress one level */
475 			NODE_SET_PARENT(n, NULL);
476 			tnode_free(tn);
477 			return n;
478 		}
479 	/*
480 	 * Double as long as the resulting node's fraction of non-empty
481 	 * children stays above the threshold.
482 	 */
483 
484 	/*
485 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
486 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
487 	 * Telecommunications, page 6:
488 	 * "A node is doubled if the ratio of non-empty children to all
489 	 * children in the *doubled* node is at least 'high'."
490 	 *
491 	 * 'high' in this instance is the variable 'inflate_threshold'. It
492 	 * is expressed as a percentage, so we multiply it with
493 	 * tnode_child_length() and instead of multiplying by 2 (since the
494 	 * child array will be doubled by inflate()) and multiplying
495 	 * the left-hand side by 100 (to handle the percentage thing) we
496 	 * multiply the left-hand side by 50.
497 	 *
498 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
499 	 * - tn->empty_children is of course the number of non-null children
500 	 * in the current node. tn->full_children is the number of "full"
501 	 * children, that is non-null tnodes with a skip value of 0.
502 	 * All of those will be doubled in the resulting inflated tnode, so
503 	 * we just count them one extra time here.
504 	 *
505 	 * A clearer way to write this would be:
506 	 *
507 	 * to_be_doubled = tn->full_children;
508 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
509 	 *     tn->full_children;
510 	 *
511 	 * new_child_length = tnode_child_length(tn) * 2;
512 	 *
513 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
514 	 *      new_child_length;
515 	 * if (new_fill_factor >= inflate_threshold)
516 	 *
517 	 * ...and so on, though it would mess up the while () loop.
518 	 *
519 	 * anyway,
520 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
521 	 *      inflate_threshold
522 	 *
523 	 * avoid a division:
524 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
525 	 *      inflate_threshold * new_child_length
526 	 *
527 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
528 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
529 	 *    tn->full_children) >= inflate_threshold * new_child_length
530 	 *
531 	 * expand new_child_length:
532 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
533 	 *    tn->full_children) >=
534 	 *      inflate_threshold * tnode_child_length(tn) * 2
535 	 *
536 	 * shorten again:
537 	 * 50 * (tn->full_children + tnode_child_length(tn) -
538 	 *    tn->empty_children) >= inflate_threshold *
539 	 *    tnode_child_length(tn)
540 	 *
541 	 */
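	/*
	 * A worked instance of the last inequality, assuming bits = 4
	 * (16 slots), empty_children = 2, full_children = 5 and
	 * inflate_threshold = 50:
	 *
	 *   50 * (5 + 16 - 2) = 950 >= 50 * 16 = 800
	 *
	 * so the node is inflated. Equivalently, the doubled node would
	 * hold 9 + 2*5 = 19 non-empty children in 32 slots, a fill
	 * factor of roughly 59% >= 50%.
	 */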
542 
543 	check_tnode(tn);
544 
545 	err = 0;
546 	while ((tn->full_children > 0 &&
547 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
548 				inflate_threshold * tnode_child_length(tn))) {
549 
550 		old_tn = tn;
551 		tn = inflate(t, tn);
552 		if (IS_ERR(tn)) {
553 			tn = old_tn;
554 #ifdef CONFIG_IP_FIB_TRIE_STATS
555 			t->stats.resize_node_skipped++;
556 #endif
557 			break;
558 		}
559 	}
560 
561 	check_tnode(tn);
562 
563 	/*
564 	 * Halve as long as the number of empty children in this
565 	 * node is above threshold.
566 	 */
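	/*
	 * For example, with bits = 4 (16 slots), empty_children = 13 and
	 * halve_threshold = 25: 100 * (16 - 13) = 300 < 25 * 16 = 400,
	 * i.e. under a quarter of the slots are in use, so the node is
	 * halved.
	 */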
567 
568 	err = 0;
569 	while (tn->bits > 1 &&
570 	       100 * (tnode_child_length(tn) - tn->empty_children) <
571 	       halve_threshold * tnode_child_length(tn)) {
572 
573 		old_tn = tn;
574 		tn = halve(t, tn);
575 		if (IS_ERR(tn)) {
576 			tn = old_tn;
577 #ifdef CONFIG_IP_FIB_TRIE_STATS
578 			t->stats.resize_node_skipped++;
579 #endif
580 			break;
581 		}
582 	}
583 
584 
585 	/* Only one child remains */
586 	if (tn->empty_children == tnode_child_length(tn) - 1)
587 		for (i = 0; i < tnode_child_length(tn); i++) {
588 			struct node *n;
589 
590 			n = tn->child[i];
591 			if (!n)
592 				continue;
593 
594 			/* compress one level */
595 
596 			NODE_SET_PARENT(n, NULL);
597 			tnode_free(tn);
598 			return n;
599 		}
600 
601 	return (struct node *) tn;
602 }
603 
604 static struct tnode *inflate(struct trie *t, struct tnode *tn)
605 {
606 	struct tnode *inode;
607 	struct tnode *oldtnode = tn;
608 	int olen = tnode_child_length(tn);
609 	int i;
610 
611 	pr_debug("In inflate\n");
612 
613 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
614 
615 	if (!tn)
616 		return ERR_PTR(-ENOMEM);
617 
618 	/*
619 	 * Preallocate and store tnodes before the actual work so we
620 	 * don't get into an inconsistent state if memory allocation
621 	 * fails. In case of failure we return the old node and the
622 	 * inflate of the tnode is skipped.
623 	 */
624 
625 	for (i = 0; i < olen; i++) {
626 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
627 
628 		if (inode &&
629 		    IS_TNODE(inode) &&
630 		    inode->pos == oldtnode->pos + oldtnode->bits &&
631 		    inode->bits > 1) {
632 			struct tnode *left, *right;
633 			t_key m = TKEY_GET_MASK(inode->pos, 1);
634 
635 			left = tnode_new(inode->key&(~m), inode->pos + 1,
636 					 inode->bits - 1);
637 			if (!left)
638 				goto nomem;
639 
640 			right = tnode_new(inode->key|m, inode->pos + 1,
641 					  inode->bits - 1);
642 
643 			if (!right) {
644 				tnode_free(left);
645 				goto nomem;
646 			}
647 
648 			put_child(t, tn, 2*i, (struct node *) left);
649 			put_child(t, tn, 2*i+1, (struct node *) right);
650 		}
651 	}
652 
653 	for (i = 0; i < olen; i++) {
654 		struct node *node = tnode_get_child(oldtnode, i);
655 		struct tnode *left, *right;
656 		int size, j;
657 
658 		/* An empty child */
659 		if (node == NULL)
660 			continue;
661 
662 		/* A leaf or an internal node with skipped bits */
663 
664 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
665 		   tn->pos + tn->bits - 1) {
666 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
667 					     1) == 0)
668 				put_child(t, tn, 2*i, node);
669 			else
670 				put_child(t, tn, 2*i+1, node);
671 			continue;
672 		}
673 
674 		/* An internal node with two children */
675 		inode = (struct tnode *) node;
676 
677 		if (inode->bits == 1) {
678 			put_child(t, tn, 2*i, inode->child[0]);
679 			put_child(t, tn, 2*i+1, inode->child[1]);
680 
681 			tnode_free(inode);
682 			continue;
683 		}
684 
685 		/* An internal node with more than two children */
686 
687 		/* We will replace this node 'inode' with two new
688 		 * ones, 'left' and 'right', each with half of the
689 		 * original children. The two new nodes will have
690 		 * a position one bit further down the key and this
691 		 * means that the "significant" part of their keys
692 		 * (see the discussion near the top of this file)
693 		 * will differ by one bit, which will be "0" in
694 		 * left's key and "1" in right's key. Since we are
695 		 * moving the key position by one step, the bit that
696 		 * we are moving away from - the bit at position
697 		 * (inode->pos) - is the one that will differ between
698 		 * left and right. So... we synthesize that bit in the
699 		 * two  new keys.
700 		 * The mask 'm' below will be a single "one" bit at
701 		 * the position (inode->pos)
702 		 */
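		/*
		 * For example, if inode->pos == 28 then
		 * m = TKEY_GET_MASK(28, 1) == 0x00000008, so in the
		 * preallocation loop above 'left' was given inode->key
		 * with that bit cleared and 'right' was given it with
		 * that bit set.
		 */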
703 
704 		/* Use the old key, but set the new significant
705 		 *   bit to zero.
706 		 */
707 
708 		left = (struct tnode *) tnode_get_child(tn, 2*i);
709 		put_child(t, tn, 2*i, NULL);
710 
711 		BUG_ON(!left);
712 
713 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
714 		put_child(t, tn, 2*i+1, NULL);
715 
716 		BUG_ON(!right);
717 
718 		size = tnode_child_length(left);
719 		for (j = 0; j < size; j++) {
720 			put_child(t, left, j, inode->child[j]);
721 			put_child(t, right, j, inode->child[j + size]);
722 		}
723 		put_child(t, tn, 2*i, resize(t, left));
724 		put_child(t, tn, 2*i+1, resize(t, right));
725 
726 		tnode_free(inode);
727 	}
728 	tnode_free(oldtnode);
729 	return tn;
730 nomem:
731 	{
732 		int size = tnode_child_length(tn);
733 		int j;
734 
735 		for (j = 0; j < size; j++)
736 			if (tn->child[j])
737 				tnode_free((struct tnode *)tn->child[j]);
738 
739 		tnode_free(tn);
740 
741 		return ERR_PTR(-ENOMEM);
742 	}
743 }
744 
745 static struct tnode *halve(struct trie *t, struct tnode *tn)
746 {
747 	struct tnode *oldtnode = tn;
748 	struct node *left, *right;
749 	int i;
750 	int olen = tnode_child_length(tn);
751 
752 	pr_debug("In halve\n");
753 
754 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
755 
756 	if (!tn)
757 		return ERR_PTR(-ENOMEM);
758 
759 	/*
760 	 * Preallocate and store tnodes before the actual work so we
761 	 * don't get into an inconsistent state if memory allocation
762 	 * fails. In case of failure we return the old node and the
763 	 * halve of the tnode is skipped.
764 	 */
765 
766 	for (i = 0; i < olen; i += 2) {
767 		left = tnode_get_child(oldtnode, i);
768 		right = tnode_get_child(oldtnode, i+1);
769 
770 		/* Two nonempty children */
771 		if (left && right) {
772 			struct tnode *newn;
773 
774 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
775 
776 			if (!newn)
777 				goto nomem;
778 
779 			put_child(t, tn, i/2, (struct node *)newn);
780 		}
781 
782 	}
783 
784 	for (i = 0; i < olen; i += 2) {
785 		struct tnode *newBinNode;
786 
787 		left = tnode_get_child(oldtnode, i);
788 		right = tnode_get_child(oldtnode, i+1);
789 
790 		/* At least one of the children is empty */
791 		if (left == NULL) {
792 			if (right == NULL)    /* Both are empty */
793 				continue;
794 			put_child(t, tn, i/2, right);
795 			continue;
796 		}
797 
798 		if (right == NULL) {
799 			put_child(t, tn, i/2, left);
800 			continue;
801 		}
802 
803 		/* Two nonempty children */
804 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
805 		put_child(t, tn, i/2, NULL);
806 		put_child(t, newBinNode, 0, left);
807 		put_child(t, newBinNode, 1, right);
808 		put_child(t, tn, i/2, resize(t, newBinNode));
809 	}
810 	tnode_free(oldtnode);
811 	return tn;
812 nomem:
813 	{
814 		int size = tnode_child_length(tn);
815 		int j;
816 
817 		for (j = 0; j < size; j++)
818 			if (tn->child[j])
819 				tnode_free((struct tnode *)tn->child[j]);
820 
821 		tnode_free(tn);
822 
823 		return ERR_PTR(-ENOMEM);
824 	}
825 }
826 
827 static void trie_init(struct trie *t)
828 {
829 	if (!t)
830 		return;
831 
832 	t->size = 0;
833 	rcu_assign_pointer(t->trie, NULL);
834 	t->revision = 0;
835 #ifdef CONFIG_IP_FIB_TRIE_STATS
836 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
837 #endif
838 }
839 
840 /* The read side must use rcu_read_lock; currently that is the dump
841  * routines, via get_fa_head and dump */
842 
843 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
844 {
845 	struct hlist_node *node;
846 	struct leaf_info *li;
847 
848 	hlist_for_each_entry_rcu(li, node, head, hlist)
849 		if (li->plen == plen)
850 			return li;
851 
852 	return NULL;
853 }
854 
855 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
856 {
857 	struct leaf_info *li = find_leaf_info(&l->list, plen);
858 
859 	if (!li)
860 		return NULL;
861 
862 	return &li->falh;
863 }
864 
865 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
866 {
867 	struct leaf_info *li = NULL, *last = NULL;
868 	struct hlist_node *node;
869 
870 	if (hlist_empty(head)) {
871 		hlist_add_head_rcu(&new->hlist, head);
872 	} else {
873 		hlist_for_each_entry(li, node, head, hlist) {
874 			if (new->plen > li->plen)
875 				break;
876 
877 			last = li;
878 		}
879 		if (last)
880 			hlist_add_after_rcu(&last->hlist, &new->hlist);
881 		else
882 			hlist_add_before_rcu(&new->hlist, &li->hlist);
883 	}
884 }
885 
886 /* rcu_read_lock needs to be held by the caller on the read side */
887 
888 static struct leaf *
889 fib_find_node(struct trie *t, u32 key)
890 {
891 	int pos;
892 	struct tnode *tn;
893 	struct node *n;
894 
895 	pos = 0;
896 	n = rcu_dereference(t->trie);
897 
898 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
899 		tn = (struct tnode *) n;
900 
901 		check_tnode(tn);
902 
903 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
904 			pos = tn->pos + tn->bits;
905 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
906 		} else
907 			break;
908 	}
909 	/* In case we have found a leaf, compare prefixes */
910 
911 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
912 		return (struct leaf *)n;
913 
914 	return NULL;
915 }
916 
917 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
918 {
919 	int wasfull;
920 	t_key cindex, key;
921 	struct tnode *tp = NULL;
922 
923 	key = tn->key;
924 
925 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
926 
927 		tp = NODE_PARENT(tn);
928 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
929 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
930 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
931 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
932 
933 		if (!NODE_PARENT(tn))
934 			break;
935 
936 		tn = NODE_PARENT(tn);
937 	}
938 	/* Handle last (top) tnode */
939 	if (IS_TNODE(tn))
940 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
941 
942 	return (struct node*) tn;
943 }
944 
945 /* only used from updater-side */
946 
947 static  struct list_head *
948 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
949 {
950 	int pos, newpos;
951 	struct tnode *tp = NULL, *tn = NULL;
952 	struct node *n;
953 	struct leaf *l;
954 	int missbit;
955 	struct list_head *fa_head = NULL;
956 	struct leaf_info *li;
957 	t_key cindex;
958 
959 	pos = 0;
960 	n = t->trie;
961 
962 	/* If we point to NULL, stop. Either the tree is empty and we should
963 	 * just put a new leaf in it, or we have reached an empty child slot,
964 	 * and we should just put our new leaf in that.
965 	 * If we point to a T_TNODE, check if it matches our key. Note that
966 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
967 	 * not be the parent's 'pos'+'bits'!
968 	 *
969 	 * If it does match the current key, get pos/bits from it, extract
970 	 * the index from our key, push the T_TNODE and walk the tree.
971 	 *
972 	 * If it doesn't, we have to replace it with a new T_TNODE.
973 	 *
974 	 * If we point to a T_LEAF, it might or might not have the same key
975 	 * as we do. If it does, just change the value, update the T_LEAF's
976 	 * value, and return it.
977 	 * If it doesn't, we need to replace it with a T_TNODE.
978 	 */
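	/*
	 * For example, inserting the very first key finds n == NULL and
	 * tp == NULL: the "first tnode" branch below wraps the new leaf
	 * in a 1-bit tnode, and the resize() done by trie_rebalance()
	 * then compresses that tnode away again, leaving the lone leaf
	 * as t->trie.
	 */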
979 
980 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
981 		tn = (struct tnode *) n;
982 
983 		check_tnode(tn);
984 
985 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
986 			tp = tn;
987 			pos = tn->pos + tn->bits;
988 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
989 
990 			BUG_ON(n && NODE_PARENT(n) != tn);
991 		} else
992 			break;
993 	}
994 
995 	/*
996 	 * n  ----> NULL, LEAF or TNODE
997 	 *
998 	 * tp is n's (parent) ----> NULL or TNODE
999 	 */
1000 
1001 	BUG_ON(tp && IS_LEAF(tp));
1002 
1003 	/* Case 1: n is a leaf. Compare prefixes */
1004 
1005 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1006 		struct leaf *l = (struct leaf *) n;
1007 
1008 		li = leaf_info_new(plen);
1009 
1010 		if (!li) {
1011 			*err = -ENOMEM;
1012 			goto err;
1013 		}
1014 
1015 		fa_head = &li->falh;
1016 		insert_leaf_info(&l->list, li);
1017 		goto done;
1018 	}
1019 	t->size++;
1020 	l = leaf_new();
1021 
1022 	if (!l) {
1023 		*err = -ENOMEM;
1024 		goto err;
1025 	}
1026 
1027 	l->key = key;
1028 	li = leaf_info_new(plen);
1029 
1030 	if (!li) {
1031 		tnode_free((struct tnode *) l);
1032 		*err = -ENOMEM;
1033 		goto err;
1034 	}
1035 
1036 	fa_head = &li->falh;
1037 	insert_leaf_info(&l->list, li);
1038 
1039 	if (t->trie && n == NULL) {
1040 		/* Case 2: n is NULL, and will just insert a new leaf */
1041 
1042 		NODE_SET_PARENT(l, tp);
1043 
1044 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1045 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1046 	} else {
1047 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1048 		/*
1049 		 *  Add a new tnode here
1050 		 *  the first tnode needs some special handling
1051 		 */
1052 
1053 		if (tp)
1054 			pos = tp->pos+tp->bits;
1055 		else
1056 			pos = 0;
1057 
1058 		if (n) {
1059 			newpos = tkey_mismatch(key, pos, n->key);
1060 			tn = tnode_new(n->key, newpos, 1);
1061 		} else {
1062 			newpos = 0;
1063 			tn = tnode_new(key, newpos, 1); /* First tnode */
1064 		}
1065 
1066 		if (!tn) {
1067 			free_leaf_info(li);
1068 			tnode_free((struct tnode *) l);
1069 			*err = -ENOMEM;
1070 			goto err;
1071 		}
1072 
1073 		NODE_SET_PARENT(tn, tp);
1074 
1075 		missbit = tkey_extract_bits(key, newpos, 1);
1076 		put_child(t, tn, missbit, (struct node *)l);
1077 		put_child(t, tn, 1-missbit, n);
1078 
1079 		if (tp) {
1080 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1081 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1082 		} else {
1083 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1084 			tp = tn;
1085 		}
1086 	}
1087 
1088 	if (tp && tp->pos + tp->bits > 32)
1089 		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1090 		       tp, tp->pos, tp->bits, key, plen);
1091 
1092 	/* Rebalance the trie */
1093 
1094 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1095 done:
1096 	t->revision++;
1097 err:
1098 	return fa_head;
1099 }
1100 
1101 static int
1102 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1103 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1104 {
1105 	struct trie *t = (struct trie *) tb->tb_data;
1106 	struct fib_alias *fa, *new_fa;
1107 	struct list_head *fa_head = NULL;
1108 	struct fib_info *fi;
1109 	int plen = r->rtm_dst_len;
1110 	int type = r->rtm_type;
1111 	u8 tos = r->rtm_tos;
1112 	u32 key, mask;
1113 	int err;
1114 	struct leaf *l;
1115 
1116 	if (plen > 32)
1117 		return -EINVAL;
1118 
1119 	key = 0;
1120 	if (rta->rta_dst)
1121 		memcpy(&key, rta->rta_dst, 4);
1122 
1123 	key = ntohl(key);
1124 
1125 	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1126 
1127 	mask = ntohl(inet_make_mask(plen));
1128 
1129 	if (key & ~mask)
1130 		return -EINVAL;
1131 
1132 	key = key & mask;
1133 
1134 	fi = fib_create_info(r, rta, nlhdr, &err);
1135 
1136 	if (!fi)
1137 		goto err;
1138 
1139 	l = fib_find_node(t, key);
1140 	fa = NULL;
1141 
1142 	if (l) {
1143 		fa_head = get_fa_head(l, plen);
1144 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1145 	}
1146 
1147 	/* Now fa, if non-NULL, points to the first fib alias
1148 	 * with the same keys [prefix,tos,priority], if such key already
1149 	 * exists or to the node before which we will insert new one.
1150 	 *
1151 	 * If fa is NULL, we will need to allocate a new one and
1152 	 * insert it into the list headed by fa_head.
1153 	 *
1154 	 * If fa_head is NULL, no fib node matched the destination key
1155 	 * and we need to allocate a new one of those as well.
1156 	 */
1157 
1158 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1159 		struct fib_alias *fa_orig;
1160 
1161 		err = -EEXIST;
1162 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1163 			goto out;
1164 
1165 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1166 			struct fib_info *fi_drop;
1167 			u8 state;
1168 
1169 			err = -ENOBUFS;
1170 			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1171 			if (new_fa == NULL)
1172 				goto out;
1173 
1174 			fi_drop = fa->fa_info;
1175 			new_fa->fa_tos = fa->fa_tos;
1176 			new_fa->fa_info = fi;
1177 			new_fa->fa_type = type;
1178 			new_fa->fa_scope = r->rtm_scope;
1179 			state = fa->fa_state;
1180 			new_fa->fa_state &= ~FA_S_ACCESSED;
1181 
1182 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1183 			alias_free_mem_rcu(fa);
1184 
1185 			fib_release_info(fi_drop);
1186 			if (state & FA_S_ACCESSED)
1187 				rt_cache_flush(-1);
1188 
1189 			goto succeeded;
1190 		}
1191 		/* Error if we find a perfect match which
1192 		 * uses the same scope, type, and nexthop
1193 		 * information.
1194 		 */
1195 		fa_orig = fa;
1196 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1197 			if (fa->fa_tos != tos)
1198 				break;
1199 			if (fa->fa_info->fib_priority != fi->fib_priority)
1200 				break;
1201 			if (fa->fa_type == type &&
1202 			    fa->fa_scope == r->rtm_scope &&
1203 			    fa->fa_info == fi) {
1204 				goto out;
1205 			}
1206 		}
1207 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1208 			fa = fa_orig;
1209 	}
1210 	err = -ENOENT;
1211 	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1212 		goto out;
1213 
1214 	err = -ENOBUFS;
1215 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1216 	if (new_fa == NULL)
1217 		goto out;
1218 
1219 	new_fa->fa_info = fi;
1220 	new_fa->fa_tos = tos;
1221 	new_fa->fa_type = type;
1222 	new_fa->fa_scope = r->rtm_scope;
1223 	new_fa->fa_state = 0;
1224 	/*
1225 	 * Insert new entry to the list.
1226 	 */
1227 
1228 	if (!fa_head) {
1229 		err = 0;
1230 		fa_head = fib_insert_node(t, &err, key, plen);
1231 		if (err)
1232 			goto out_free_new_fa;
1233 	}
1234 
1235 	list_add_tail_rcu(&new_fa->fa_list,
1236 			  (fa ? &fa->fa_list : fa_head));
1237 
1238 	rt_cache_flush(-1);
1239 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1240 succeeded:
1241 	return 0;
1242 
1243 out_free_new_fa:
1244 	kmem_cache_free(fn_alias_kmem, new_fa);
1245 out:
1246 	fib_release_info(fi);
1247 err:
1248 	return err;
1249 }
1250 
1251 
1252 /* should be called with rcu_read_lock held */
1253 static inline int check_leaf(struct trie *t, struct leaf *l,
1254 			     t_key key, int *plen, const struct flowi *flp,
1255 			     struct fib_result *res)
1256 {
1257 	int err, i;
1258 	t_key mask;
1259 	struct leaf_info *li;
1260 	struct hlist_head *hhead = &l->list;
1261 	struct hlist_node *node;
1262 
1263 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1264 		i = li->plen;
1265 		mask = ntohl(inet_make_mask(i));
1266 		if (l->key != (key & mask))
1267 			continue;
1268 
1269 		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1270 			*plen = i;
1271 #ifdef CONFIG_IP_FIB_TRIE_STATS
1272 			t->stats.semantic_match_passed++;
1273 #endif
1274 			return err;
1275 		}
1276 #ifdef CONFIG_IP_FIB_TRIE_STATS
1277 		t->stats.semantic_match_miss++;
1278 #endif
1279 	}
1280 	return 1;
1281 }
1282 
1283 static int
1284 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1285 {
1286 	struct trie *t = (struct trie *) tb->tb_data;
1287 	int plen, ret = 0;
1288 	struct node *n;
1289 	struct tnode *pn;
1290 	int pos, bits;
1291 	t_key key = ntohl(flp->fl4_dst);
1292 	int chopped_off;
1293 	t_key cindex = 0;
1294 	int current_prefix_length = KEYLENGTH;
1295 	struct tnode *cn;
1296 	t_key node_prefix, key_prefix, pref_mismatch;
1297 	int mp;
1298 
1299 	rcu_read_lock();
1300 
1301 	n = rcu_dereference(t->trie);
1302 	if (!n)
1303 		goto failed;
1304 
1305 #ifdef CONFIG_IP_FIB_TRIE_STATS
1306 	t->stats.gets++;
1307 #endif
1308 
1309 	/* Just a leaf? */
1310 	if (IS_LEAF(n)) {
1311 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1312 			goto found;
1313 		goto failed;
1314 	}
1315 	pn = (struct tnode *) n;
1316 	chopped_off = 0;
1317 
1318 	while (pn) {
1319 		pos = pn->pos;
1320 		bits = pn->bits;
1321 
1322 		if (!chopped_off)
1323 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1324 
1325 		n = tnode_get_child(pn, cindex);
1326 
1327 		if (n == NULL) {
1328 #ifdef CONFIG_IP_FIB_TRIE_STATS
1329 			t->stats.null_node_hit++;
1330 #endif
1331 			goto backtrace;
1332 		}
1333 
1334 		if (IS_LEAF(n)) {
1335 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1336 				goto found;
1337 			else
1338 				goto backtrace;
1339 		}
1340 
1341 #define HL_OPTIMIZE
1342 #ifdef HL_OPTIMIZE
1343 		cn = (struct tnode *)n;
1344 
1345 		/*
1346 		 * It's a tnode, and we can do some extra checks here if we
1347 		 * like, to avoid descending into a dead-end branch.
1348 		 * This tnode is in the parent's child array at index
1349 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1350 		 * chopped off, so in reality the index may be just a
1351 		 * subprefix, padded with zero at the end.
1352 		 * We can also take a look at any skipped bits in this
1353 		 * tnode - everything up to p_pos is supposed to be ok,
1354 		 * and the non-chopped bits of the index (see the previous
1355 		 * paragraph) are also guaranteed ok, but the rest is
1356 		 * considered unknown.
1357 		 *
1358 		 * The skipped bits are key[pos+bits..cn->pos].
1359 		 */
1360 
1361 		/* If current_prefix_length < pos+bits, we are already doing
1362 		 * actual prefix  matching, which means everything from
1363 		 * pos+(bits-chopped_off) onward must be zero along some
1364 		 * branch of this subtree - otherwise there is *no* valid
1365 		 * prefix present. Here we can only check the skipped
1366 		 * bits. Remember, since we have already indexed into the
1367 		 * parent's child array, we know that the bits we chopped off
1368 		 * *are* zero.
1369 		 */
1370 
1371 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1372 
1373 		if (current_prefix_length < pos+bits) {
1374 			if (tkey_extract_bits(cn->key, current_prefix_length,
1375 						cn->pos - current_prefix_length) != 0 ||
1376 			    !(cn->child[0]))
1377 				goto backtrace;
1378 		}
1379 
1380 		/*
1381 		 * If chopped_off=0, the index is fully validated and we
1382 		 * only need to look at the skipped bits for this, the new,
1383 		 * tnode. What we actually want to do is to find out if
1384 		 * these skipped bits match our key perfectly, or if we will
1385 		 * have to count on finding a matching prefix further down,
1386 		 * because if we do, we would like to have some way of
1387 		 * verifying the existence of such a prefix at this point.
1388 		 */
1389 
1390 		/* The only thing we can do at this point is to verify that
1391 		 * any such matching prefix can indeed be a prefix to our
1392 		 * key, and if the bits in the node we are inspecting that
1393 		 * do not match our key are not ZERO, this cannot be true.
1394 		 * Thus, find out where there is a mismatch (before cn->pos)
1395 		 * and verify that all the mismatching bits are zero in the
1396 		 * new tnode's key.
1397 		 */
1398 
1399 		/* Note: We aren't very concerned about the piece of the key
1400 		 * that precedes pn->pos+pn->bits, since these have already been
1401 		 * checked. The bits after cn->pos aren't checked since these are
1402 		 * by definition "unknown" at this point. Thus, what we want to
1403 		 * see is if we are about to enter the "prefix matching" state,
1404 		 * and in that case verify that the skipped bits that will prevail
1405 		 * throughout this subtree are zero, as they have to be if we are
1406 		 * to find a matching prefix.
1407 		 */
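		/*
		 * For example, suppose cn->pos == 20 and the search key
		 * first differs from cn->key at bit 12 (counting from the
		 * MSB), so mp ends up as 12. The check below then looks
		 * at cn->key bits 12..19: if any of them is set, no prefix
		 * of our key can live in this subtree and we backtrack;
		 * if they are all zero we continue, with
		 * current_prefix_length clipped to 12 unless it was
		 * already below cn->pos.
		 */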
1408 
1409 		node_prefix = MASK_PFX(cn->key, cn->pos);
1410 		key_prefix = MASK_PFX(key, cn->pos);
1411 		pref_mismatch = key_prefix^node_prefix;
1412 		mp = 0;
1413 
1414 		/* In short: If skipped bits in this node do not match the search
1415 		 * key, enter the "prefix matching" state directly.
1416 		 */
1417 		if (pref_mismatch) {
1418 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1419 				mp++;
1420 				pref_mismatch = pref_mismatch <<1;
1421 			}
1422 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1423 
1424 			if (key_prefix != 0)
1425 				goto backtrace;
1426 
1427 			if (current_prefix_length >= cn->pos)
1428 				current_prefix_length = mp;
1429 		}
1430 #endif
1431 		pn = (struct tnode *)n; /* Descend */
1432 		chopped_off = 0;
1433 		continue;
1434 
1435 backtrace:
1436 		chopped_off++;
1437 
1438 		/* Trailing zeros don't change the child key (cindex), so skip them */
1439 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1440 			chopped_off++;
1441 
1442 		/* Decrease current_... with bits chopped off */
1443 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1444 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1445 
1446 		/*
1447 		 * Either we do the actual chop off here, or, if we have
1448 		 * chopped off all bits in this tnode, we walk up to our parent.
1449 		 */
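		/*
		 * For example, with pn->bits == 3 and cindex == 6 (0b110):
		 * chopped_off becomes 1, the loop above skips the trailing
		 * zero bit so chopped_off becomes 2, and since 2 <= pn->bits
		 * we clear bit 1 and retry child 4 (0b100) with a shorter
		 * effective prefix instead of climbing to the parent.
		 */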
1450 
1451 		if (chopped_off <= pn->bits) {
1452 			cindex &= ~(1 << (chopped_off-1));
1453 		} else {
1454 			if (NODE_PARENT(pn) == NULL)
1455 				goto failed;
1456 
1457 			/* Get Child's index */
1458 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1459 			pn = NODE_PARENT(pn);
1460 			chopped_off = 0;
1461 
1462 #ifdef CONFIG_IP_FIB_TRIE_STATS
1463 			t->stats.backtrack++;
1464 #endif
1465 			goto backtrace;
1466 		}
1467 	}
1468 failed:
1469 	ret = 1;
1470 found:
1471 	rcu_read_unlock();
1472 	return ret;
1473 }
1474 
1475 /* only called from updater side */
1476 static int trie_leaf_remove(struct trie *t, t_key key)
1477 {
1478 	t_key cindex;
1479 	struct tnode *tp = NULL;
1480 	struct node *n = t->trie;
1481 	struct leaf *l;
1482 
1483 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1484 
1485 	/* Note that in the case of skipped bits, those bits are *not* checked!
1486 	 * When we finish this, we will have NULL or a T_LEAF, and the
1487 	 * T_LEAF may or may not match our key.
1488 	 */
1489 
1490 	while (n != NULL && IS_TNODE(n)) {
1491 		struct tnode *tn = (struct tnode *) n;
1492 		check_tnode(tn);
1493 		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1494 
1495 		BUG_ON(n && NODE_PARENT(n) != tn);
1496 	}
1497 	l = (struct leaf *) n;
1498 
1499 	if (!n || !tkey_equals(l->key, key))
1500 		return 0;
1501 
1502 	/*
1503 	 * Key found.
1504 	 * Remove the leaf and rebalance the tree
1505 	 */
1506 
1507 	t->revision++;
1508 	t->size--;
1509 
1510 	preempt_disable();
1511 	tp = NODE_PARENT(n);
1512 	tnode_free((struct tnode *) n);
1513 
1514 	if (tp) {
1515 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1516 		put_child(t, (struct tnode *)tp, cindex, NULL);
1517 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1518 	} else
1519 		rcu_assign_pointer(t->trie, NULL);
1520 	preempt_enable();
1521 
1522 	return 1;
1523 }
1524 
1525 static int
1526 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1527 		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1528 {
1529 	struct trie *t = (struct trie *) tb->tb_data;
1530 	u32 key, mask;
1531 	int plen = r->rtm_dst_len;
1532 	u8 tos = r->rtm_tos;
1533 	struct fib_alias *fa, *fa_to_delete;
1534 	struct list_head *fa_head;
1535 	struct leaf *l;
1536 	struct leaf_info *li;
1537 
1538 
1539 	if (plen > 32)
1540 		return -EINVAL;
1541 
1542 	key = 0;
1543 	if (rta->rta_dst)
1544 		memcpy(&key, rta->rta_dst, 4);
1545 
1546 	key = ntohl(key);
1547 	mask = ntohl(inet_make_mask(plen));
1548 
1549 	if (key & ~mask)
1550 		return -EINVAL;
1551 
1552 	key = key & mask;
1553 	l = fib_find_node(t, key);
1554 
1555 	if (!l)
1556 		return -ESRCH;
1557 
1558 	fa_head = get_fa_head(l, plen);
1559 	fa = fib_find_alias(fa_head, tos, 0);
1560 
1561 	if (!fa)
1562 		return -ESRCH;
1563 
1564 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1565 
1566 	fa_to_delete = NULL;
1567 	fa_head = fa->fa_list.prev;
1568 
1569 	list_for_each_entry(fa, fa_head, fa_list) {
1570 		struct fib_info *fi = fa->fa_info;
1571 
1572 		if (fa->fa_tos != tos)
1573 			break;
1574 
1575 		if ((!r->rtm_type ||
1576 		     fa->fa_type == r->rtm_type) &&
1577 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1578 		     fa->fa_scope == r->rtm_scope) &&
1579 		    (!r->rtm_protocol ||
1580 		     fi->fib_protocol == r->rtm_protocol) &&
1581 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1582 			fa_to_delete = fa;
1583 			break;
1584 		}
1585 	}
1586 
1587 	if (!fa_to_delete)
1588 		return -ESRCH;
1589 
1590 	fa = fa_to_delete;
1591 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1592 
1593 	l = fib_find_node(t, key);
1594 	li = find_leaf_info(&l->list, plen);
1595 
1596 	list_del_rcu(&fa->fa_list);
1597 
1598 	if (list_empty(fa_head)) {
1599 		hlist_del_rcu(&li->hlist);
1600 		free_leaf_info(li);
1601 	}
1602 
1603 	if (hlist_empty(&l->list))
1604 		trie_leaf_remove(t, key);
1605 
1606 	if (fa->fa_state & FA_S_ACCESSED)
1607 		rt_cache_flush(-1);
1608 
1609 	fib_release_info(fa->fa_info);
1610 	alias_free_mem_rcu(fa);
1611 	return 0;
1612 }
1613 
1614 static int trie_flush_list(struct trie *t, struct list_head *head)
1615 {
1616 	struct fib_alias *fa, *fa_node;
1617 	int found = 0;
1618 
1619 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1620 		struct fib_info *fi = fa->fa_info;
1621 
1622 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1623 			list_del_rcu(&fa->fa_list);
1624 			fib_release_info(fa->fa_info);
1625 			alias_free_mem_rcu(fa);
1626 			found++;
1627 		}
1628 	}
1629 	return found;
1630 }
1631 
1632 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1633 {
1634 	int found = 0;
1635 	struct hlist_head *lih = &l->list;
1636 	struct hlist_node *node, *tmp;
1637 	struct leaf_info *li = NULL;
1638 
1639 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1640 		found += trie_flush_list(t, &li->falh);
1641 
1642 		if (list_empty(&li->falh)) {
1643 			hlist_del_rcu(&li->hlist);
1644 			free_leaf_info(li);
1645 		}
1646 	}
1647 	return found;
1648 }
1649 
1650 /* rcu_read_lock needs to be held by the caller on the read side */
1651 
1652 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1653 {
1654 	struct node *c = (struct node *) thisleaf;
1655 	struct tnode *p;
1656 	int idx;
1657 	struct node *trie = rcu_dereference(t->trie);
1658 
1659 	if (c == NULL) {
1660 		if (trie == NULL)
1661 			return NULL;
1662 
1663 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1664 			return (struct leaf *) trie;
1665 
1666 		p = (struct tnode*) trie;  /* Start */
1667 	} else
1668 		p = (struct tnode *) NODE_PARENT(c);
1669 
1670 	while (p) {
1671 		int pos, last;
1672 
1673 		/*  Find the next child of the parent */
1674 		if (c)
1675 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1676 		else
1677 			pos = 0;
1678 
1679 		last = 1 << p->bits;
1680 		for (idx = pos; idx < last ; idx++) {
1681 			c = rcu_dereference(p->child[idx]);
1682 
1683 			if (!c)
1684 				continue;
1685 
1686 			/* Descend if tnode */
1687 			while (IS_TNODE(c)) {
1688 				p = (struct tnode *) c;
1689 				idx = 0;
1690 
1691 				/* Leftmost non-NULL branch */
1692 				if (p && IS_TNODE(p))
1693 					while (!(c = rcu_dereference(p->child[idx]))
1694 					       && idx < (1<<p->bits)) idx++;
1695 
1696 				/* Done with this tnode? */
1697 				if (idx >= (1 << p->bits) || !c)
1698 					goto up;
1699 			}
1700 			return (struct leaf *) c;
1701 		}
1702 up:
1703 		/* No more children, go up one step */
1704 		c = (struct node *) p;
1705 		p = (struct tnode *) NODE_PARENT(p);
1706 	}
1707 	return NULL; /* Ready. Root of trie */
1708 }
1709 
1710 static int fn_trie_flush(struct fib_table *tb)
1711 {
1712 	struct trie *t = (struct trie *) tb->tb_data;
1713 	struct leaf *ll = NULL, *l = NULL;
1714 	int found = 0, h;
1715 
1716 	t->revision++;
1717 
1718 	rcu_read_lock();
1719 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1720 		found += trie_flush_leaf(t, l);
1721 
1722 		if (ll && hlist_empty(&ll->list))
1723 			trie_leaf_remove(t, ll->key);
1724 		ll = l;
1725 	}
1726 	rcu_read_unlock();
1727 
1728 	if (ll && hlist_empty(&ll->list))
1729 		trie_leaf_remove(t, ll->key);
1730 
1731 	pr_debug("trie_flush found=%d\n", found);
1732 	return found;
1733 }
1734 
1735 static int trie_last_dflt = -1;
1736 
1737 static void
1738 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1739 {
1740 	struct trie *t = (struct trie *) tb->tb_data;
1741 	int order, last_idx;
1742 	struct fib_info *fi = NULL;
1743 	struct fib_info *last_resort;
1744 	struct fib_alias *fa = NULL;
1745 	struct list_head *fa_head;
1746 	struct leaf *l;
1747 
1748 	last_idx = -1;
1749 	last_resort = NULL;
1750 	order = -1;
1751 
1752 	rcu_read_lock();
1753 
1754 	l = fib_find_node(t, 0);
1755 	if (!l)
1756 		goto out;
1757 
1758 	fa_head = get_fa_head(l, 0);
1759 	if (!fa_head)
1760 		goto out;
1761 
1762 	if (list_empty(fa_head))
1763 		goto out;
1764 
1765 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1766 		struct fib_info *next_fi = fa->fa_info;
1767 
1768 		if (fa->fa_scope != res->scope ||
1769 		    fa->fa_type != RTN_UNICAST)
1770 			continue;
1771 
1772 		if (next_fi->fib_priority > res->fi->fib_priority)
1773 			break;
1774 		if (!next_fi->fib_nh[0].nh_gw ||
1775 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1776 			continue;
1777 		fa->fa_state |= FA_S_ACCESSED;
1778 
1779 		if (fi == NULL) {
1780 			if (next_fi != res->fi)
1781 				break;
1782 		} else if (!fib_detect_death(fi, order, &last_resort,
1783 					     &last_idx, &trie_last_dflt)) {
1784 			if (res->fi)
1785 				fib_info_put(res->fi);
1786 			res->fi = fi;
1787 			atomic_inc(&fi->fib_clntref);
1788 			trie_last_dflt = order;
1789 			goto out;
1790 		}
1791 		fi = next_fi;
1792 		order++;
1793 	}
1794 	if (order <= 0 || fi == NULL) {
1795 		trie_last_dflt = -1;
1796 		goto out;
1797 	}
1798 
1799 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1800 		if (res->fi)
1801 			fib_info_put(res->fi);
1802 		res->fi = fi;
1803 		atomic_inc(&fi->fib_clntref);
1804 		trie_last_dflt = order;
1805 		goto out;
1806 	}
1807 	if (last_idx >= 0) {
1808 		if (res->fi)
1809 			fib_info_put(res->fi);
1810 		res->fi = last_resort;
1811 		if (last_resort)
1812 			atomic_inc(&last_resort->fib_clntref);
1813 	}
1814 	trie_last_dflt = last_idx;
1815  out:;
1816 	rcu_read_unlock();
1817 }
1818 
1819 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1820 			   struct sk_buff *skb, struct netlink_callback *cb)
1821 {
1822 	int i, s_i;
1823 	struct fib_alias *fa;
1824 
1825 	u32 xkey = htonl(key);
1826 
1827 	s_i = cb->args[3];
1828 	i = 0;
1829 
1830 	/* rcu_read_lock is held by the caller */
1831 
1832 	list_for_each_entry_rcu(fa, fah, fa_list) {
1833 		if (i < s_i) {
1834 			i++;
1835 			continue;
1836 		}
1837 		if (fa->fa_info == NULL) {
1838 			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1839 			i++;
1840 			continue;
1841 		}
1842 		if (fa->fa_info->fib_nh == NULL) {
1843 			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1844 			i++;
1845 			continue;
1846 		}
1847 
1848 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1849 				  cb->nlh->nlmsg_seq,
1850 				  RTM_NEWROUTE,
1851 				  tb->tb_id,
1852 				  fa->fa_type,
1853 				  fa->fa_scope,
1854 				  &xkey,
1855 				  plen,
1856 				  fa->fa_tos,
1857 				  fa->fa_info, 0) < 0) {
1858 			cb->args[3] = i;
1859 			return -1;
1860 		}
1861 		i++;
1862 	}
1863 	cb->args[3] = i;
1864 	return skb->len;
1865 }
1866 
1867 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1868 			     struct netlink_callback *cb)
1869 {
1870 	int h, s_h;
1871 	struct list_head *fa_head;
1872 	struct leaf *l = NULL;
1873 
1874 	s_h = cb->args[2];
1875 
1876 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1877 		if (h < s_h)
1878 			continue;
1879 		if (h > s_h)
1880 			memset(&cb->args[3], 0,
1881 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1882 
1883 		fa_head = get_fa_head(l, plen);
1884 
1885 		if (!fa_head)
1886 			continue;
1887 
1888 		if (list_empty(fa_head))
1889 			continue;
1890 
1891 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1892 			cb->args[2] = h;
1893 			return -1;
1894 		}
1895 	}
1896 	cb->args[2] = h;
1897 	return skb->len;
1898 }
1899 
1900 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1901 {
1902 	int m, s_m;
1903 	struct trie *t = (struct trie *) tb->tb_data;
1904 
1905 	s_m = cb->args[1];
1906 
1907 	rcu_read_lock();
1908 	for (m = 0; m <= 32; m++) {
1909 		if (m < s_m)
1910 			continue;
1911 		if (m > s_m)
1912 			memset(&cb->args[2], 0,
1913 				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1914 
1915 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1916 			cb->args[1] = m;
1917 			goto out;
1918 		}
1919 	}
1920 	rcu_read_unlock();
1921 	cb->args[1] = m;
1922 	return skb->len;
1923 out:
1924 	rcu_read_unlock();
1925 	return -1;
1926 }
1927 
1928 /* Fix more generic FIB names for init later */
1929 
1930 #ifdef CONFIG_IP_MULTIPLE_TABLES
1931 struct fib_table * fib_hash_init(int id)
1932 #else
1933 struct fib_table * __init fib_hash_init(int id)
1934 #endif
1935 {
1936 	struct fib_table *tb;
1937 	struct trie *t;
1938 
1939 	if (fn_alias_kmem == NULL)
1940 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1941 						  sizeof(struct fib_alias),
1942 						  0, SLAB_HWCACHE_ALIGN,
1943 						  NULL, NULL);
1944 
1945 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1946 		     GFP_KERNEL);
1947 	if (tb == NULL)
1948 		return NULL;
1949 
1950 	tb->tb_id = id;
1951 	tb->tb_lookup = fn_trie_lookup;
1952 	tb->tb_insert = fn_trie_insert;
1953 	tb->tb_delete = fn_trie_delete;
1954 	tb->tb_flush = fn_trie_flush;
1955 	tb->tb_select_default = fn_trie_select_default;
1956 	tb->tb_dump = fn_trie_dump;
1957 	memset(tb->tb_data, 0, sizeof(struct trie));
1958 
1959 	t = (struct trie *) tb->tb_data;
1960 
1961 	trie_init(t);
1962 
1963 	if (id == RT_TABLE_LOCAL)
1964 		trie_local = t;
1965 	else if (id == RT_TABLE_MAIN)
1966 		trie_main = t;
1967 
1968 	if (id == RT_TABLE_LOCAL)
1969 		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1970 
1971 	return tb;
1972 }
1973 
1974 /* Trie dump functions */
1975 
1976 static void putspace_seq(struct seq_file *seq, int n)
1977 {
1978 	while (n--)
1979 		seq_printf(seq, " ");
1980 }
1981 
1982 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1983 {
1984 	while (bits--)
1985 		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1986 }
1987 
1988 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
1989 		   int pend, int cindex, int bits)
1990 {
1991 	putspace_seq(seq, indent);
1992 	if (IS_LEAF(n))
1993 		seq_printf(seq, "|");
1994 	else
1995 		seq_printf(seq, "+");
1996 	if (bits) {
1997 		seq_printf(seq, "%d/", cindex);
1998 		printbin_seq(seq, cindex, bits);
1999 		seq_printf(seq, ": ");
2000 	} else
2001 		seq_printf(seq, "<root>: ");
2002 	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2003 
2004 	if (IS_LEAF(n)) {
2005 		struct leaf *l = (struct leaf *)n;
2006 		struct fib_alias *fa;
2007 		int i;
2008 
2009 		seq_printf(seq, "key=%d.%d.%d.%d\n",
2010 			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2011 
2012 		for (i = 32; i >= 0; i--)
2013 			if (find_leaf_info(&l->list, i)) {
2014 				struct list_head *fa_head = get_fa_head(l, i);
2015 
2016 				if (!fa_head)
2017 					continue;
2018 
2019 				if (list_empty(fa_head))
2020 					continue;
2021 
2022 				putspace_seq(seq, indent+2);
2023 				seq_printf(seq, "{/%d...dumping}\n", i);
2024 
2025 				list_for_each_entry_rcu(fa, fa_head, fa_list) {
2026 					putspace_seq(seq, indent+2);
2027 					if (fa->fa_info == NULL) {
2028 						seq_printf(seq, "Error fa_info=NULL\n");
2029 						continue;
2030 					}
2031 					if (fa->fa_info->fib_nh == NULL) {
						seq_printf(seq, "Error fib_nh=NULL\n");
2033 						continue;
2034 					}
2035 
2036 					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2037 					      fa->fa_type,
2038 					      fa->fa_scope,
2039 					      fa->fa_tos);
2040 				}
2041 			}
2042 	} else {
2043 		struct tnode *tn = (struct tnode *)n;
		int plen = tn->pos;
2045 		t_key prf = MASK_PFX(n->key, plen);
2046 
2047 		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2048 			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2049 
2050 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2051 		seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2052 		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2053 		seq_printf(seq, "}\n");
2054 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2055 		seq_printf(seq, "{pos=%d", tn->pos);
2056 		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2057 		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2058 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
2059 		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2060 	}
2061 }
2062 
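/*
 * Dump a whole trie without recursion: cindex is the child slot being
 * visited in the current tnode, and once a tnode's slots are exhausted
 * the walk climbs back up via NODE_PARENT(), recomputing the slot index
 * with tkey_extract_bits().
 */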
2063 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2064 {
2065 	struct node *n;
2066 	int cindex = 0;
2067 	int indent = 1;
2068 	int pend = 0;
2069 	int depth = 0;
2070 	struct tnode *tn;
2071 
2072 	rcu_read_lock();
2073 	n = rcu_dereference(t->trie);
2074 	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2075 
2076 	if (!n) {
2077 		seq_printf(seq, "------ trie is empty\n");
2078 
2079 		rcu_read_unlock();
2080 		return;
2081 	}
2082 
2083 	printnode_seq(seq, indent, n, pend, cindex, 0);
2084 
2085 	if (!IS_TNODE(n)) {
2086 		rcu_read_unlock();
2087 		return;
2088 	}
2089 
2090 	tn = (struct tnode *)n;
2091 	pend = tn->pos+tn->bits;
2092 	putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2093 	indent += 3;
2094 	depth++;
2095 
2096 	while (tn && cindex < (1 << tn->bits)) {
2097 		struct node *child = rcu_dereference(tn->child[cindex]);
2098 		if (!child)
2099 			cindex++;
2100 		else {
2101 			/* Got a child */
2102 			printnode_seq(seq, indent, child, pend,
2103 				      cindex, tn->bits);
2104 
2105 			if (IS_LEAF(child))
2106 				cindex++;
2107 
2108 			else {
2109 				/*
2110 				 * New tnode. Decend one level
				 * New tnode. Descend one level.
2112 
2113 				depth++;
2114 				n = child;
2115 				tn = (struct tnode *)n;
2116 				pend = tn->pos+tn->bits;
2117 				putspace_seq(seq, indent);
2118 				seq_printf(seq, "\\--\n");
2119 				indent += 3;
2120 				cindex = 0;
2121 			}
2122 		}
2123 
2124 		/*
2125 		 * Test if we are done
2126 		 */
2127 
2128 		while (cindex >= (1 << tn->bits)) {
2129 			/*
2130 			 * Move upwards and test for root
			 * Move upwards and test for root;
			 * pop off all traversed nodes.
2133 
2134 			if (NODE_PARENT(tn) == NULL) {
2135 				tn = NULL;
2136 				break;
2137 			}
2138 
2139 			cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2140 			cindex++;
2141 			tn = NODE_PARENT(tn);
2142 			pend = tn->pos + tn->bits;
2143 			indent -= 3;
2144 			depth--;
2145 		}
2146 	}
2147 	rcu_read_unlock();
2148 }
2149 
2150 static struct trie_stat *trie_stat_new(void)
2151 {
2152 	struct trie_stat *s;
2153 	int i;
2154 
2155 	s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2156 	if (!s)
2157 		return NULL;
2158 
2159 	s->totdepth = 0;
2160 	s->maxdepth = 0;
2161 	s->tnodes = 0;
2162 	s->leaves = 0;
2163 	s->nullpointers = 0;
2164 
2165 	for (i = 0; i < MAX_CHILDS; i++)
2166 		s->nodesizes[i] = 0;
2167 
2168 	return s;
2169 }
2170 
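/*
 * Walk the trie the same way trie_dump_seq() does, but accumulate
 * statistics instead of printing: leaf and tnode counts, total and
 * maximum leaf depth, a histogram of tnode sizes in nodesizes[] and the
 * number of NULL child slots.
 */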
2171 static struct trie_stat *trie_collect_stats(struct trie *t)
2172 {
2173 	struct node *n;
2174 	struct trie_stat *s = trie_stat_new();
2175 	int cindex = 0;
2176 	int pend = 0;
2177 	int depth = 0;
2178 
2179 	if (!s)
2180 		return NULL;
2181 
2182 	rcu_read_lock();
2183 	n = rcu_dereference(t->trie);
2184 
	if (!n) {
		/* empty trie: drop the RCU read lock taken above */
		rcu_read_unlock();
		return s;
	}
2187 
2188 	if (IS_TNODE(n)) {
2189 		struct tnode *tn = (struct tnode *)n;
2190 		pend = tn->pos+tn->bits;
		s->tnodes++;		/* count the root tnode as well */
		s->nodesizes[tn->bits]++;
2192 		depth++;
2193 
2194 		while (tn && cindex < (1 << tn->bits)) {
2195 			struct node *ch = rcu_dereference(tn->child[cindex]);
2196 			if (ch) {
2197 
2198 				/* Got a child */
2199 
				if (IS_LEAF(ch)) {
2201 					cindex++;
2202 
2203 					/* stats */
2204 					if (depth > s->maxdepth)
2205 						s->maxdepth = depth;
2206 					s->totdepth += depth;
2207 					s->leaves++;
2208 				} else {
2209 					/*
2210 					 * New tnode. Decend one level
					 * New tnode. Descend one level.
					 */

					s->tnodes++;
					depth++;

					n = ch;
					tn = (struct tnode *)n;
					pend = tn->pos + tn->bits;
					s->nodesizes[tn->bits]++;	/* size of the child we just entered */

					cindex = 0;
2223 			} else {
2224 				cindex++;
2225 				s->nullpointers++;
2226 			}
2227 
2228 			/*
2229 			 * Test if we are done
2230 			 */
2231 
2232 			while (cindex >= (1 << tn->bits)) {
2233 				/*
				 * Move upwards and test for root;
				 * pop off all traversed nodes.
2236 				 */
2237 
2238 				if (NODE_PARENT(tn) == NULL) {
2239 					tn = NULL;
2240 					n = NULL;
2241 					break;
2242 				}
2243 
2244 				cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2245 				tn = NODE_PARENT(tn);
2246 				cindex++;
2247 				n = (struct node *)tn;
2248 				pend = tn->pos+tn->bits;
2249 				depth--;
			}
2251 		}
2252 	}
2253 
2254 	rcu_read_unlock();
2255 	return s;
2256 }
2257 
2258 #ifdef CONFIG_PROC_FS
2259 
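/*
 * The iterators below are stubs: both always return NULL, so the
 * seq_file sees only SEQ_START_TOKEN and the whole report is emitted by
 * a single ->show() call.
 */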
2260 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2261 {
2262 	return NULL;
2263 }
2264 
2265 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2266 {
2267 	return NULL;
2268 }
2269 
2270 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2271 {
2272 	if (!ip_fib_main_table)
2273 		return NULL;
2274 
2275 	if (*pos)
2276 		return fib_triestat_get_next(seq);
2277 	else
2278 		return SEQ_START_TOKEN;
2279 }
2280 
2281 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 {
2283 	++*pos;
2284 	if (v == SEQ_START_TOKEN)
2285 		return fib_triestat_get_first(seq);
2286 	else
2287 		return fib_triestat_get_next(seq);
2288 }
2289 
2290 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2291 {
2292 
2293 }
2294 
2295 /*
 *	This outputs /proc/net/fib_triestat
2297  *
2298  *	It always works in backward compatibility mode.
2299  *	The format of the file is not supposed to be changed.
2300  */
2301 
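/*
 * Memory accounting for the report: sizeof(struct leaf) per leaf,
 * sizeof(struct tnode) per internal node, plus one child pointer per
 * slot (1 << bits pointers per tnode), totalled and printed in kB.
 */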
2302 static void collect_and_show(struct trie *t, struct seq_file *seq)
2303 {
	int bytes = 0; /* how many bytes are used, a child pointer is sizeof(struct node *) bytes */
2305 	int i, max, pointers;
2306 	struct trie_stat *stat;
2307 	int avdepth;
2308 
2309 	stat = trie_collect_stats(t);
2310 
2311 	bytes = 0;
2312 	seq_printf(seq, "trie=%p\n", t);
2313 
2314 	if (stat) {
2315 		if (stat->leaves)
2316 			avdepth = stat->totdepth*100 / stat->leaves;
2317 		else
2318 			avdepth = 0;
2319 		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2320 		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2321 
2322 		seq_printf(seq, "Leaves: %d\n", stat->leaves);
2323 		bytes += sizeof(struct leaf) * stat->leaves;
2324 		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2325 		bytes += sizeof(struct tnode) * stat->tnodes;
2326 
2327 		max = MAX_CHILDS-1;
2328 
2329 		while (max >= 0 && stat->nodesizes[max] == 0)
2330 			max--;
2331 		pointers = 0;
2332 
2333 		for (i = 1; i <= max; i++)
2334 			if (stat->nodesizes[i] != 0) {
2335 				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2336 				pointers += (1<<i) * stat->nodesizes[i];
2337 			}
2338 		seq_printf(seq, "\n");
2339 		seq_printf(seq, "Pointers: %d\n", pointers);
2340 		bytes += sizeof(struct node *) * pointers;
2341 		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2342 		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
2343 
2344 		kfree(stat);
2345 	}
2346 
2347 #ifdef CONFIG_IP_FIB_TRIE_STATS
2348 	seq_printf(seq, "Counters:\n---------\n");
	seq_printf(seq, "gets = %d\n", t->stats.gets);
	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2355 #ifdef CLEAR_STATS
2356 	memset(&(t->stats), 0, sizeof(t->stats));
2357 #endif
2358 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2359 }
2360 
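/*
 * On SEQ_START_TOKEN print the basic node sizes followed by the
 * collected statistics for the local and main tries; with the stub
 * iterators above the else branch is never reached.
 */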
2361 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2362 {
2363 	char bf[128];
2364 
2365 	if (v == SEQ_START_TOKEN) {
2366 		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2367 			   sizeof(struct leaf), sizeof(struct tnode));
2368 		if (trie_local)
2369 			collect_and_show(trie_local, seq);
2370 
2371 		if (trie_main)
2372 			collect_and_show(trie_main, seq);
2373 	} else {
2374 		snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2375 
2376 		seq_printf(seq, "%-127s\n", bf);
2377 	}
2378 	return 0;
2379 }
2380 
2381 static struct seq_operations fib_triestat_seq_ops = {
2382 	.start = fib_triestat_seq_start,
2383 	.next  = fib_triestat_seq_next,
2384 	.stop  = fib_triestat_seq_stop,
2385 	.show  = fib_triestat_seq_show,
2386 };
2387 
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	/* no private iterator state is needed, just attach the seq_ops */
	return seq_open(file, &fib_triestat_seq_ops);
}
2403 
2404 static struct file_operations fib_triestat_seq_fops = {
2405 	.owner	= THIS_MODULE,
2406 	.open	= fib_triestat_seq_open,
2407 	.read	= seq_read,
2408 	.llseek	= seq_lseek,
2409 	.release = seq_release_private,
2410 };
2411 
2412 int __init fib_stat_proc_init(void)
2413 {
2414 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2415 		return -ENOMEM;
2416 	return 0;
2417 }
2418 
2419 void __init fib_stat_proc_exit(void)
2420 {
2421 	proc_net_remove("fib_triestat");
2422 }
2423 
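/* As above: stub iterators, the full dump happens in one ->show() call. */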
2424 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2425 {
2426 	return NULL;
2427 }
2428 
2429 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2430 {
2431 	return NULL;
2432 }
2433 
2434 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2435 {
2436 	if (!ip_fib_main_table)
2437 		return NULL;
2438 
2439 	if (*pos)
2440 		return fib_trie_get_next(seq);
2441 	else
2442 		return SEQ_START_TOKEN;
2443 }
2444 
2445 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2446 {
2447 	++*pos;
2448 	if (v == SEQ_START_TOKEN)
2449 		return fib_trie_get_first(seq);
2450 	else
2451 		return fib_trie_get_next(seq);
2452 
2453 }
2454 
2455 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2456 {
2457 }
2458 
2459 /*
2460  *	This outputs /proc/net/fib_trie.
2461  *
2462  *	It always works in backward compatibility mode.
2463  *	The format of the file is not supposed to be changed.
2464  */
2465 
2466 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2467 {
2468 	char bf[128];
2469 
2470 	if (v == SEQ_START_TOKEN) {
2471 		if (trie_local)
2472 			trie_dump_seq(seq, trie_local);
2473 
2474 		if (trie_main)
2475 			trie_dump_seq(seq, trie_main);
2476 	} else {
2477 		snprintf(bf, sizeof(bf),
2478 			 "*\t%08X\t%08X", 200, 400);
2479 		seq_printf(seq, "%-127s\n", bf);
2480 	}
2481 
2482 	return 0;
2483 }
2484 
2485 static struct seq_operations fib_trie_seq_ops = {
2486 	.start = fib_trie_seq_start,
2487 	.next  = fib_trie_seq_next,
2488 	.stop  = fib_trie_seq_stop,
2489 	.show  = fib_trie_seq_show,
2490 };
2491 
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	/* no private iterator state is needed, just attach the seq_ops */
	return seq_open(file, &fib_trie_seq_ops);
}
2507 
2508 static struct file_operations fib_trie_seq_fops = {
2509 	.owner	= THIS_MODULE,
2510 	.open	= fib_trie_seq_open,
2511 	.read	= seq_read,
2512 	.llseek	= seq_lseek,
	.release = seq_release_private,
2514 };
2515 
2516 int __init fib_proc_init(void)
2517 {
2518 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2519 		return -ENOMEM;
2520 	return 0;
2521 }
2522 
2523 void __init fib_proc_exit(void)
2524 {
2525 	proc_net_remove("fib_trie");
2526 }
2527 
2528 #endif /* CONFIG_PROC_FS */
2529