xref: /linux/net/ipv4/fib_trie.c (revision eb2bce7f5e7ac1ca6da434461217fadf3c688d2c)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work comes from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.408"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #undef CONFIG_IP_FIB_TRIE_STATS
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
89 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
90 
91 typedef unsigned int t_key;
92 
93 #define T_TNODE 0
94 #define T_LEAF  1
95 #define NODE_TYPE_MASK	0x1UL
96 #define NODE_PARENT(node) \
97 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
98 
99 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
100 
101 #define NODE_SET_PARENT(node, ptr)		\
102 	rcu_assign_pointer((node)->parent,	\
103 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
104 
105 #define IS_TNODE(n) (!(n->parent & T_LEAF))
106 #define IS_LEAF(n) (n->parent & T_LEAF)
107 
108 struct node {
109 	t_key key;
110 	unsigned long parent;
111 };
112 
113 struct leaf {
114 	t_key key;
115 	unsigned long parent;
116 	struct hlist_head list;
117 	struct rcu_head rcu;
118 };
119 
120 struct leaf_info {
121 	struct hlist_node hlist;
122 	struct rcu_head rcu;
123 	int plen;
124 	struct list_head falh;
125 };
126 
127 struct tnode {
128 	t_key key;
129 	unsigned long parent;
130 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
131 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
132 	unsigned short full_children;	/* KEYLENGTH bits needed */
133 	unsigned short empty_children;	/* KEYLENGTH bits needed */
134 	struct rcu_head rcu;
135 	struct node *child[0];
136 };
137 
138 #ifdef CONFIG_IP_FIB_TRIE_STATS
139 struct trie_use_stats {
140 	unsigned int gets;
141 	unsigned int backtrack;
142 	unsigned int semantic_match_passed;
143 	unsigned int semantic_match_miss;
144 	unsigned int null_node_hit;
145 	unsigned int resize_node_skipped;
146 };
147 #endif
148 
149 struct trie_stat {
150 	unsigned int totdepth;
151 	unsigned int maxdepth;
152 	unsigned int tnodes;
153 	unsigned int leaves;
154 	unsigned int nullpointers;
155 	unsigned int nodesizes[MAX_STAT_DEPTH];
156 };
157 
158 struct trie {
159 	struct node *trie;
160 #ifdef CONFIG_IP_FIB_TRIE_STATS
161 	struct trie_use_stats stats;
162 #endif
163 	int size;
164 	unsigned int revision;
165 };
166 
167 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
168 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
169 static struct node *resize(struct trie *t, struct tnode *tn);
170 static struct tnode *inflate(struct trie *t, struct tnode *tn);
171 static struct tnode *halve(struct trie *t, struct tnode *tn);
172 static void tnode_free(struct tnode *tn);
173 
174 static struct kmem_cache *fn_alias_kmem __read_mostly;
175 static struct trie *trie_local = NULL, *trie_main = NULL;
176 
177 
178 /* rcu_read_lock needs to be hold by caller from readside */
179 
180 static inline struct node *tnode_get_child(struct tnode *tn, int i)
181 {
182 	BUG_ON(i >= 1 << tn->bits);
183 
184 	return rcu_dereference(tn->child[i]);
185 }
186 
187 static inline int tnode_child_length(const struct tnode *tn)
188 {
189 	return 1 << tn->bits;
190 }
191 
192 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
193 {
194 	if (offset < KEYLENGTH)
195 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
196 	else
197 		return 0;
198 }
199 
200 static inline int tkey_equals(t_key a, t_key b)
201 {
202 	return a == b;
203 }
204 
205 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
206 {
207 	if (bits == 0 || offset >= KEYLENGTH)
208 		return 1;
209 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
210 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
211 }
212 
213 static inline int tkey_mismatch(t_key a, int offset, t_key b)
214 {
215 	t_key diff = a ^ b;
216 	int i = offset;
217 
218 	if (!diff)
219 		return 0;
220 	while ((diff << i) >> (KEYLENGTH-1) == 0)
221 		i++;
222 	return i;
223 }
224 
225 /*
226   To understand this stuff, an understanding of keys and all their bits is
227   necessary. Every node in the trie has a key associated with it, but not
228   all of the bits in that key are significant.
229 
230   Consider a node 'n' and its parent 'tp'.
231 
232   If n is a leaf, every bit in its key is significant. Its presence is
233   necessitated by path compression, since during a tree traversal (when
234   searching for a leaf - unless we are doing an insertion) we will completely
235   ignore all skipped bits we encounter. Thus we need to verify, at the end of
236   a potentially successful search, that we have indeed been walking the
237   correct key path.
238 
239   Note that we can never "miss" the correct key in the tree if present by
240   following the wrong path. Path compression ensures that segments of the key
241   that are the same for all keys with a given prefix are skipped, but the
242   skipped part *is* identical for each node in the subtrie below the skipped
243   bit! trie_insert() in this implementation takes care of that - note the
244   call to tkey_sub_equals() in trie_insert().
245 
246   if n is an internal node - a 'tnode' here, the various parts of its key
247   have many different meanings.
248 
249   Example:
250   _________________________________________________________________
251   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252   -----------------------------------------------------------------
253     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
254 
255   _________________________________________________________________
256   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257   -----------------------------------------------------------------
258    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
259 
260   tp->pos = 7
261   tp->bits = 3
262   n->pos = 15
263   n->bits = 4
264 
265   First, let's just ignore the bits that come before the parent tp, that is
266   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267   not use them for anything.
268 
269   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270   index into the parent's child array. That is, they will be used to find
271   'n' among tp's children.
272 
273   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274   for the node n.
275 
276   All the bits we have seen so far are significant to the node n. The rest
277   of the bits are really not needed or indeed known in n->key.
278 
279   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280   n's child array, and will of course be different for each child.
281 
282 
283   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284   at this point.
285 
286 */
287 
288 static inline void check_tnode(const struct tnode *tn)
289 {
290 	WARN_ON(tn && tn->pos+tn->bits > 32);
291 }
292 
293 static int halve_threshold = 25;
294 static int inflate_threshold = 50;
295 static int halve_threshold_root = 8;
296 static int inflate_threshold_root = 15;
297 
298 
299 static void __alias_free_mem(struct rcu_head *head)
300 {
301 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
302 	kmem_cache_free(fn_alias_kmem, fa);
303 }
304 
305 static inline void alias_free_mem_rcu(struct fib_alias *fa)
306 {
307 	call_rcu(&fa->rcu, __alias_free_mem);
308 }
309 
310 static void __leaf_free_rcu(struct rcu_head *head)
311 {
312 	kfree(container_of(head, struct leaf, rcu));
313 }
314 
315 static void __leaf_info_free_rcu(struct rcu_head *head)
316 {
317 	kfree(container_of(head, struct leaf_info, rcu));
318 }
319 
320 static inline void free_leaf_info(struct leaf_info *leaf)
321 {
322 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
323 }
324 
325 static struct tnode *tnode_alloc(unsigned int size)
326 {
327 	struct page *pages;
328 
329 	if (size <= PAGE_SIZE)
330 		return kcalloc(size, 1, GFP_KERNEL);
331 
332 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
333 	if (!pages)
334 		return NULL;
335 
336 	return page_address(pages);
337 }
338 
339 static void __tnode_free_rcu(struct rcu_head *head)
340 {
341 	struct tnode *tn = container_of(head, struct tnode, rcu);
342 	unsigned int size = sizeof(struct tnode) +
343 		(1 << tn->bits) * sizeof(struct node *);
344 
345 	if (size <= PAGE_SIZE)
346 		kfree(tn);
347 	else
348 		free_pages((unsigned long)tn, get_order(size));
349 }
350 
351 static inline void tnode_free(struct tnode *tn)
352 {
353 	if (IS_LEAF(tn)) {
354 		struct leaf *l = (struct leaf *) tn;
355 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
356 	} else
357 		call_rcu(&tn->rcu, __tnode_free_rcu);
358 }
359 
360 static struct leaf *leaf_new(void)
361 {
362 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
363 	if (l) {
364 		l->parent = T_LEAF;
365 		INIT_HLIST_HEAD(&l->list);
366 	}
367 	return l;
368 }
369 
370 static struct leaf_info *leaf_info_new(int plen)
371 {
372 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
373 	if (li) {
374 		li->plen = plen;
375 		INIT_LIST_HEAD(&li->falh);
376 	}
377 	return li;
378 }
379 
380 static struct tnode* tnode_new(t_key key, int pos, int bits)
381 {
382 	int nchildren = 1<<bits;
383 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
384 	struct tnode *tn = tnode_alloc(sz);
385 
386 	if (tn) {
387 		memset(tn, 0, sz);
388 		tn->parent = T_TNODE;
389 		tn->pos = pos;
390 		tn->bits = bits;
391 		tn->key = key;
392 		tn->full_children = 0;
393 		tn->empty_children = 1<<bits;
394 	}
395 
396 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
397 		 (unsigned int) (sizeof(struct node) * 1<<bits));
398 	return tn;
399 }
400 
401 /*
402  * Check whether a tnode 'n' is "full", i.e. it is an internal node
403  * and no bits are skipped. See discussion in dyntree paper p. 6
404  */
405 
406 static inline int tnode_full(const struct tnode *tn, const struct node *n)
407 {
408 	if (n == NULL || IS_LEAF(n))
409 		return 0;
410 
411 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
412 }
413 
414 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
415 {
416 	tnode_put_child_reorg(tn, i, n, -1);
417 }
418 
419  /*
420   * Add a child at position i overwriting the old value.
421   * Update the value of full_children and empty_children.
422   */
423 
424 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
425 {
426 	struct node *chi = tn->child[i];
427 	int isfull;
428 
429 	BUG_ON(i >= 1<<tn->bits);
430 
431 
432 	/* update emptyChildren */
433 	if (n == NULL && chi != NULL)
434 		tn->empty_children++;
435 	else if (n != NULL && chi == NULL)
436 		tn->empty_children--;
437 
438 	/* update fullChildren */
439 	if (wasfull == -1)
440 		wasfull = tnode_full(tn, chi);
441 
442 	isfull = tnode_full(tn, n);
443 	if (wasfull && !isfull)
444 		tn->full_children--;
445 	else if (!wasfull && isfull)
446 		tn->full_children++;
447 
448 	if (n)
449 		NODE_SET_PARENT(n, tn);
450 
451 	rcu_assign_pointer(tn->child[i], n);
452 }
453 
454 static struct node *resize(struct trie *t, struct tnode *tn)
455 {
456 	int i;
457 	int err = 0;
458 	struct tnode *old_tn;
459 	int inflate_threshold_use;
460 	int halve_threshold_use;
461 	int max_resize;
462 
463 	if (!tn)
464 		return NULL;
465 
466 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 		 tn, inflate_threshold, halve_threshold);
468 
469 	/* No children */
470 	if (tn->empty_children == tnode_child_length(tn)) {
471 		tnode_free(tn);
472 		return NULL;
473 	}
474 	/* One child */
475 	if (tn->empty_children == tnode_child_length(tn) - 1)
476 		for (i = 0; i < tnode_child_length(tn); i++) {
477 			struct node *n;
478 
479 			n = tn->child[i];
480 			if (!n)
481 				continue;
482 
483 			/* compress one level */
484 			NODE_SET_PARENT(n, NULL);
485 			tnode_free(tn);
486 			return n;
487 		}
488 	/*
489 	 * Double as long as the resulting node has a number of
490 	 * nonempty nodes that are above the threshold.
491 	 */
492 
493 	/*
494 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
495 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
496 	 * Telecommunications, page 6:
497 	 * "A node is doubled if the ratio of non-empty children to all
498 	 * children in the *doubled* node is at least 'high'."
499 	 *
500 	 * 'high' in this instance is the variable 'inflate_threshold'. It
501 	 * is expressed as a percentage, so we multiply it with
502 	 * tnode_child_length() and instead of multiplying by 2 (since the
503 	 * child array will be doubled by inflate()) and multiplying
504 	 * the left-hand side by 100 (to handle the percentage thing) we
505 	 * multiply the left-hand side by 50.
506 	 *
507 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
508 	 * - tn->empty_children is of course the number of non-null children
509 	 * in the current node. tn->full_children is the number of "full"
510 	 * children, that is non-null tnodes with a skip value of 0.
511 	 * All of those will be doubled in the resulting inflated tnode, so
512 	 * we just count them one extra time here.
513 	 *
514 	 * A clearer way to write this would be:
515 	 *
516 	 * to_be_doubled = tn->full_children;
517 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
518 	 *     tn->full_children;
519 	 *
520 	 * new_child_length = tnode_child_length(tn) * 2;
521 	 *
522 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
523 	 *      new_child_length;
524 	 * if (new_fill_factor >= inflate_threshold)
525 	 *
526 	 * ...and so on, tho it would mess up the while () loop.
527 	 *
528 	 * anyway,
529 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
530 	 *      inflate_threshold
531 	 *
532 	 * avoid a division:
533 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
534 	 *      inflate_threshold * new_child_length
535 	 *
536 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
537 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
538 	 *    tn->full_children) >= inflate_threshold * new_child_length
539 	 *
540 	 * expand new_child_length:
541 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 	 *    tn->full_children) >=
543 	 *      inflate_threshold * tnode_child_length(tn) * 2
544 	 *
545 	 * shorten again:
546 	 * 50 * (tn->full_children + tnode_child_length(tn) -
547 	 *    tn->empty_children) >= inflate_threshold *
548 	 *    tnode_child_length(tn)
549 	 *
550 	 */
551 
552 	check_tnode(tn);
553 
554 	/* Keep root node larger  */
555 
556 	if (!tn->parent)
557 		inflate_threshold_use = inflate_threshold_root;
558 	else
559 		inflate_threshold_use = inflate_threshold;
560 
561 	err = 0;
562 	max_resize = 10;
563 	while ((tn->full_children > 0 &&  max_resize-- &&
564 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
565 				inflate_threshold_use * tnode_child_length(tn))) {
566 
567 		old_tn = tn;
568 		tn = inflate(t, tn);
569 		if (IS_ERR(tn)) {
570 			tn = old_tn;
571 #ifdef CONFIG_IP_FIB_TRIE_STATS
572 			t->stats.resize_node_skipped++;
573 #endif
574 			break;
575 		}
576 	}
577 
578 	if (max_resize < 0) {
579 		if (!tn->parent)
580 			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
581 			       inflate_threshold_root, tn->bits);
582 		else
583 			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
584 			       inflate_threshold, tn->bits);
585 	}
586 
587 	check_tnode(tn);
588 
589 	/*
590 	 * Halve as long as the number of empty children in this
591 	 * node is above threshold.
592 	 */
593 
594 
595 	/* Keep root node larger  */
596 
597 	if (!tn->parent)
598 		halve_threshold_use = halve_threshold_root;
599 	else
600 		halve_threshold_use = halve_threshold;
601 
602 	err = 0;
603 	max_resize = 10;
604 	while (tn->bits > 1 &&  max_resize-- &&
605 	       100 * (tnode_child_length(tn) - tn->empty_children) <
606 	       halve_threshold_use * tnode_child_length(tn)) {
607 
608 		old_tn = tn;
609 		tn = halve(t, tn);
610 		if (IS_ERR(tn)) {
611 			tn = old_tn;
612 #ifdef CONFIG_IP_FIB_TRIE_STATS
613 			t->stats.resize_node_skipped++;
614 #endif
615 			break;
616 		}
617 	}
618 
619 	if (max_resize < 0) {
620 		if (!tn->parent)
621 			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
622 			       halve_threshold_root, tn->bits);
623 		else
624 			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
625 			       halve_threshold, tn->bits);
626 	}
627 
628 	/* Only one child remains */
629 	if (tn->empty_children == tnode_child_length(tn) - 1)
630 		for (i = 0; i < tnode_child_length(tn); i++) {
631 			struct node *n;
632 
633 			n = tn->child[i];
634 			if (!n)
635 				continue;
636 
637 			/* compress one level */
638 
639 			NODE_SET_PARENT(n, NULL);
640 			tnode_free(tn);
641 			return n;
642 		}
643 
644 	return (struct node *) tn;
645 }
646 
647 static struct tnode *inflate(struct trie *t, struct tnode *tn)
648 {
649 	struct tnode *inode;
650 	struct tnode *oldtnode = tn;
651 	int olen = tnode_child_length(tn);
652 	int i;
653 
654 	pr_debug("In inflate\n");
655 
656 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657 
658 	if (!tn)
659 		return ERR_PTR(-ENOMEM);
660 
661 	/*
662 	 * Preallocate and store tnodes before the actual work so we
663 	 * don't get into an inconsistent state if memory allocation
664 	 * fails. In case of failure we return the oldnode and  inflate
665 	 * of tnode is ignored.
666 	 */
667 
668 	for (i = 0; i < olen; i++) {
669 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670 
671 		if (inode &&
672 		    IS_TNODE(inode) &&
673 		    inode->pos == oldtnode->pos + oldtnode->bits &&
674 		    inode->bits > 1) {
675 			struct tnode *left, *right;
676 			t_key m = TKEY_GET_MASK(inode->pos, 1);
677 
678 			left = tnode_new(inode->key&(~m), inode->pos + 1,
679 					 inode->bits - 1);
680 			if (!left)
681 				goto nomem;
682 
683 			right = tnode_new(inode->key|m, inode->pos + 1,
684 					  inode->bits - 1);
685 
686 			if (!right) {
687 				tnode_free(left);
688 				goto nomem;
689 			}
690 
691 			put_child(t, tn, 2*i, (struct node *) left);
692 			put_child(t, tn, 2*i+1, (struct node *) right);
693 		}
694 	}
695 
696 	for (i = 0; i < olen; i++) {
697 		struct node *node = tnode_get_child(oldtnode, i);
698 		struct tnode *left, *right;
699 		int size, j;
700 
701 		/* An empty child */
702 		if (node == NULL)
703 			continue;
704 
705 		/* A leaf or an internal node with skipped bits */
706 
707 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
708 		   tn->pos + tn->bits - 1) {
709 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
710 					     1) == 0)
711 				put_child(t, tn, 2*i, node);
712 			else
713 				put_child(t, tn, 2*i+1, node);
714 			continue;
715 		}
716 
717 		/* An internal node with two children */
718 		inode = (struct tnode *) node;
719 
720 		if (inode->bits == 1) {
721 			put_child(t, tn, 2*i, inode->child[0]);
722 			put_child(t, tn, 2*i+1, inode->child[1]);
723 
724 			tnode_free(inode);
725 			continue;
726 		}
727 
728 		/* An internal node with more than two children */
729 
730 		/* We will replace this node 'inode' with two new
731 		 * ones, 'left' and 'right', each with half of the
732 		 * original children. The two new nodes will have
733 		 * a position one bit further down the key and this
734 		 * means that the "significant" part of their keys
735 		 * (see the discussion near the top of this file)
736 		 * will differ by one bit, which will be "0" in
737 		 * left's key and "1" in right's key. Since we are
738 		 * moving the key position by one step, the bit that
739 		 * we are moving away from - the bit at position
740 		 * (inode->pos) - is the one that will differ between
741 		 * left and right. So... we synthesize that bit in the
742 		 * two  new keys.
743 		 * The mask 'm' below will be a single "one" bit at
744 		 * the position (inode->pos)
745 		 */
746 
747 		/* Use the old key, but set the new significant
748 		 *   bit to zero.
749 		 */
750 
751 		left = (struct tnode *) tnode_get_child(tn, 2*i);
752 		put_child(t, tn, 2*i, NULL);
753 
754 		BUG_ON(!left);
755 
756 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
757 		put_child(t, tn, 2*i+1, NULL);
758 
759 		BUG_ON(!right);
760 
761 		size = tnode_child_length(left);
762 		for (j = 0; j < size; j++) {
763 			put_child(t, left, j, inode->child[j]);
764 			put_child(t, right, j, inode->child[j + size]);
765 		}
766 		put_child(t, tn, 2*i, resize(t, left));
767 		put_child(t, tn, 2*i+1, resize(t, right));
768 
769 		tnode_free(inode);
770 	}
771 	tnode_free(oldtnode);
772 	return tn;
773 nomem:
774 	{
775 		int size = tnode_child_length(tn);
776 		int j;
777 
778 		for (j = 0; j < size; j++)
779 			if (tn->child[j])
780 				tnode_free((struct tnode *)tn->child[j]);
781 
782 		tnode_free(tn);
783 
784 		return ERR_PTR(-ENOMEM);
785 	}
786 }
787 
788 static struct tnode *halve(struct trie *t, struct tnode *tn)
789 {
790 	struct tnode *oldtnode = tn;
791 	struct node *left, *right;
792 	int i;
793 	int olen = tnode_child_length(tn);
794 
795 	pr_debug("In halve\n");
796 
797 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
798 
799 	if (!tn)
800 		return ERR_PTR(-ENOMEM);
801 
802 	/*
803 	 * Preallocate and store tnodes before the actual work so we
804 	 * don't get into an inconsistent state if memory allocation
805 	 * fails. In case of failure we return the oldnode and halve
806 	 * of tnode is ignored.
807 	 */
808 
809 	for (i = 0; i < olen; i += 2) {
810 		left = tnode_get_child(oldtnode, i);
811 		right = tnode_get_child(oldtnode, i+1);
812 
813 		/* Two nonempty children */
814 		if (left && right) {
815 			struct tnode *newn;
816 
817 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
818 
819 			if (!newn)
820 				goto nomem;
821 
822 			put_child(t, tn, i/2, (struct node *)newn);
823 		}
824 
825 	}
826 
827 	for (i = 0; i < olen; i += 2) {
828 		struct tnode *newBinNode;
829 
830 		left = tnode_get_child(oldtnode, i);
831 		right = tnode_get_child(oldtnode, i+1);
832 
833 		/* At least one of the children is empty */
834 		if (left == NULL) {
835 			if (right == NULL)    /* Both are empty */
836 				continue;
837 			put_child(t, tn, i/2, right);
838 			continue;
839 		}
840 
841 		if (right == NULL) {
842 			put_child(t, tn, i/2, left);
843 			continue;
844 		}
845 
846 		/* Two nonempty children */
847 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
848 		put_child(t, tn, i/2, NULL);
849 		put_child(t, newBinNode, 0, left);
850 		put_child(t, newBinNode, 1, right);
851 		put_child(t, tn, i/2, resize(t, newBinNode));
852 	}
853 	tnode_free(oldtnode);
854 	return tn;
855 nomem:
856 	{
857 		int size = tnode_child_length(tn);
858 		int j;
859 
860 		for (j = 0; j < size; j++)
861 			if (tn->child[j])
862 				tnode_free((struct tnode *)tn->child[j]);
863 
864 		tnode_free(tn);
865 
866 		return ERR_PTR(-ENOMEM);
867 	}
868 }
869 
870 static void trie_init(struct trie *t)
871 {
872 	if (!t)
873 		return;
874 
875 	t->size = 0;
876 	rcu_assign_pointer(t->trie, NULL);
877 	t->revision = 0;
878 #ifdef CONFIG_IP_FIB_TRIE_STATS
879 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
880 #endif
881 }
882 
883 /* readside must use rcu_read_lock currently dump routines
884  via get_fa_head and dump */
885 
886 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
887 {
888 	struct hlist_head *head = &l->list;
889 	struct hlist_node *node;
890 	struct leaf_info *li;
891 
892 	hlist_for_each_entry_rcu(li, node, head, hlist)
893 		if (li->plen == plen)
894 			return li;
895 
896 	return NULL;
897 }
898 
899 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
900 {
901 	struct leaf_info *li = find_leaf_info(l, plen);
902 
903 	if (!li)
904 		return NULL;
905 
906 	return &li->falh;
907 }
908 
909 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
910 {
911 	struct leaf_info *li = NULL, *last = NULL;
912 	struct hlist_node *node;
913 
914 	if (hlist_empty(head)) {
915 		hlist_add_head_rcu(&new->hlist, head);
916 	} else {
917 		hlist_for_each_entry(li, node, head, hlist) {
918 			if (new->plen > li->plen)
919 				break;
920 
921 			last = li;
922 		}
923 		if (last)
924 			hlist_add_after_rcu(&last->hlist, &new->hlist);
925 		else
926 			hlist_add_before_rcu(&new->hlist, &li->hlist);
927 	}
928 }
929 
930 /* rcu_read_lock needs to be hold by caller from readside */
931 
932 static struct leaf *
933 fib_find_node(struct trie *t, u32 key)
934 {
935 	int pos;
936 	struct tnode *tn;
937 	struct node *n;
938 
939 	pos = 0;
940 	n = rcu_dereference(t->trie);
941 
942 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
943 		tn = (struct tnode *) n;
944 
945 		check_tnode(tn);
946 
947 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
948 			pos = tn->pos + tn->bits;
949 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
950 		} else
951 			break;
952 	}
953 	/* Case we have found a leaf. Compare prefixes */
954 
955 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
956 		return (struct leaf *)n;
957 
958 	return NULL;
959 }
960 
961 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
962 {
963 	int wasfull;
964 	t_key cindex, key;
965 	struct tnode *tp = NULL;
966 
967 	key = tn->key;
968 
969 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
970 
971 		tp = NODE_PARENT(tn);
972 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
973 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
974 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
975 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
976 
977 		if (!NODE_PARENT(tn))
978 			break;
979 
980 		tn = NODE_PARENT(tn);
981 	}
982 	/* Handle last (top) tnode */
983 	if (IS_TNODE(tn))
984 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
985 
986 	return (struct node*) tn;
987 }
988 
989 /* only used from updater-side */
990 
991 static  struct list_head *
992 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
993 {
994 	int pos, newpos;
995 	struct tnode *tp = NULL, *tn = NULL;
996 	struct node *n;
997 	struct leaf *l;
998 	int missbit;
999 	struct list_head *fa_head = NULL;
1000 	struct leaf_info *li;
1001 	t_key cindex;
1002 
1003 	pos = 0;
1004 	n = t->trie;
1005 
1006 	/* If we point to NULL, stop. Either the tree is empty and we should
1007 	 * just put a new leaf in if, or we have reached an empty child slot,
1008 	 * and we should just put our new leaf in that.
1009 	 * If we point to a T_TNODE, check if it matches our key. Note that
1010 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1011 	 * not be the parent's 'pos'+'bits'!
1012 	 *
1013 	 * If it does match the current key, get pos/bits from it, extract
1014 	 * the index from our key, push the T_TNODE and walk the tree.
1015 	 *
1016 	 * If it doesn't, we have to replace it with a new T_TNODE.
1017 	 *
1018 	 * If we point to a T_LEAF, it might or might not have the same key
1019 	 * as we do. If it does, just change the value, update the T_LEAF's
1020 	 * value, and return it.
1021 	 * If it doesn't, we need to replace it with a T_TNODE.
1022 	 */
1023 
1024 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1025 		tn = (struct tnode *) n;
1026 
1027 		check_tnode(tn);
1028 
1029 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1030 			tp = tn;
1031 			pos = tn->pos + tn->bits;
1032 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1033 
1034 			BUG_ON(n && NODE_PARENT(n) != tn);
1035 		} else
1036 			break;
1037 	}
1038 
1039 	/*
1040 	 * n  ----> NULL, LEAF or TNODE
1041 	 *
1042 	 * tp is n's (parent) ----> NULL or TNODE
1043 	 */
1044 
1045 	BUG_ON(tp && IS_LEAF(tp));
1046 
1047 	/* Case 1: n is a leaf. Compare prefixes */
1048 
1049 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1050 		struct leaf *l = (struct leaf *) n;
1051 
1052 		li = leaf_info_new(plen);
1053 
1054 		if (!li) {
1055 			*err = -ENOMEM;
1056 			goto err;
1057 		}
1058 
1059 		fa_head = &li->falh;
1060 		insert_leaf_info(&l->list, li);
1061 		goto done;
1062 	}
1063 	t->size++;
1064 	l = leaf_new();
1065 
1066 	if (!l) {
1067 		*err = -ENOMEM;
1068 		goto err;
1069 	}
1070 
1071 	l->key = key;
1072 	li = leaf_info_new(plen);
1073 
1074 	if (!li) {
1075 		tnode_free((struct tnode *) l);
1076 		*err = -ENOMEM;
1077 		goto err;
1078 	}
1079 
1080 	fa_head = &li->falh;
1081 	insert_leaf_info(&l->list, li);
1082 
1083 	if (t->trie && n == NULL) {
1084 		/* Case 2: n is NULL, and will just insert a new leaf */
1085 
1086 		NODE_SET_PARENT(l, tp);
1087 
1088 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1089 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1090 	} else {
1091 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1092 		/*
1093 		 *  Add a new tnode here
1094 		 *  first tnode need some special handling
1095 		 */
1096 
1097 		if (tp)
1098 			pos = tp->pos+tp->bits;
1099 		else
1100 			pos = 0;
1101 
1102 		if (n) {
1103 			newpos = tkey_mismatch(key, pos, n->key);
1104 			tn = tnode_new(n->key, newpos, 1);
1105 		} else {
1106 			newpos = 0;
1107 			tn = tnode_new(key, newpos, 1); /* First tnode */
1108 		}
1109 
1110 		if (!tn) {
1111 			free_leaf_info(li);
1112 			tnode_free((struct tnode *) l);
1113 			*err = -ENOMEM;
1114 			goto err;
1115 		}
1116 
1117 		NODE_SET_PARENT(tn, tp);
1118 
1119 		missbit = tkey_extract_bits(key, newpos, 1);
1120 		put_child(t, tn, missbit, (struct node *)l);
1121 		put_child(t, tn, 1-missbit, n);
1122 
1123 		if (tp) {
1124 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1126 		} else {
1127 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1128 			tp = tn;
1129 		}
1130 	}
1131 
1132 	if (tp && tp->pos + tp->bits > 32)
1133 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1134 		       tp, tp->pos, tp->bits, key, plen);
1135 
1136 	/* Rebalance the trie */
1137 
1138 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1139 done:
1140 	t->revision++;
1141 err:
1142 	return fa_head;
1143 }
1144 
1145 /*
1146  * Caller must hold RTNL.
1147  */
1148 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1149 {
1150 	struct trie *t = (struct trie *) tb->tb_data;
1151 	struct fib_alias *fa, *new_fa;
1152 	struct list_head *fa_head = NULL;
1153 	struct fib_info *fi;
1154 	int plen = cfg->fc_dst_len;
1155 	u8 tos = cfg->fc_tos;
1156 	u32 key, mask;
1157 	int err;
1158 	struct leaf *l;
1159 
1160 	if (plen > 32)
1161 		return -EINVAL;
1162 
1163 	key = ntohl(cfg->fc_dst);
1164 
1165 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1166 
1167 	mask = ntohl(inet_make_mask(plen));
1168 
1169 	if (key & ~mask)
1170 		return -EINVAL;
1171 
1172 	key = key & mask;
1173 
1174 	fi = fib_create_info(cfg);
1175 	if (IS_ERR(fi)) {
1176 		err = PTR_ERR(fi);
1177 		goto err;
1178 	}
1179 
1180 	l = fib_find_node(t, key);
1181 	fa = NULL;
1182 
1183 	if (l) {
1184 		fa_head = get_fa_head(l, plen);
1185 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1186 	}
1187 
1188 	/* Now fa, if non-NULL, points to the first fib alias
1189 	 * with the same keys [prefix,tos,priority], if such key already
1190 	 * exists or to the node before which we will insert new one.
1191 	 *
1192 	 * If fa is NULL, we will need to allocate a new one and
1193 	 * insert to the head of f.
1194 	 *
1195 	 * If f is NULL, no fib node matched the destination key
1196 	 * and we need to allocate a new one of those as well.
1197 	 */
1198 
1199 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1200 		struct fib_alias *fa_orig;
1201 
1202 		err = -EEXIST;
1203 		if (cfg->fc_nlflags & NLM_F_EXCL)
1204 			goto out;
1205 
1206 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1207 			struct fib_info *fi_drop;
1208 			u8 state;
1209 
1210 			err = -ENOBUFS;
1211 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1212 			if (new_fa == NULL)
1213 				goto out;
1214 
1215 			fi_drop = fa->fa_info;
1216 			new_fa->fa_tos = fa->fa_tos;
1217 			new_fa->fa_info = fi;
1218 			new_fa->fa_type = cfg->fc_type;
1219 			new_fa->fa_scope = cfg->fc_scope;
1220 			state = fa->fa_state;
1221 			new_fa->fa_state &= ~FA_S_ACCESSED;
1222 
1223 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1224 			alias_free_mem_rcu(fa);
1225 
1226 			fib_release_info(fi_drop);
1227 			if (state & FA_S_ACCESSED)
1228 				rt_cache_flush(-1);
1229 
1230 			goto succeeded;
1231 		}
1232 		/* Error if we find a perfect match which
1233 		 * uses the same scope, type, and nexthop
1234 		 * information.
1235 		 */
1236 		fa_orig = fa;
1237 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1238 			if (fa->fa_tos != tos)
1239 				break;
1240 			if (fa->fa_info->fib_priority != fi->fib_priority)
1241 				break;
1242 			if (fa->fa_type == cfg->fc_type &&
1243 			    fa->fa_scope == cfg->fc_scope &&
1244 			    fa->fa_info == fi) {
1245 				goto out;
1246 			}
1247 		}
1248 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1249 			fa = fa_orig;
1250 	}
1251 	err = -ENOENT;
1252 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1253 		goto out;
1254 
1255 	err = -ENOBUFS;
1256 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1257 	if (new_fa == NULL)
1258 		goto out;
1259 
1260 	new_fa->fa_info = fi;
1261 	new_fa->fa_tos = tos;
1262 	new_fa->fa_type = cfg->fc_type;
1263 	new_fa->fa_scope = cfg->fc_scope;
1264 	new_fa->fa_state = 0;
1265 	/*
1266 	 * Insert new entry to the list.
1267 	 */
1268 
1269 	if (!fa_head) {
1270 		err = 0;
1271 		fa_head = fib_insert_node(t, &err, key, plen);
1272 		if (err)
1273 			goto out_free_new_fa;
1274 	}
1275 
1276 	list_add_tail_rcu(&new_fa->fa_list,
1277 			  (fa ? &fa->fa_list : fa_head));
1278 
1279 	rt_cache_flush(-1);
1280 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1281 		  &cfg->fc_nlinfo);
1282 succeeded:
1283 	return 0;
1284 
1285 out_free_new_fa:
1286 	kmem_cache_free(fn_alias_kmem, new_fa);
1287 out:
1288 	fib_release_info(fi);
1289 err:
1290 	return err;
1291 }
1292 
1293 
1294 /* should be called with rcu_read_lock */
1295 static inline int check_leaf(struct trie *t, struct leaf *l,
1296 			     t_key key, int *plen, const struct flowi *flp,
1297 			     struct fib_result *res)
1298 {
1299 	int err, i;
1300 	__be32 mask;
1301 	struct leaf_info *li;
1302 	struct hlist_head *hhead = &l->list;
1303 	struct hlist_node *node;
1304 
1305 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1306 		i = li->plen;
1307 		mask = inet_make_mask(i);
1308 		if (l->key != (key & ntohl(mask)))
1309 			continue;
1310 
1311 		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1312 			*plen = i;
1313 #ifdef CONFIG_IP_FIB_TRIE_STATS
1314 			t->stats.semantic_match_passed++;
1315 #endif
1316 			return err;
1317 		}
1318 #ifdef CONFIG_IP_FIB_TRIE_STATS
1319 		t->stats.semantic_match_miss++;
1320 #endif
1321 	}
1322 	return 1;
1323 }
1324 
1325 static int
1326 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1327 {
1328 	struct trie *t = (struct trie *) tb->tb_data;
1329 	int plen, ret = 0;
1330 	struct node *n;
1331 	struct tnode *pn;
1332 	int pos, bits;
1333 	t_key key = ntohl(flp->fl4_dst);
1334 	int chopped_off;
1335 	t_key cindex = 0;
1336 	int current_prefix_length = KEYLENGTH;
1337 	struct tnode *cn;
1338 	t_key node_prefix, key_prefix, pref_mismatch;
1339 	int mp;
1340 
1341 	rcu_read_lock();
1342 
1343 	n = rcu_dereference(t->trie);
1344 	if (!n)
1345 		goto failed;
1346 
1347 #ifdef CONFIG_IP_FIB_TRIE_STATS
1348 	t->stats.gets++;
1349 #endif
1350 
1351 	/* Just a leaf? */
1352 	if (IS_LEAF(n)) {
1353 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1354 			goto found;
1355 		goto failed;
1356 	}
1357 	pn = (struct tnode *) n;
1358 	chopped_off = 0;
1359 
1360 	while (pn) {
1361 		pos = pn->pos;
1362 		bits = pn->bits;
1363 
1364 		if (!chopped_off)
1365 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1366 
1367 		n = tnode_get_child(pn, cindex);
1368 
1369 		if (n == NULL) {
1370 #ifdef CONFIG_IP_FIB_TRIE_STATS
1371 			t->stats.null_node_hit++;
1372 #endif
1373 			goto backtrace;
1374 		}
1375 
1376 		if (IS_LEAF(n)) {
1377 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1378 				goto found;
1379 			else
1380 				goto backtrace;
1381 		}
1382 
1383 #define HL_OPTIMIZE
1384 #ifdef HL_OPTIMIZE
1385 		cn = (struct tnode *)n;
1386 
1387 		/*
1388 		 * It's a tnode, and we can do some extra checks here if we
1389 		 * like, to avoid descending into a dead-end branch.
1390 		 * This tnode is in the parent's child array at index
1391 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1392 		 * chopped off, so in reality the index may be just a
1393 		 * subprefix, padded with zero at the end.
1394 		 * We can also take a look at any skipped bits in this
1395 		 * tnode - everything up to p_pos is supposed to be ok,
1396 		 * and the non-chopped bits of the index (se previous
1397 		 * paragraph) are also guaranteed ok, but the rest is
1398 		 * considered unknown.
1399 		 *
1400 		 * The skipped bits are key[pos+bits..cn->pos].
1401 		 */
1402 
1403 		/* If current_prefix_length < pos+bits, we are already doing
1404 		 * actual prefix  matching, which means everything from
1405 		 * pos+(bits-chopped_off) onward must be zero along some
1406 		 * branch of this subtree - otherwise there is *no* valid
1407 		 * prefix present. Here we can only check the skipped
1408 		 * bits. Remember, since we have already indexed into the
1409 		 * parent's child array, we know that the bits we chopped of
1410 		 * *are* zero.
1411 		 */
1412 
1413 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1414 
1415 		if (current_prefix_length < pos+bits) {
1416 			if (tkey_extract_bits(cn->key, current_prefix_length,
1417 						cn->pos - current_prefix_length) != 0 ||
1418 			    !(cn->child[0]))
1419 				goto backtrace;
1420 		}
1421 
1422 		/*
1423 		 * If chopped_off=0, the index is fully validated and we
1424 		 * only need to look at the skipped bits for this, the new,
1425 		 * tnode. What we actually want to do is to find out if
1426 		 * these skipped bits match our key perfectly, or if we will
1427 		 * have to count on finding a matching prefix further down,
1428 		 * because if we do, we would like to have some way of
1429 		 * verifying the existence of such a prefix at this point.
1430 		 */
1431 
1432 		/* The only thing we can do at this point is to verify that
1433 		 * any such matching prefix can indeed be a prefix to our
1434 		 * key, and if the bits in the node we are inspecting that
1435 		 * do not match our key are not ZERO, this cannot be true.
1436 		 * Thus, find out where there is a mismatch (before cn->pos)
1437 		 * and verify that all the mismatching bits are zero in the
1438 		 * new tnode's key.
1439 		 */
1440 
1441 		/* Note: We aren't very concerned about the piece of the key
1442 		 * that precede pn->pos+pn->bits, since these have already been
1443 		 * checked. The bits after cn->pos aren't checked since these are
1444 		 * by definition "unknown" at this point. Thus, what we want to
1445 		 * see is if we are about to enter the "prefix matching" state,
1446 		 * and in that case verify that the skipped bits that will prevail
1447 		 * throughout this subtree are zero, as they have to be if we are
1448 		 * to find a matching prefix.
1449 		 */
1450 
1451 		node_prefix = MASK_PFX(cn->key, cn->pos);
1452 		key_prefix = MASK_PFX(key, cn->pos);
1453 		pref_mismatch = key_prefix^node_prefix;
1454 		mp = 0;
1455 
1456 		/* In short: If skipped bits in this node do not match the search
1457 		 * key, enter the "prefix matching" state.directly.
1458 		 */
1459 		if (pref_mismatch) {
1460 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1461 				mp++;
1462 				pref_mismatch = pref_mismatch <<1;
1463 			}
1464 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1465 
1466 			if (key_prefix != 0)
1467 				goto backtrace;
1468 
1469 			if (current_prefix_length >= cn->pos)
1470 				current_prefix_length = mp;
1471 		}
1472 #endif
1473 		pn = (struct tnode *)n; /* Descend */
1474 		chopped_off = 0;
1475 		continue;
1476 
1477 backtrace:
1478 		chopped_off++;
1479 
1480 		/* As zero don't change the child key (cindex) */
1481 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1482 			chopped_off++;
1483 
1484 		/* Decrease current_... with bits chopped off */
1485 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1486 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1487 
1488 		/*
1489 		 * Either we do the actual chop off according or if we have
1490 		 * chopped off all bits in this tnode walk up to our parent.
1491 		 */
1492 
1493 		if (chopped_off <= pn->bits) {
1494 			cindex &= ~(1 << (chopped_off-1));
1495 		} else {
1496 			if (NODE_PARENT(pn) == NULL)
1497 				goto failed;
1498 
1499 			/* Get Child's index */
1500 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1501 			pn = NODE_PARENT(pn);
1502 			chopped_off = 0;
1503 
1504 #ifdef CONFIG_IP_FIB_TRIE_STATS
1505 			t->stats.backtrack++;
1506 #endif
1507 			goto backtrace;
1508 		}
1509 	}
1510 failed:
1511 	ret = 1;
1512 found:
1513 	rcu_read_unlock();
1514 	return ret;
1515 }
1516 
1517 /* only called from updater side */
1518 static int trie_leaf_remove(struct trie *t, t_key key)
1519 {
1520 	t_key cindex;
1521 	struct tnode *tp = NULL;
1522 	struct node *n = t->trie;
1523 	struct leaf *l;
1524 
1525 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1526 
1527 	/* Note that in the case skipped bits, those bits are *not* checked!
1528 	 * When we finish this, we will have NULL or a T_LEAF, and the
1529 	 * T_LEAF may or may not match our key.
1530 	 */
1531 
1532 	while (n != NULL && IS_TNODE(n)) {
1533 		struct tnode *tn = (struct tnode *) n;
1534 		check_tnode(tn);
1535 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1536 
1537 		BUG_ON(n && NODE_PARENT(n) != tn);
1538 	}
1539 	l = (struct leaf *) n;
1540 
1541 	if (!n || !tkey_equals(l->key, key))
1542 		return 0;
1543 
1544 	/*
1545 	 * Key found.
1546 	 * Remove the leaf and rebalance the tree
1547 	 */
1548 
1549 	t->revision++;
1550 	t->size--;
1551 
1552 	tp = NODE_PARENT(n);
1553 	tnode_free((struct tnode *) n);
1554 
1555 	if (tp) {
1556 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1557 		put_child(t, (struct tnode *)tp, cindex, NULL);
1558 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1559 	} else
1560 		rcu_assign_pointer(t->trie, NULL);
1561 
1562 	return 1;
1563 }
1564 
1565 /*
1566  * Caller must hold RTNL.
1567  */
1568 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1569 {
1570 	struct trie *t = (struct trie *) tb->tb_data;
1571 	u32 key, mask;
1572 	int plen = cfg->fc_dst_len;
1573 	u8 tos = cfg->fc_tos;
1574 	struct fib_alias *fa, *fa_to_delete;
1575 	struct list_head *fa_head;
1576 	struct leaf *l;
1577 	struct leaf_info *li;
1578 
1579 	if (plen > 32)
1580 		return -EINVAL;
1581 
1582 	key = ntohl(cfg->fc_dst);
1583 	mask = ntohl(inet_make_mask(plen));
1584 
1585 	if (key & ~mask)
1586 		return -EINVAL;
1587 
1588 	key = key & mask;
1589 	l = fib_find_node(t, key);
1590 
1591 	if (!l)
1592 		return -ESRCH;
1593 
1594 	fa_head = get_fa_head(l, plen);
1595 	fa = fib_find_alias(fa_head, tos, 0);
1596 
1597 	if (!fa)
1598 		return -ESRCH;
1599 
1600 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1601 
1602 	fa_to_delete = NULL;
1603 	fa_head = fa->fa_list.prev;
1604 
1605 	list_for_each_entry(fa, fa_head, fa_list) {
1606 		struct fib_info *fi = fa->fa_info;
1607 
1608 		if (fa->fa_tos != tos)
1609 			break;
1610 
1611 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1612 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1613 		     fa->fa_scope == cfg->fc_scope) &&
1614 		    (!cfg->fc_protocol ||
1615 		     fi->fib_protocol == cfg->fc_protocol) &&
1616 		    fib_nh_match(cfg, fi) == 0) {
1617 			fa_to_delete = fa;
1618 			break;
1619 		}
1620 	}
1621 
1622 	if (!fa_to_delete)
1623 		return -ESRCH;
1624 
1625 	fa = fa_to_delete;
1626 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1627 		  &cfg->fc_nlinfo);
1628 
1629 	l = fib_find_node(t, key);
1630 	li = find_leaf_info(l, plen);
1631 
1632 	list_del_rcu(&fa->fa_list);
1633 
1634 	if (list_empty(fa_head)) {
1635 		hlist_del_rcu(&li->hlist);
1636 		free_leaf_info(li);
1637 	}
1638 
1639 	if (hlist_empty(&l->list))
1640 		trie_leaf_remove(t, key);
1641 
1642 	if (fa->fa_state & FA_S_ACCESSED)
1643 		rt_cache_flush(-1);
1644 
1645 	fib_release_info(fa->fa_info);
1646 	alias_free_mem_rcu(fa);
1647 	return 0;
1648 }
1649 
1650 static int trie_flush_list(struct trie *t, struct list_head *head)
1651 {
1652 	struct fib_alias *fa, *fa_node;
1653 	int found = 0;
1654 
1655 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1656 		struct fib_info *fi = fa->fa_info;
1657 
1658 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1659 			list_del_rcu(&fa->fa_list);
1660 			fib_release_info(fa->fa_info);
1661 			alias_free_mem_rcu(fa);
1662 			found++;
1663 		}
1664 	}
1665 	return found;
1666 }
1667 
1668 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1669 {
1670 	int found = 0;
1671 	struct hlist_head *lih = &l->list;
1672 	struct hlist_node *node, *tmp;
1673 	struct leaf_info *li = NULL;
1674 
1675 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1676 		found += trie_flush_list(t, &li->falh);
1677 
1678 		if (list_empty(&li->falh)) {
1679 			hlist_del_rcu(&li->hlist);
1680 			free_leaf_info(li);
1681 		}
1682 	}
1683 	return found;
1684 }
1685 
1686 /* rcu_read_lock needs to be hold by caller from readside */
1687 
1688 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1689 {
1690 	struct node *c = (struct node *) thisleaf;
1691 	struct tnode *p;
1692 	int idx;
1693 	struct node *trie = rcu_dereference(t->trie);
1694 
1695 	if (c == NULL) {
1696 		if (trie == NULL)
1697 			return NULL;
1698 
1699 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1700 			return (struct leaf *) trie;
1701 
1702 		p = (struct tnode*) trie;  /* Start */
1703 	} else
1704 		p = (struct tnode *) NODE_PARENT(c);
1705 
1706 	while (p) {
1707 		int pos, last;
1708 
1709 		/*  Find the next child of the parent */
1710 		if (c)
1711 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1712 		else
1713 			pos = 0;
1714 
1715 		last = 1 << p->bits;
1716 		for (idx = pos; idx < last ; idx++) {
1717 			c = rcu_dereference(p->child[idx]);
1718 
1719 			if (!c)
1720 				continue;
1721 
1722 			/* Decend if tnode */
1723 			while (IS_TNODE(c)) {
1724 				p = (struct tnode *) c;
1725 				idx = 0;
1726 
1727 				/* Rightmost non-NULL branch */
1728 				if (p && IS_TNODE(p))
1729 					while (!(c = rcu_dereference(p->child[idx]))
1730 					       && idx < (1<<p->bits)) idx++;
1731 
1732 				/* Done with this tnode? */
1733 				if (idx >= (1 << p->bits) || !c)
1734 					goto up;
1735 			}
1736 			return (struct leaf *) c;
1737 		}
1738 up:
1739 		/* No more children go up one step  */
1740 		c = (struct node *) p;
1741 		p = (struct tnode *) NODE_PARENT(p);
1742 	}
1743 	return NULL; /* Ready. Root of trie */
1744 }
1745 
1746 /*
1747  * Caller must hold RTNL.
1748  */
1749 static int fn_trie_flush(struct fib_table *tb)
1750 {
1751 	struct trie *t = (struct trie *) tb->tb_data;
1752 	struct leaf *ll = NULL, *l = NULL;
1753 	int found = 0, h;
1754 
1755 	t->revision++;
1756 
1757 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1758 		found += trie_flush_leaf(t, l);
1759 
1760 		if (ll && hlist_empty(&ll->list))
1761 			trie_leaf_remove(t, ll->key);
1762 		ll = l;
1763 	}
1764 
1765 	if (ll && hlist_empty(&ll->list))
1766 		trie_leaf_remove(t, ll->key);
1767 
1768 	pr_debug("trie_flush found=%d\n", found);
1769 	return found;
1770 }
1771 
1772 static int trie_last_dflt = -1;
1773 
1774 static void
1775 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1776 {
1777 	struct trie *t = (struct trie *) tb->tb_data;
1778 	int order, last_idx;
1779 	struct fib_info *fi = NULL;
1780 	struct fib_info *last_resort;
1781 	struct fib_alias *fa = NULL;
1782 	struct list_head *fa_head;
1783 	struct leaf *l;
1784 
1785 	last_idx = -1;
1786 	last_resort = NULL;
1787 	order = -1;
1788 
1789 	rcu_read_lock();
1790 
1791 	l = fib_find_node(t, 0);
1792 	if (!l)
1793 		goto out;
1794 
1795 	fa_head = get_fa_head(l, 0);
1796 	if (!fa_head)
1797 		goto out;
1798 
1799 	if (list_empty(fa_head))
1800 		goto out;
1801 
1802 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1803 		struct fib_info *next_fi = fa->fa_info;
1804 
1805 		if (fa->fa_scope != res->scope ||
1806 		    fa->fa_type != RTN_UNICAST)
1807 			continue;
1808 
1809 		if (next_fi->fib_priority > res->fi->fib_priority)
1810 			break;
1811 		if (!next_fi->fib_nh[0].nh_gw ||
1812 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1813 			continue;
1814 		fa->fa_state |= FA_S_ACCESSED;
1815 
1816 		if (fi == NULL) {
1817 			if (next_fi != res->fi)
1818 				break;
1819 		} else if (!fib_detect_death(fi, order, &last_resort,
1820 					     &last_idx, &trie_last_dflt)) {
1821 			if (res->fi)
1822 				fib_info_put(res->fi);
1823 			res->fi = fi;
1824 			atomic_inc(&fi->fib_clntref);
1825 			trie_last_dflt = order;
1826 			goto out;
1827 		}
1828 		fi = next_fi;
1829 		order++;
1830 	}
1831 	if (order <= 0 || fi == NULL) {
1832 		trie_last_dflt = -1;
1833 		goto out;
1834 	}
1835 
1836 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1837 		if (res->fi)
1838 			fib_info_put(res->fi);
1839 		res->fi = fi;
1840 		atomic_inc(&fi->fib_clntref);
1841 		trie_last_dflt = order;
1842 		goto out;
1843 	}
1844 	if (last_idx >= 0) {
1845 		if (res->fi)
1846 			fib_info_put(res->fi);
1847 		res->fi = last_resort;
1848 		if (last_resort)
1849 			atomic_inc(&last_resort->fib_clntref);
1850 	}
1851 	trie_last_dflt = last_idx;
1852  out:;
1853 	rcu_read_unlock();
1854 }
1855 
1856 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1857 			   struct sk_buff *skb, struct netlink_callback *cb)
1858 {
1859 	int i, s_i;
1860 	struct fib_alias *fa;
1861 
1862 	__be32 xkey = htonl(key);
1863 
1864 	s_i = cb->args[4];
1865 	i = 0;
1866 
1867 	/* rcu_read_lock is hold by caller */
1868 
1869 	list_for_each_entry_rcu(fa, fah, fa_list) {
1870 		if (i < s_i) {
1871 			i++;
1872 			continue;
1873 		}
1874 		BUG_ON(!fa->fa_info);
1875 
1876 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1877 				  cb->nlh->nlmsg_seq,
1878 				  RTM_NEWROUTE,
1879 				  tb->tb_id,
1880 				  fa->fa_type,
1881 				  fa->fa_scope,
1882 				  xkey,
1883 				  plen,
1884 				  fa->fa_tos,
1885 				  fa->fa_info, 0) < 0) {
1886 			cb->args[4] = i;
1887 			return -1;
1888 		}
1889 		i++;
1890 	}
1891 	cb->args[4] = i;
1892 	return skb->len;
1893 }
1894 
1895 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1896 			     struct netlink_callback *cb)
1897 {
1898 	int h, s_h;
1899 	struct list_head *fa_head;
1900 	struct leaf *l = NULL;
1901 
1902 	s_h = cb->args[3];
1903 
1904 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1905 		if (h < s_h)
1906 			continue;
1907 		if (h > s_h)
1908 			memset(&cb->args[4], 0,
1909 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1910 
1911 		fa_head = get_fa_head(l, plen);
1912 
1913 		if (!fa_head)
1914 			continue;
1915 
1916 		if (list_empty(fa_head))
1917 			continue;
1918 
1919 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1920 			cb->args[3] = h;
1921 			return -1;
1922 		}
1923 	}
1924 	cb->args[3] = h;
1925 	return skb->len;
1926 }
1927 
1928 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1929 {
1930 	int m, s_m;
1931 	struct trie *t = (struct trie *) tb->tb_data;
1932 
1933 	s_m = cb->args[2];
1934 
1935 	rcu_read_lock();
1936 	for (m = 0; m <= 32; m++) {
1937 		if (m < s_m)
1938 			continue;
1939 		if (m > s_m)
1940 			memset(&cb->args[3], 0,
1941 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1942 
1943 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1944 			cb->args[2] = m;
1945 			goto out;
1946 		}
1947 	}
1948 	rcu_read_unlock();
1949 	cb->args[2] = m;
1950 	return skb->len;
1951 out:
1952 	rcu_read_unlock();
1953 	return -1;
1954 }
1955 
1956 /* Fix more generic FIB names for init later */
1957 
1958 #ifdef CONFIG_IP_MULTIPLE_TABLES
1959 struct fib_table * fib_hash_init(u32 id)
1960 #else
1961 struct fib_table * __init fib_hash_init(u32 id)
1962 #endif
1963 {
1964 	struct fib_table *tb;
1965 	struct trie *t;
1966 
1967 	if (fn_alias_kmem == NULL)
1968 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1969 						  sizeof(struct fib_alias),
1970 						  0, SLAB_HWCACHE_ALIGN,
1971 						  NULL, NULL);
1972 
1973 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1974 		     GFP_KERNEL);
1975 	if (tb == NULL)
1976 		return NULL;
1977 
1978 	tb->tb_id = id;
1979 	tb->tb_lookup = fn_trie_lookup;
1980 	tb->tb_insert = fn_trie_insert;
1981 	tb->tb_delete = fn_trie_delete;
1982 	tb->tb_flush = fn_trie_flush;
1983 	tb->tb_select_default = fn_trie_select_default;
1984 	tb->tb_dump = fn_trie_dump;
1985 	memset(tb->tb_data, 0, sizeof(struct trie));
1986 
1987 	t = (struct trie *) tb->tb_data;
1988 
1989 	trie_init(t);
1990 
1991 	if (id == RT_TABLE_LOCAL)
1992 		trie_local = t;
1993 	else if (id == RT_TABLE_MAIN)
1994 		trie_main = t;
1995 
1996 	if (id == RT_TABLE_LOCAL)
1997 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1998 
1999 	return tb;
2000 }
2001 
2002 #ifdef CONFIG_PROC_FS
2003 /* Depth first Trie walk iterator */
2004 struct fib_trie_iter {
2005 	struct tnode *tnode;
2006 	struct trie *trie;
2007 	unsigned index;
2008 	unsigned depth;
2009 };
2010 
2011 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2012 {
2013 	struct tnode *tn = iter->tnode;
2014 	unsigned cindex = iter->index;
2015 	struct tnode *p;
2016 
2017 	/* A single entry routing table */
2018 	if (!tn)
2019 		return NULL;
2020 
2021 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2022 		 iter->tnode, iter->index, iter->depth);
2023 rescan:
2024 	while (cindex < (1<<tn->bits)) {
2025 		struct node *n = tnode_get_child(tn, cindex);
2026 
2027 		if (n) {
2028 			if (IS_LEAF(n)) {
2029 				iter->tnode = tn;
2030 				iter->index = cindex + 1;
2031 			} else {
2032 				/* push down one level */
2033 				iter->tnode = (struct tnode *) n;
2034 				iter->index = 0;
2035 				++iter->depth;
2036 			}
2037 			return n;
2038 		}
2039 
2040 		++cindex;
2041 	}
2042 
2043 	/* Current node exhausted, pop back up */
2044 	p = NODE_PARENT(tn);
2045 	if (p) {
2046 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2047 		tn = p;
2048 		--iter->depth;
2049 		goto rescan;
2050 	}
2051 
2052 	/* got root? */
2053 	return NULL;
2054 }
2055 
2056 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2057 				       struct trie *t)
2058 {
2059 	struct node *n ;
2060 
2061 	if (!t)
2062 		return NULL;
2063 
2064 	n = rcu_dereference(t->trie);
2065 
2066 	if (!iter)
2067 		return NULL;
2068 
2069 	if (n) {
2070 		if (IS_TNODE(n)) {
2071 			iter->tnode = (struct tnode *) n;
2072 			iter->trie = t;
2073 			iter->index = 0;
2074 			iter->depth = 1;
2075 		} else {
2076 			iter->tnode = NULL;
2077 			iter->trie  = t;
2078 			iter->index = 0;
2079 			iter->depth = 0;
2080 		}
2081 		return n;
2082 	}
2083 	return NULL;
2084 }
2085 
2086 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2087 {
2088 	struct node *n;
2089 	struct fib_trie_iter iter;
2090 
2091 	memset(s, 0, sizeof(*s));
2092 
2093 	rcu_read_lock();
2094 	for (n = fib_trie_get_first(&iter, t); n;
2095 	     n = fib_trie_get_next(&iter)) {
2096 		if (IS_LEAF(n)) {
2097 			s->leaves++;
2098 			s->totdepth += iter.depth;
2099 			if (iter.depth > s->maxdepth)
2100 				s->maxdepth = iter.depth;
2101 		} else {
2102 			const struct tnode *tn = (const struct tnode *) n;
2103 			int i;
2104 
2105 			s->tnodes++;
2106 			if (tn->bits < MAX_STAT_DEPTH)
2107 				s->nodesizes[tn->bits]++;
2108 
2109 			for (i = 0; i < (1<<tn->bits); i++)
2110 				if (!tn->child[i])
2111 					s->nullpointers++;
2112 		}
2113 	}
2114 	rcu_read_unlock();
2115 }
2116 
2117 /*
2118  *	This outputs /proc/net/fib_triestats
2119  */
2120 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2121 {
2122 	unsigned i, max, pointers, bytes, avdepth;
2123 
2124 	if (stat->leaves)
2125 		avdepth = stat->totdepth*100 / stat->leaves;
2126 	else
2127 		avdepth = 0;
2128 
2129 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2130 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2131 
2132 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2133 
2134 	bytes = sizeof(struct leaf) * stat->leaves;
2135 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2136 	bytes += sizeof(struct tnode) * stat->tnodes;
2137 
2138 	max = MAX_STAT_DEPTH;
2139 	while (max > 0 && stat->nodesizes[max-1] == 0)
2140 		max--;
2141 
2142 	pointers = 0;
2143 	for (i = 1; i <= max; i++)
2144 		if (stat->nodesizes[i] != 0) {
2145 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2146 			pointers += (1<<i) * stat->nodesizes[i];
2147 		}
2148 	seq_putc(seq, '\n');
2149 	seq_printf(seq, "\tPointers: %d\n", pointers);
2150 
2151 	bytes += sizeof(struct node *) * pointers;
2152 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2153 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2154 
2155 #ifdef CONFIG_IP_FIB_TRIE_STATS
2156 	seq_printf(seq, "Counters:\n---------\n");
2157 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2158 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2159 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2160 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2161 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2162 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2163 #ifdef CLEAR_STATS
2164 	memset(&(t->stats), 0, sizeof(t->stats));
2165 #endif
2166 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2167 }
2168 
2169 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2170 {
2171 	struct trie_stat *stat;
2172 
2173 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2174 	if (!stat)
2175 		return -ENOMEM;
2176 
2177 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2178 		   sizeof(struct leaf), sizeof(struct tnode));
2179 
2180 	if (trie_local) {
2181 		seq_printf(seq, "Local:\n");
2182 		trie_collect_stats(trie_local, stat);
2183 		trie_show_stats(seq, stat);
2184 	}
2185 
2186 	if (trie_main) {
2187 		seq_printf(seq, "Main:\n");
2188 		trie_collect_stats(trie_main, stat);
2189 		trie_show_stats(seq, stat);
2190 	}
2191 	kfree(stat);
2192 
2193 	return 0;
2194 }
2195 
2196 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2197 {
2198 	return single_open(file, fib_triestat_seq_show, NULL);
2199 }
2200 
2201 static const struct file_operations fib_triestat_fops = {
2202 	.owner	= THIS_MODULE,
2203 	.open	= fib_triestat_seq_open,
2204 	.read	= seq_read,
2205 	.llseek	= seq_lseek,
2206 	.release = single_release,
2207 };
2208 
2209 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2210 				      loff_t pos)
2211 {
2212 	loff_t idx = 0;
2213 	struct node *n;
2214 
2215 	for (n = fib_trie_get_first(iter, trie_local);
2216 	     n; ++idx, n = fib_trie_get_next(iter)) {
2217 		if (pos == idx)
2218 			return n;
2219 	}
2220 
2221 	for (n = fib_trie_get_first(iter, trie_main);
2222 	     n; ++idx, n = fib_trie_get_next(iter)) {
2223 		if (pos == idx)
2224 			return n;
2225 	}
2226 	return NULL;
2227 }
2228 
2229 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2230 {
2231 	rcu_read_lock();
2232 	if (*pos == 0)
2233 		return SEQ_START_TOKEN;
2234 	return fib_trie_get_idx(seq->private, *pos - 1);
2235 }
2236 
2237 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2238 {
2239 	struct fib_trie_iter *iter = seq->private;
2240 	void *l = v;
2241 
2242 	++*pos;
2243 	if (v == SEQ_START_TOKEN)
2244 		return fib_trie_get_idx(iter, 0);
2245 
2246 	v = fib_trie_get_next(iter);
2247 	BUG_ON(v == l);
2248 	if (v)
2249 		return v;
2250 
2251 	/* continue scan in next trie */
2252 	if (iter->trie == trie_local)
2253 		return fib_trie_get_first(iter, trie_main);
2254 
2255 	return NULL;
2256 }
2257 
2258 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2259 {
2260 	rcu_read_unlock();
2261 }
2262 
2263 static void seq_indent(struct seq_file *seq, int n)
2264 {
2265 	while (n-- > 0) seq_puts(seq, "   ");
2266 }
2267 
2268 static inline const char *rtn_scope(enum rt_scope_t s)
2269 {
2270 	static char buf[32];
2271 
2272 	switch (s) {
2273 	case RT_SCOPE_UNIVERSE: return "universe";
2274 	case RT_SCOPE_SITE:	return "site";
2275 	case RT_SCOPE_LINK:	return "link";
2276 	case RT_SCOPE_HOST:	return "host";
2277 	case RT_SCOPE_NOWHERE:	return "nowhere";
2278 	default:
2279 		snprintf(buf, sizeof(buf), "scope=%d", s);
2280 		return buf;
2281 	}
2282 }
2283 
2284 static const char *rtn_type_names[__RTN_MAX] = {
2285 	[RTN_UNSPEC] = "UNSPEC",
2286 	[RTN_UNICAST] = "UNICAST",
2287 	[RTN_LOCAL] = "LOCAL",
2288 	[RTN_BROADCAST] = "BROADCAST",
2289 	[RTN_ANYCAST] = "ANYCAST",
2290 	[RTN_MULTICAST] = "MULTICAST",
2291 	[RTN_BLACKHOLE] = "BLACKHOLE",
2292 	[RTN_UNREACHABLE] = "UNREACHABLE",
2293 	[RTN_PROHIBIT] = "PROHIBIT",
2294 	[RTN_THROW] = "THROW",
2295 	[RTN_NAT] = "NAT",
2296 	[RTN_XRESOLVE] = "XRESOLVE",
2297 };
2298 
2299 static inline const char *rtn_type(unsigned t)
2300 {
2301 	static char buf[32];
2302 
2303 	if (t < __RTN_MAX && rtn_type_names[t])
2304 		return rtn_type_names[t];
2305 	snprintf(buf, sizeof(buf), "type %d", t);
2306 	return buf;
2307 }
2308 
2309 /* Pretty print the trie */
2310 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2311 {
2312 	const struct fib_trie_iter *iter = seq->private;
2313 	struct node *n = v;
2314 
2315 	if (v == SEQ_START_TOKEN)
2316 		return 0;
2317 
2318 	if (!NODE_PARENT(n)) {
2319 		if (iter->trie == trie_local)
2320 			seq_puts(seq, "<local>:\n");
2321 		else
2322 			seq_puts(seq, "<main>:\n");
2323 	}
2324 
2325 	if (IS_TNODE(n)) {
2326 		struct tnode *tn = (struct tnode *) n;
2327 		__be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2328 
2329 		seq_indent(seq, iter->depth-1);
2330 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2331 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2332 			   tn->empty_children);
2333 
2334 	} else {
2335 		struct leaf *l = (struct leaf *) n;
2336 		int i;
2337 		__be32 val = htonl(l->key);
2338 
2339 		seq_indent(seq, iter->depth);
2340 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2341 		for (i = 32; i >= 0; i--) {
2342 			struct leaf_info *li = find_leaf_info(l, i);
2343 			if (li) {
2344 				struct fib_alias *fa;
2345 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2346 					seq_indent(seq, iter->depth+1);
2347 					seq_printf(seq, "  /%d %s %s", i,
2348 						   rtn_scope(fa->fa_scope),
2349 						   rtn_type(fa->fa_type));
2350 					if (fa->fa_tos)
2351 						seq_printf(seq, "tos =%d\n",
2352 							   fa->fa_tos);
2353 					seq_putc(seq, '\n');
2354 				}
2355 			}
2356 		}
2357 	}
2358 
2359 	return 0;
2360 }
2361 
2362 static const struct seq_operations fib_trie_seq_ops = {
2363 	.start  = fib_trie_seq_start,
2364 	.next   = fib_trie_seq_next,
2365 	.stop   = fib_trie_seq_stop,
2366 	.show   = fib_trie_seq_show,
2367 };
2368 
2369 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2370 {
2371 	struct seq_file *seq;
2372 	int rc = -ENOMEM;
2373 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2374 
2375 	if (!s)
2376 		goto out;
2377 
2378 	rc = seq_open(file, &fib_trie_seq_ops);
2379 	if (rc)
2380 		goto out_kfree;
2381 
2382 	seq	     = file->private_data;
2383 	seq->private = s;
2384 	memset(s, 0, sizeof(*s));
2385 out:
2386 	return rc;
2387 out_kfree:
2388 	kfree(s);
2389 	goto out;
2390 }
2391 
2392 static const struct file_operations fib_trie_fops = {
2393 	.owner  = THIS_MODULE,
2394 	.open   = fib_trie_seq_open,
2395 	.read   = seq_read,
2396 	.llseek = seq_lseek,
2397 	.release = seq_release_private,
2398 };
2399 
2400 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2401 {
2402 	static unsigned type2flags[RTN_MAX + 1] = {
2403 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2404 	};
2405 	unsigned flags = type2flags[type];
2406 
2407 	if (fi && fi->fib_nh->nh_gw)
2408 		flags |= RTF_GATEWAY;
2409 	if (mask == htonl(0xFFFFFFFF))
2410 		flags |= RTF_HOST;
2411 	flags |= RTF_UP;
2412 	return flags;
2413 }
2414 
2415 /*
2416  *	This outputs /proc/net/route.
2417  *	The format of the file is not supposed to be changed
2418  * 	and needs to be same as fib_hash output to avoid breaking
2419  *	legacy utilities
2420  */
2421 static int fib_route_seq_show(struct seq_file *seq, void *v)
2422 {
2423 	const struct fib_trie_iter *iter = seq->private;
2424 	struct leaf *l = v;
2425 	int i;
2426 	char bf[128];
2427 
2428 	if (v == SEQ_START_TOKEN) {
2429 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2430 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2431 			   "\tWindow\tIRTT");
2432 		return 0;
2433 	}
2434 
2435 	if (iter->trie == trie_local)
2436 		return 0;
2437 	if (IS_TNODE(l))
2438 		return 0;
2439 
2440 	for (i=32; i>=0; i--) {
2441 		struct leaf_info *li = find_leaf_info(l, i);
2442 		struct fib_alias *fa;
2443 		__be32 mask, prefix;
2444 
2445 		if (!li)
2446 			continue;
2447 
2448 		mask = inet_make_mask(li->plen);
2449 		prefix = htonl(l->key);
2450 
2451 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2452 			const struct fib_info *fi = fa->fa_info;
2453 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2454 
2455 			if (fa->fa_type == RTN_BROADCAST
2456 			    || fa->fa_type == RTN_MULTICAST)
2457 				continue;
2458 
2459 			if (fi)
2460 				snprintf(bf, sizeof(bf),
2461 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2462 					 fi->fib_dev ? fi->fib_dev->name : "*",
2463 					 prefix,
2464 					 fi->fib_nh->nh_gw, flags, 0, 0,
2465 					 fi->fib_priority,
2466 					 mask,
2467 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2468 					 fi->fib_window,
2469 					 fi->fib_rtt >> 3);
2470 			else
2471 				snprintf(bf, sizeof(bf),
2472 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2473 					 prefix, 0, flags, 0, 0, 0,
2474 					 mask, 0, 0, 0);
2475 
2476 			seq_printf(seq, "%-127s\n", bf);
2477 		}
2478 	}
2479 
2480 	return 0;
2481 }
2482 
2483 static const struct seq_operations fib_route_seq_ops = {
2484 	.start  = fib_trie_seq_start,
2485 	.next   = fib_trie_seq_next,
2486 	.stop   = fib_trie_seq_stop,
2487 	.show   = fib_route_seq_show,
2488 };
2489 
2490 static int fib_route_seq_open(struct inode *inode, struct file *file)
2491 {
2492 	struct seq_file *seq;
2493 	int rc = -ENOMEM;
2494 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2495 
2496 	if (!s)
2497 		goto out;
2498 
2499 	rc = seq_open(file, &fib_route_seq_ops);
2500 	if (rc)
2501 		goto out_kfree;
2502 
2503 	seq	     = file->private_data;
2504 	seq->private = s;
2505 	memset(s, 0, sizeof(*s));
2506 out:
2507 	return rc;
2508 out_kfree:
2509 	kfree(s);
2510 	goto out;
2511 }
2512 
2513 static const struct file_operations fib_route_fops = {
2514 	.owner  = THIS_MODULE,
2515 	.open   = fib_route_seq_open,
2516 	.read   = seq_read,
2517 	.llseek = seq_lseek,
2518 	.release = seq_release_private,
2519 };
2520 
2521 int __init fib_proc_init(void)
2522 {
2523 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2524 		goto out1;
2525 
2526 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2527 		goto out2;
2528 
2529 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2530 		goto out3;
2531 
2532 	return 0;
2533 
2534 out3:
2535 	proc_net_remove("fib_triestat");
2536 out2:
2537 	proc_net_remove("fib_trie");
2538 out1:
2539 	return -ENOMEM;
2540 }
2541 
2542 void __init fib_proc_exit(void)
2543 {
2544 	proc_net_remove("fib_trie");
2545 	proc_net_remove("fib_triestat");
2546 	proc_net_remove("route");
2547 }
2548 
2549 #endif /* CONFIG_PROC_FS */
2550