xref: /linux/net/ipv4/fib_trie.c (revision c145211d1f9e2ef19e7b4c2b943f68366daa97af)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <linux/slab.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #define MAX_STAT_DEPTH 32
85 
86 #define KEYLENGTH (8*sizeof(t_key))
87 
88 typedef unsigned int t_key;
89 
90 #define T_TNODE 0
91 #define T_LEAF  1
92 #define NODE_TYPE_MASK	0x1UL
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 
95 #define IS_TNODE(n) (!(n->parent & T_LEAF))
96 #define IS_LEAF(n) (n->parent & T_LEAF)
97 
98 struct node {
99 	unsigned long parent;
100 	t_key key;
101 };
102 
103 struct leaf {
104 	unsigned long parent;
105 	t_key key;
106 	struct hlist_head list;
107 	struct rcu_head rcu;
108 };
109 
110 struct leaf_info {
111 	struct hlist_node hlist;
112 	struct rcu_head rcu;
113 	int plen;
114 	struct list_head falh;
115 };
116 
117 struct tnode {
118 	unsigned long parent;
119 	t_key key;
120 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122 	unsigned int full_children;	/* KEYLENGTH bits needed */
123 	unsigned int empty_children;	/* KEYLENGTH bits needed */
124 	union {
125 		struct rcu_head rcu;
126 		struct work_struct work;
127 		struct tnode *tnode_free;
128 	};
129 	struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int prefixes;
150 	unsigned int nodesizes[MAX_STAT_DEPTH];
151 };
152 
153 struct trie {
154 	struct node *trie;
155 #ifdef CONFIG_IP_FIB_TRIE_STATS
156 	struct trie_use_stats stats;
157 #endif
158 };
159 
160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 				  int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 /* tnodes to free after resize(); protected by RTNL */
167 static struct tnode *tnode_free_head;
168 static size_t tnode_free_size;
169 
170 /*
171  * synchronize_rcu after call_rcu for that many pages; it should be especially
172  * useful before resizing the root node with PREEMPT_NONE configs; the value was
173  * obtained experimentally, aiming to avoid visible slowdown.
174  */
175 static const int sync_pages = 128;
176 
177 static struct kmem_cache *fn_alias_kmem __read_mostly;
178 static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 
180 static inline struct tnode *node_parent(struct node *node)
181 {
182 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183 }
184 
185 static inline struct tnode *node_parent_rcu(struct node *node)
186 {
187 	struct tnode *ret = node_parent(node);
188 
189 	return rcu_dereference(ret);
190 }
191 
192 /* Same as rcu_assign_pointer
193  * but that macro() assumes that value is a pointer.
194  */
195 static inline void node_set_parent(struct node *node, struct tnode *ptr)
196 {
197 	smp_wmb();
198 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
199 }
200 
201 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
202 {
203 	BUG_ON(i >= 1U << tn->bits);
204 
205 	return tn->child[i];
206 }
207 
208 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209 {
210 	struct node *ret = tnode_get_child(tn, i);
211 
212 	return rcu_dereference_check(ret,
213 				     rcu_read_lock_held() ||
214 				     lockdep_rtnl_is_held());
215 }
216 
217 static inline int tnode_child_length(const struct tnode *tn)
218 {
219 	return 1 << tn->bits;
220 }
221 
222 static inline t_key mask_pfx(t_key k, unsigned short l)
223 {
224 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
225 }
226 
227 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
228 {
229 	if (offset < KEYLENGTH)
230 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
231 	else
232 		return 0;
233 }
234 
235 static inline int tkey_equals(t_key a, t_key b)
236 {
237 	return a == b;
238 }
239 
240 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
241 {
242 	if (bits == 0 || offset >= KEYLENGTH)
243 		return 1;
244 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
245 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
246 }
247 
248 static inline int tkey_mismatch(t_key a, int offset, t_key b)
249 {
250 	t_key diff = a ^ b;
251 	int i = offset;
252 
253 	if (!diff)
254 		return 0;
255 	while ((diff << i) >> (KEYLENGTH-1) == 0)
256 		i++;
257 	return i;
258 }
259 
260 /*
261   To understand this stuff, an understanding of keys and all their bits is
262   necessary. Every node in the trie has a key associated with it, but not
263   all of the bits in that key are significant.
264 
265   Consider a node 'n' and its parent 'tp'.
266 
267   If n is a leaf, every bit in its key is significant. Its presence is
268   necessitated by path compression, since during a tree traversal (when
269   searching for a leaf - unless we are doing an insertion) we will completely
270   ignore all skipped bits we encounter. Thus we need to verify, at the end of
271   a potentially successful search, that we have indeed been walking the
272   correct key path.
273 
274   Note that we can never "miss" the correct key in the tree if present by
275   following the wrong path. Path compression ensures that segments of the key
276   that are the same for all keys with a given prefix are skipped, but the
277   skipped part *is* identical for each node in the subtrie below the skipped
278   bit! trie_insert() in this implementation takes care of that - note the
279   call to tkey_sub_equals() in trie_insert().
280 
281   if n is an internal node - a 'tnode' here, the various parts of its key
282   have many different meanings.
283 
284   Example:
285   _________________________________________________________________
286   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287   -----------------------------------------------------------------
288     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
289 
290   _________________________________________________________________
291   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292   -----------------------------------------------------------------
293    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
294 
295   tp->pos = 7
296   tp->bits = 3
297   n->pos = 15
298   n->bits = 4
299 
300   First, let's just ignore the bits that come before the parent tp, that is
301   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302   not use them for anything.
303 
304   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305   index into the parent's child array. That is, they will be used to find
306   'n' among tp's children.
307 
308   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309   for the node n.
310 
311   All the bits we have seen so far are significant to the node n. The rest
312   of the bits are really not needed or indeed known in n->key.
313 
314   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315   n's child array, and will of course be different for each child.
316 
317 
318   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319   at this point.
320 
321 */
322 
323 static inline void check_tnode(const struct tnode *tn)
324 {
325 	WARN_ON(tn && tn->pos+tn->bits > 32);
326 }
327 
328 static const int halve_threshold = 25;
329 static const int inflate_threshold = 50;
330 static const int halve_threshold_root = 15;
331 static const int inflate_threshold_root = 30;
332 
333 static void __alias_free_mem(struct rcu_head *head)
334 {
335 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
336 	kmem_cache_free(fn_alias_kmem, fa);
337 }
338 
339 static inline void alias_free_mem_rcu(struct fib_alias *fa)
340 {
341 	call_rcu(&fa->rcu, __alias_free_mem);
342 }
343 
344 static void __leaf_free_rcu(struct rcu_head *head)
345 {
346 	struct leaf *l = container_of(head, struct leaf, rcu);
347 	kmem_cache_free(trie_leaf_kmem, l);
348 }
349 
350 static inline void free_leaf(struct leaf *l)
351 {
352 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
353 }
354 
355 static void __leaf_info_free_rcu(struct rcu_head *head)
356 {
357 	kfree(container_of(head, struct leaf_info, rcu));
358 }
359 
360 static inline void free_leaf_info(struct leaf_info *leaf)
361 {
362 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
363 }
364 
365 static struct tnode *tnode_alloc(size_t size)
366 {
367 	if (size <= PAGE_SIZE)
368 		return kzalloc(size, GFP_KERNEL);
369 	else
370 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
371 }
372 
373 static void __tnode_vfree(struct work_struct *arg)
374 {
375 	struct tnode *tn = container_of(arg, struct tnode, work);
376 	vfree(tn);
377 }
378 
379 static void __tnode_free_rcu(struct rcu_head *head)
380 {
381 	struct tnode *tn = container_of(head, struct tnode, rcu);
382 	size_t size = sizeof(struct tnode) +
383 		      (sizeof(struct node *) << tn->bits);
384 
385 	if (size <= PAGE_SIZE)
386 		kfree(tn);
387 	else {
388 		INIT_WORK(&tn->work, __tnode_vfree);
389 		schedule_work(&tn->work);
390 	}
391 }
392 
393 static inline void tnode_free(struct tnode *tn)
394 {
395 	if (IS_LEAF(tn))
396 		free_leaf((struct leaf *) tn);
397 	else
398 		call_rcu(&tn->rcu, __tnode_free_rcu);
399 }
400 
401 static void tnode_free_safe(struct tnode *tn)
402 {
403 	BUG_ON(IS_LEAF(tn));
404 	tn->tnode_free = tnode_free_head;
405 	tnode_free_head = tn;
406 	tnode_free_size += sizeof(struct tnode) +
407 			   (sizeof(struct node *) << tn->bits);
408 }
409 
410 static void tnode_free_flush(void)
411 {
412 	struct tnode *tn;
413 
414 	while ((tn = tnode_free_head)) {
415 		tnode_free_head = tn->tnode_free;
416 		tn->tnode_free = NULL;
417 		tnode_free(tn);
418 	}
419 
420 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
421 		tnode_free_size = 0;
422 		synchronize_rcu();
423 	}
424 }
425 
426 static struct leaf *leaf_new(void)
427 {
428 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
429 	if (l) {
430 		l->parent = T_LEAF;
431 		INIT_HLIST_HEAD(&l->list);
432 	}
433 	return l;
434 }
435 
436 static struct leaf_info *leaf_info_new(int plen)
437 {
438 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
439 	if (li) {
440 		li->plen = plen;
441 		INIT_LIST_HEAD(&li->falh);
442 	}
443 	return li;
444 }
445 
446 static struct tnode *tnode_new(t_key key, int pos, int bits)
447 {
448 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
449 	struct tnode *tn = tnode_alloc(sz);
450 
451 	if (tn) {
452 		tn->parent = T_TNODE;
453 		tn->pos = pos;
454 		tn->bits = bits;
455 		tn->key = key;
456 		tn->full_children = 0;
457 		tn->empty_children = 1<<bits;
458 	}
459 
460 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
461 		 (unsigned long) (sizeof(struct node) << bits));
462 	return tn;
463 }
464 
465 /*
466  * Check whether a tnode 'n' is "full", i.e. it is an internal node
467  * and no bits are skipped. See discussion in dyntree paper p. 6
468  */
469 
470 static inline int tnode_full(const struct tnode *tn, const struct node *n)
471 {
472 	if (n == NULL || IS_LEAF(n))
473 		return 0;
474 
475 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
476 }
477 
478 static inline void put_child(struct trie *t, struct tnode *tn, int i,
479 			     struct node *n)
480 {
481 	tnode_put_child_reorg(tn, i, n, -1);
482 }
483 
484  /*
485   * Add a child at position i overwriting the old value.
486   * Update the value of full_children and empty_children.
487   */
488 
489 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
490 				  int wasfull)
491 {
492 	struct node *chi = tn->child[i];
493 	int isfull;
494 
495 	BUG_ON(i >= 1<<tn->bits);
496 
497 	/* update emptyChildren */
498 	if (n == NULL && chi != NULL)
499 		tn->empty_children++;
500 	else if (n != NULL && chi == NULL)
501 		tn->empty_children--;
502 
503 	/* update fullChildren */
504 	if (wasfull == -1)
505 		wasfull = tnode_full(tn, chi);
506 
507 	isfull = tnode_full(tn, n);
508 	if (wasfull && !isfull)
509 		tn->full_children--;
510 	else if (!wasfull && isfull)
511 		tn->full_children++;
512 
513 	if (n)
514 		node_set_parent(n, tn);
515 
516 	rcu_assign_pointer(tn->child[i], n);
517 }
518 
519 #define MAX_WORK 10
520 static struct node *resize(struct trie *t, struct tnode *tn)
521 {
522 	int i;
523 	struct tnode *old_tn;
524 	int inflate_threshold_use;
525 	int halve_threshold_use;
526 	int max_work;
527 
528 	if (!tn)
529 		return NULL;
530 
531 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
532 		 tn, inflate_threshold, halve_threshold);
533 
534 	/* No children */
535 	if (tn->empty_children == tnode_child_length(tn)) {
536 		tnode_free_safe(tn);
537 		return NULL;
538 	}
539 	/* One child */
540 	if (tn->empty_children == tnode_child_length(tn) - 1)
541 		goto one_child;
542 	/*
543 	 * Double as long as the resulting node has a number of
544 	 * nonempty nodes that are above the threshold.
545 	 */
546 
547 	/*
548 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
549 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
550 	 * Telecommunications, page 6:
551 	 * "A node is doubled if the ratio of non-empty children to all
552 	 * children in the *doubled* node is at least 'high'."
553 	 *
554 	 * 'high' in this instance is the variable 'inflate_threshold'. It
555 	 * is expressed as a percentage, so we multiply it with
556 	 * tnode_child_length() and instead of multiplying by 2 (since the
557 	 * child array will be doubled by inflate()) and multiplying
558 	 * the left-hand side by 100 (to handle the percentage thing) we
559 	 * multiply the left-hand side by 50.
560 	 *
561 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
562 	 * - tn->empty_children is of course the number of non-null children
563 	 * in the current node. tn->full_children is the number of "full"
564 	 * children, that is non-null tnodes with a skip value of 0.
565 	 * All of those will be doubled in the resulting inflated tnode, so
566 	 * we just count them one extra time here.
567 	 *
568 	 * A clearer way to write this would be:
569 	 *
570 	 * to_be_doubled = tn->full_children;
571 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
572 	 *     tn->full_children;
573 	 *
574 	 * new_child_length = tnode_child_length(tn) * 2;
575 	 *
576 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
577 	 *      new_child_length;
578 	 * if (new_fill_factor >= inflate_threshold)
579 	 *
580 	 * ...and so on, tho it would mess up the while () loop.
581 	 *
582 	 * anyway,
583 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
584 	 *      inflate_threshold
585 	 *
586 	 * avoid a division:
587 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
588 	 *      inflate_threshold * new_child_length
589 	 *
590 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
591 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
592 	 *    tn->full_children) >= inflate_threshold * new_child_length
593 	 *
594 	 * expand new_child_length:
595 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
596 	 *    tn->full_children) >=
597 	 *      inflate_threshold * tnode_child_length(tn) * 2
598 	 *
599 	 * shorten again:
600 	 * 50 * (tn->full_children + tnode_child_length(tn) -
601 	 *    tn->empty_children) >= inflate_threshold *
602 	 *    tnode_child_length(tn)
603 	 *
604 	 */
605 
606 	check_tnode(tn);
607 
608 	/* Keep root node larger  */
609 
610 	if (!node_parent((struct node*) tn)) {
611 		inflate_threshold_use = inflate_threshold_root;
612 		halve_threshold_use = halve_threshold_root;
613 	}
614 	else {
615 		inflate_threshold_use = inflate_threshold;
616 		halve_threshold_use = halve_threshold;
617 	}
618 
619 	max_work = MAX_WORK;
620 	while ((tn->full_children > 0 &&  max_work-- &&
621 		50 * (tn->full_children + tnode_child_length(tn)
622 		      - tn->empty_children)
623 		>= inflate_threshold_use * tnode_child_length(tn))) {
624 
625 		old_tn = tn;
626 		tn = inflate(t, tn);
627 
628 		if (IS_ERR(tn)) {
629 			tn = old_tn;
630 #ifdef CONFIG_IP_FIB_TRIE_STATS
631 			t->stats.resize_node_skipped++;
632 #endif
633 			break;
634 		}
635 	}
636 
637 	check_tnode(tn);
638 
639 	/* Return if at least one inflate is run */
640 	if( max_work != MAX_WORK)
641 		return (struct node *) tn;
642 
643 	/*
644 	 * Halve as long as the number of empty children in this
645 	 * node is above threshold.
646 	 */
647 
648 	max_work = MAX_WORK;
649 	while (tn->bits > 1 &&  max_work-- &&
650 	       100 * (tnode_child_length(tn) - tn->empty_children) <
651 	       halve_threshold_use * tnode_child_length(tn)) {
652 
653 		old_tn = tn;
654 		tn = halve(t, tn);
655 		if (IS_ERR(tn)) {
656 			tn = old_tn;
657 #ifdef CONFIG_IP_FIB_TRIE_STATS
658 			t->stats.resize_node_skipped++;
659 #endif
660 			break;
661 		}
662 	}
663 
664 
665 	/* Only one child remains */
666 	if (tn->empty_children == tnode_child_length(tn) - 1) {
667 one_child:
668 		for (i = 0; i < tnode_child_length(tn); i++) {
669 			struct node *n;
670 
671 			n = tn->child[i];
672 			if (!n)
673 				continue;
674 
675 			/* compress one level */
676 
677 			node_set_parent(n, NULL);
678 			tnode_free_safe(tn);
679 			return n;
680 		}
681 	}
682 	return (struct node *) tn;
683 }
684 
685 static struct tnode *inflate(struct trie *t, struct tnode *tn)
686 {
687 	struct tnode *oldtnode = tn;
688 	int olen = tnode_child_length(tn);
689 	int i;
690 
691 	pr_debug("In inflate\n");
692 
693 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
694 
695 	if (!tn)
696 		return ERR_PTR(-ENOMEM);
697 
698 	/*
699 	 * Preallocate and store tnodes before the actual work so we
700 	 * don't get into an inconsistent state if memory allocation
701 	 * fails. In case of failure we return the oldnode and  inflate
702 	 * of tnode is ignored.
703 	 */
704 
705 	for (i = 0; i < olen; i++) {
706 		struct tnode *inode;
707 
708 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
709 		if (inode &&
710 		    IS_TNODE(inode) &&
711 		    inode->pos == oldtnode->pos + oldtnode->bits &&
712 		    inode->bits > 1) {
713 			struct tnode *left, *right;
714 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
715 
716 			left = tnode_new(inode->key&(~m), inode->pos + 1,
717 					 inode->bits - 1);
718 			if (!left)
719 				goto nomem;
720 
721 			right = tnode_new(inode->key|m, inode->pos + 1,
722 					  inode->bits - 1);
723 
724 			if (!right) {
725 				tnode_free(left);
726 				goto nomem;
727 			}
728 
729 			put_child(t, tn, 2*i, (struct node *) left);
730 			put_child(t, tn, 2*i+1, (struct node *) right);
731 		}
732 	}
733 
734 	for (i = 0; i < olen; i++) {
735 		struct tnode *inode;
736 		struct node *node = tnode_get_child(oldtnode, i);
737 		struct tnode *left, *right;
738 		int size, j;
739 
740 		/* An empty child */
741 		if (node == NULL)
742 			continue;
743 
744 		/* A leaf or an internal node with skipped bits */
745 
746 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
747 		   tn->pos + tn->bits - 1) {
748 			if (tkey_extract_bits(node->key,
749 					      oldtnode->pos + oldtnode->bits,
750 					      1) == 0)
751 				put_child(t, tn, 2*i, node);
752 			else
753 				put_child(t, tn, 2*i+1, node);
754 			continue;
755 		}
756 
757 		/* An internal node with two children */
758 		inode = (struct tnode *) node;
759 
760 		if (inode->bits == 1) {
761 			put_child(t, tn, 2*i, inode->child[0]);
762 			put_child(t, tn, 2*i+1, inode->child[1]);
763 
764 			tnode_free_safe(inode);
765 			continue;
766 		}
767 
768 		/* An internal node with more than two children */
769 
770 		/* We will replace this node 'inode' with two new
771 		 * ones, 'left' and 'right', each with half of the
772 		 * original children. The two new nodes will have
773 		 * a position one bit further down the key and this
774 		 * means that the "significant" part of their keys
775 		 * (see the discussion near the top of this file)
776 		 * will differ by one bit, which will be "0" in
777 		 * left's key and "1" in right's key. Since we are
778 		 * moving the key position by one step, the bit that
779 		 * we are moving away from - the bit at position
780 		 * (inode->pos) - is the one that will differ between
781 		 * left and right. So... we synthesize that bit in the
782 		 * two  new keys.
783 		 * The mask 'm' below will be a single "one" bit at
784 		 * the position (inode->pos)
785 		 */
786 
787 		/* Use the old key, but set the new significant
788 		 *   bit to zero.
789 		 */
790 
791 		left = (struct tnode *) tnode_get_child(tn, 2*i);
792 		put_child(t, tn, 2*i, NULL);
793 
794 		BUG_ON(!left);
795 
796 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
797 		put_child(t, tn, 2*i+1, NULL);
798 
799 		BUG_ON(!right);
800 
801 		size = tnode_child_length(left);
802 		for (j = 0; j < size; j++) {
803 			put_child(t, left, j, inode->child[j]);
804 			put_child(t, right, j, inode->child[j + size]);
805 		}
806 		put_child(t, tn, 2*i, resize(t, left));
807 		put_child(t, tn, 2*i+1, resize(t, right));
808 
809 		tnode_free_safe(inode);
810 	}
811 	tnode_free_safe(oldtnode);
812 	return tn;
813 nomem:
814 	{
815 		int size = tnode_child_length(tn);
816 		int j;
817 
818 		for (j = 0; j < size; j++)
819 			if (tn->child[j])
820 				tnode_free((struct tnode *)tn->child[j]);
821 
822 		tnode_free(tn);
823 
824 		return ERR_PTR(-ENOMEM);
825 	}
826 }
827 
828 static struct tnode *halve(struct trie *t, struct tnode *tn)
829 {
830 	struct tnode *oldtnode = tn;
831 	struct node *left, *right;
832 	int i;
833 	int olen = tnode_child_length(tn);
834 
835 	pr_debug("In halve\n");
836 
837 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
838 
839 	if (!tn)
840 		return ERR_PTR(-ENOMEM);
841 
842 	/*
843 	 * Preallocate and store tnodes before the actual work so we
844 	 * don't get into an inconsistent state if memory allocation
845 	 * fails. In case of failure we return the oldnode and halve
846 	 * of tnode is ignored.
847 	 */
848 
849 	for (i = 0; i < olen; i += 2) {
850 		left = tnode_get_child(oldtnode, i);
851 		right = tnode_get_child(oldtnode, i+1);
852 
853 		/* Two nonempty children */
854 		if (left && right) {
855 			struct tnode *newn;
856 
857 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
858 
859 			if (!newn)
860 				goto nomem;
861 
862 			put_child(t, tn, i/2, (struct node *)newn);
863 		}
864 
865 	}
866 
867 	for (i = 0; i < olen; i += 2) {
868 		struct tnode *newBinNode;
869 
870 		left = tnode_get_child(oldtnode, i);
871 		right = tnode_get_child(oldtnode, i+1);
872 
873 		/* At least one of the children is empty */
874 		if (left == NULL) {
875 			if (right == NULL)    /* Both are empty */
876 				continue;
877 			put_child(t, tn, i/2, right);
878 			continue;
879 		}
880 
881 		if (right == NULL) {
882 			put_child(t, tn, i/2, left);
883 			continue;
884 		}
885 
886 		/* Two nonempty children */
887 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
888 		put_child(t, tn, i/2, NULL);
889 		put_child(t, newBinNode, 0, left);
890 		put_child(t, newBinNode, 1, right);
891 		put_child(t, tn, i/2, resize(t, newBinNode));
892 	}
893 	tnode_free_safe(oldtnode);
894 	return tn;
895 nomem:
896 	{
897 		int size = tnode_child_length(tn);
898 		int j;
899 
900 		for (j = 0; j < size; j++)
901 			if (tn->child[j])
902 				tnode_free((struct tnode *)tn->child[j]);
903 
904 		tnode_free(tn);
905 
906 		return ERR_PTR(-ENOMEM);
907 	}
908 }
909 
910 /* readside must use rcu_read_lock currently dump routines
911  via get_fa_head and dump */
912 
913 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
914 {
915 	struct hlist_head *head = &l->list;
916 	struct hlist_node *node;
917 	struct leaf_info *li;
918 
919 	hlist_for_each_entry_rcu(li, node, head, hlist)
920 		if (li->plen == plen)
921 			return li;
922 
923 	return NULL;
924 }
925 
926 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
927 {
928 	struct leaf_info *li = find_leaf_info(l, plen);
929 
930 	if (!li)
931 		return NULL;
932 
933 	return &li->falh;
934 }
935 
936 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
937 {
938 	struct leaf_info *li = NULL, *last = NULL;
939 	struct hlist_node *node;
940 
941 	if (hlist_empty(head)) {
942 		hlist_add_head_rcu(&new->hlist, head);
943 	} else {
944 		hlist_for_each_entry(li, node, head, hlist) {
945 			if (new->plen > li->plen)
946 				break;
947 
948 			last = li;
949 		}
950 		if (last)
951 			hlist_add_after_rcu(&last->hlist, &new->hlist);
952 		else
953 			hlist_add_before_rcu(&new->hlist, &li->hlist);
954 	}
955 }
956 
957 /* rcu_read_lock needs to be hold by caller from readside */
958 
959 static struct leaf *
960 fib_find_node(struct trie *t, u32 key)
961 {
962 	int pos;
963 	struct tnode *tn;
964 	struct node *n;
965 
966 	pos = 0;
967 	n = rcu_dereference_check(t->trie,
968 				  rcu_read_lock_held() ||
969 				  lockdep_rtnl_is_held());
970 
971 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
972 		tn = (struct tnode *) n;
973 
974 		check_tnode(tn);
975 
976 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
977 			pos = tn->pos + tn->bits;
978 			n = tnode_get_child_rcu(tn,
979 						tkey_extract_bits(key,
980 								  tn->pos,
981 								  tn->bits));
982 		} else
983 			break;
984 	}
985 	/* Case we have found a leaf. Compare prefixes */
986 
987 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
988 		return (struct leaf *)n;
989 
990 	return NULL;
991 }
992 
993 static void trie_rebalance(struct trie *t, struct tnode *tn)
994 {
995 	int wasfull;
996 	t_key cindex, key;
997 	struct tnode *tp;
998 
999 	key = tn->key;
1000 
1001 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1002 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1003 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1004 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1005 
1006 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1007 				      (struct node *)tn, wasfull);
1008 
1009 		tp = node_parent((struct node *) tn);
1010 		if (!tp)
1011 			rcu_assign_pointer(t->trie, (struct node *)tn);
1012 
1013 		tnode_free_flush();
1014 		if (!tp)
1015 			break;
1016 		tn = tp;
1017 	}
1018 
1019 	/* Handle last (top) tnode */
1020 	if (IS_TNODE(tn))
1021 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1022 
1023 	rcu_assign_pointer(t->trie, (struct node *)tn);
1024 	tnode_free_flush();
1025 
1026 	return;
1027 }
1028 
1029 /* only used from updater-side */
1030 
1031 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1032 {
1033 	int pos, newpos;
1034 	struct tnode *tp = NULL, *tn = NULL;
1035 	struct node *n;
1036 	struct leaf *l;
1037 	int missbit;
1038 	struct list_head *fa_head = NULL;
1039 	struct leaf_info *li;
1040 	t_key cindex;
1041 
1042 	pos = 0;
1043 	n = t->trie;
1044 
1045 	/* If we point to NULL, stop. Either the tree is empty and we should
1046 	 * just put a new leaf in if, or we have reached an empty child slot,
1047 	 * and we should just put our new leaf in that.
1048 	 * If we point to a T_TNODE, check if it matches our key. Note that
1049 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1050 	 * not be the parent's 'pos'+'bits'!
1051 	 *
1052 	 * If it does match the current key, get pos/bits from it, extract
1053 	 * the index from our key, push the T_TNODE and walk the tree.
1054 	 *
1055 	 * If it doesn't, we have to replace it with a new T_TNODE.
1056 	 *
1057 	 * If we point to a T_LEAF, it might or might not have the same key
1058 	 * as we do. If it does, just change the value, update the T_LEAF's
1059 	 * value, and return it.
1060 	 * If it doesn't, we need to replace it with a T_TNODE.
1061 	 */
1062 
1063 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1064 		tn = (struct tnode *) n;
1065 
1066 		check_tnode(tn);
1067 
1068 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1069 			tp = tn;
1070 			pos = tn->pos + tn->bits;
1071 			n = tnode_get_child(tn,
1072 					    tkey_extract_bits(key,
1073 							      tn->pos,
1074 							      tn->bits));
1075 
1076 			BUG_ON(n && node_parent(n) != tn);
1077 		} else
1078 			break;
1079 	}
1080 
1081 	/*
1082 	 * n  ----> NULL, LEAF or TNODE
1083 	 *
1084 	 * tp is n's (parent) ----> NULL or TNODE
1085 	 */
1086 
1087 	BUG_ON(tp && IS_LEAF(tp));
1088 
1089 	/* Case 1: n is a leaf. Compare prefixes */
1090 
1091 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1092 		l = (struct leaf *) n;
1093 		li = leaf_info_new(plen);
1094 
1095 		if (!li)
1096 			return NULL;
1097 
1098 		fa_head = &li->falh;
1099 		insert_leaf_info(&l->list, li);
1100 		goto done;
1101 	}
1102 	l = leaf_new();
1103 
1104 	if (!l)
1105 		return NULL;
1106 
1107 	l->key = key;
1108 	li = leaf_info_new(plen);
1109 
1110 	if (!li) {
1111 		free_leaf(l);
1112 		return NULL;
1113 	}
1114 
1115 	fa_head = &li->falh;
1116 	insert_leaf_info(&l->list, li);
1117 
1118 	if (t->trie && n == NULL) {
1119 		/* Case 2: n is NULL, and will just insert a new leaf */
1120 
1121 		node_set_parent((struct node *)l, tp);
1122 
1123 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1125 	} else {
1126 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127 		/*
1128 		 *  Add a new tnode here
1129 		 *  first tnode need some special handling
1130 		 */
1131 
1132 		if (tp)
1133 			pos = tp->pos+tp->bits;
1134 		else
1135 			pos = 0;
1136 
1137 		if (n) {
1138 			newpos = tkey_mismatch(key, pos, n->key);
1139 			tn = tnode_new(n->key, newpos, 1);
1140 		} else {
1141 			newpos = 0;
1142 			tn = tnode_new(key, newpos, 1); /* First tnode */
1143 		}
1144 
1145 		if (!tn) {
1146 			free_leaf_info(li);
1147 			free_leaf(l);
1148 			return NULL;
1149 		}
1150 
1151 		node_set_parent((struct node *)tn, tp);
1152 
1153 		missbit = tkey_extract_bits(key, newpos, 1);
1154 		put_child(t, tn, missbit, (struct node *)l);
1155 		put_child(t, tn, 1-missbit, n);
1156 
1157 		if (tp) {
1158 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1159 			put_child(t, (struct tnode *)tp, cindex,
1160 				  (struct node *)tn);
1161 		} else {
1162 			rcu_assign_pointer(t->trie, (struct node *)tn);
1163 			tp = tn;
1164 		}
1165 	}
1166 
1167 	if (tp && tp->pos + tp->bits > 32)
1168 		pr_warning("fib_trie"
1169 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1170 			   tp, tp->pos, tp->bits, key, plen);
1171 
1172 	/* Rebalance the trie */
1173 
1174 	trie_rebalance(t, tp);
1175 done:
1176 	return fa_head;
1177 }
1178 
1179 /*
1180  * Caller must hold RTNL.
1181  */
1182 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1183 {
1184 	struct trie *t = (struct trie *) tb->tb_data;
1185 	struct fib_alias *fa, *new_fa;
1186 	struct list_head *fa_head = NULL;
1187 	struct fib_info *fi;
1188 	int plen = cfg->fc_dst_len;
1189 	u8 tos = cfg->fc_tos;
1190 	u32 key, mask;
1191 	int err;
1192 	struct leaf *l;
1193 
1194 	if (plen > 32)
1195 		return -EINVAL;
1196 
1197 	key = ntohl(cfg->fc_dst);
1198 
1199 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1200 
1201 	mask = ntohl(inet_make_mask(plen));
1202 
1203 	if (key & ~mask)
1204 		return -EINVAL;
1205 
1206 	key = key & mask;
1207 
1208 	fi = fib_create_info(cfg);
1209 	if (IS_ERR(fi)) {
1210 		err = PTR_ERR(fi);
1211 		goto err;
1212 	}
1213 
1214 	l = fib_find_node(t, key);
1215 	fa = NULL;
1216 
1217 	if (l) {
1218 		fa_head = get_fa_head(l, plen);
1219 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1220 	}
1221 
1222 	/* Now fa, if non-NULL, points to the first fib alias
1223 	 * with the same keys [prefix,tos,priority], if such key already
1224 	 * exists or to the node before which we will insert new one.
1225 	 *
1226 	 * If fa is NULL, we will need to allocate a new one and
1227 	 * insert to the head of f.
1228 	 *
1229 	 * If f is NULL, no fib node matched the destination key
1230 	 * and we need to allocate a new one of those as well.
1231 	 */
1232 
1233 	if (fa && fa->fa_tos == tos &&
1234 	    fa->fa_info->fib_priority == fi->fib_priority) {
1235 		struct fib_alias *fa_first, *fa_match;
1236 
1237 		err = -EEXIST;
1238 		if (cfg->fc_nlflags & NLM_F_EXCL)
1239 			goto out;
1240 
1241 		/* We have 2 goals:
1242 		 * 1. Find exact match for type, scope, fib_info to avoid
1243 		 * duplicate routes
1244 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1245 		 */
1246 		fa_match = NULL;
1247 		fa_first = fa;
1248 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1249 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1250 			if (fa->fa_tos != tos)
1251 				break;
1252 			if (fa->fa_info->fib_priority != fi->fib_priority)
1253 				break;
1254 			if (fa->fa_type == cfg->fc_type &&
1255 			    fa->fa_scope == cfg->fc_scope &&
1256 			    fa->fa_info == fi) {
1257 				fa_match = fa;
1258 				break;
1259 			}
1260 		}
1261 
1262 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263 			struct fib_info *fi_drop;
1264 			u8 state;
1265 
1266 			fa = fa_first;
1267 			if (fa_match) {
1268 				if (fa == fa_match)
1269 					err = 0;
1270 				goto out;
1271 			}
1272 			err = -ENOBUFS;
1273 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1274 			if (new_fa == NULL)
1275 				goto out;
1276 
1277 			fi_drop = fa->fa_info;
1278 			new_fa->fa_tos = fa->fa_tos;
1279 			new_fa->fa_info = fi;
1280 			new_fa->fa_type = cfg->fc_type;
1281 			new_fa->fa_scope = cfg->fc_scope;
1282 			state = fa->fa_state;
1283 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1284 
1285 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1286 			alias_free_mem_rcu(fa);
1287 
1288 			fib_release_info(fi_drop);
1289 			if (state & FA_S_ACCESSED)
1290 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1291 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1292 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1293 
1294 			goto succeeded;
1295 		}
1296 		/* Error if we find a perfect match which
1297 		 * uses the same scope, type, and nexthop
1298 		 * information.
1299 		 */
1300 		if (fa_match)
1301 			goto out;
1302 
1303 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1304 			fa = fa_first;
1305 	}
1306 	err = -ENOENT;
1307 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1308 		goto out;
1309 
1310 	err = -ENOBUFS;
1311 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1312 	if (new_fa == NULL)
1313 		goto out;
1314 
1315 	new_fa->fa_info = fi;
1316 	new_fa->fa_tos = tos;
1317 	new_fa->fa_type = cfg->fc_type;
1318 	new_fa->fa_scope = cfg->fc_scope;
1319 	new_fa->fa_state = 0;
1320 	/*
1321 	 * Insert new entry to the list.
1322 	 */
1323 
1324 	if (!fa_head) {
1325 		fa_head = fib_insert_node(t, key, plen);
1326 		if (unlikely(!fa_head)) {
1327 			err = -ENOMEM;
1328 			goto out_free_new_fa;
1329 		}
1330 	}
1331 
1332 	list_add_tail_rcu(&new_fa->fa_list,
1333 			  (fa ? &fa->fa_list : fa_head));
1334 
1335 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1336 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1337 		  &cfg->fc_nlinfo, 0);
1338 succeeded:
1339 	return 0;
1340 
1341 out_free_new_fa:
1342 	kmem_cache_free(fn_alias_kmem, new_fa);
1343 out:
1344 	fib_release_info(fi);
1345 err:
1346 	return err;
1347 }
1348 
1349 /* should be called with rcu_read_lock */
1350 static int check_leaf(struct trie *t, struct leaf *l,
1351 		      t_key key,  const struct flowi *flp,
1352 		      struct fib_result *res)
1353 {
1354 	struct leaf_info *li;
1355 	struct hlist_head *hhead = &l->list;
1356 	struct hlist_node *node;
1357 
1358 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1359 		int err;
1360 		int plen = li->plen;
1361 		__be32 mask = inet_make_mask(plen);
1362 
1363 		if (l->key != (key & ntohl(mask)))
1364 			continue;
1365 
1366 		err = fib_semantic_match(&li->falh, flp, res, plen);
1367 
1368 #ifdef CONFIG_IP_FIB_TRIE_STATS
1369 		if (err <= 0)
1370 			t->stats.semantic_match_passed++;
1371 		else
1372 			t->stats.semantic_match_miss++;
1373 #endif
1374 		if (err <= 0)
1375 			return err;
1376 	}
1377 
1378 	return 1;
1379 }
1380 
1381 int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1382 		     struct fib_result *res)
1383 {
1384 	struct trie *t = (struct trie *) tb->tb_data;
1385 	int ret;
1386 	struct node *n;
1387 	struct tnode *pn;
1388 	int pos, bits;
1389 	t_key key = ntohl(flp->fl4_dst);
1390 	int chopped_off;
1391 	t_key cindex = 0;
1392 	int current_prefix_length = KEYLENGTH;
1393 	struct tnode *cn;
1394 	t_key node_prefix, key_prefix, pref_mismatch;
1395 	int mp;
1396 
1397 	rcu_read_lock();
1398 
1399 	n = rcu_dereference(t->trie);
1400 	if (!n)
1401 		goto failed;
1402 
1403 #ifdef CONFIG_IP_FIB_TRIE_STATS
1404 	t->stats.gets++;
1405 #endif
1406 
1407 	/* Just a leaf? */
1408 	if (IS_LEAF(n)) {
1409 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1410 		goto found;
1411 	}
1412 
1413 	pn = (struct tnode *) n;
1414 	chopped_off = 0;
1415 
1416 	while (pn) {
1417 		pos = pn->pos;
1418 		bits = pn->bits;
1419 
1420 		if (!chopped_off)
1421 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1422 						   pos, bits);
1423 
1424 		n = tnode_get_child_rcu(pn, cindex);
1425 
1426 		if (n == NULL) {
1427 #ifdef CONFIG_IP_FIB_TRIE_STATS
1428 			t->stats.null_node_hit++;
1429 #endif
1430 			goto backtrace;
1431 		}
1432 
1433 		if (IS_LEAF(n)) {
1434 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1435 			if (ret > 0)
1436 				goto backtrace;
1437 			goto found;
1438 		}
1439 
1440 		cn = (struct tnode *)n;
1441 
1442 		/*
1443 		 * It's a tnode, and we can do some extra checks here if we
1444 		 * like, to avoid descending into a dead-end branch.
1445 		 * This tnode is in the parent's child array at index
1446 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1447 		 * chopped off, so in reality the index may be just a
1448 		 * subprefix, padded with zero at the end.
1449 		 * We can also take a look at any skipped bits in this
1450 		 * tnode - everything up to p_pos is supposed to be ok,
1451 		 * and the non-chopped bits of the index (se previous
1452 		 * paragraph) are also guaranteed ok, but the rest is
1453 		 * considered unknown.
1454 		 *
1455 		 * The skipped bits are key[pos+bits..cn->pos].
1456 		 */
1457 
1458 		/* If current_prefix_length < pos+bits, we are already doing
1459 		 * actual prefix  matching, which means everything from
1460 		 * pos+(bits-chopped_off) onward must be zero along some
1461 		 * branch of this subtree - otherwise there is *no* valid
1462 		 * prefix present. Here we can only check the skipped
1463 		 * bits. Remember, since we have already indexed into the
1464 		 * parent's child array, we know that the bits we chopped of
1465 		 * *are* zero.
1466 		 */
1467 
1468 		/* NOTA BENE: Checking only skipped bits
1469 		   for the new node here */
1470 
1471 		if (current_prefix_length < pos+bits) {
1472 			if (tkey_extract_bits(cn->key, current_prefix_length,
1473 						cn->pos - current_prefix_length)
1474 			    || !(cn->child[0]))
1475 				goto backtrace;
1476 		}
1477 
1478 		/*
1479 		 * If chopped_off=0, the index is fully validated and we
1480 		 * only need to look at the skipped bits for this, the new,
1481 		 * tnode. What we actually want to do is to find out if
1482 		 * these skipped bits match our key perfectly, or if we will
1483 		 * have to count on finding a matching prefix further down,
1484 		 * because if we do, we would like to have some way of
1485 		 * verifying the existence of such a prefix at this point.
1486 		 */
1487 
1488 		/* The only thing we can do at this point is to verify that
1489 		 * any such matching prefix can indeed be a prefix to our
1490 		 * key, and if the bits in the node we are inspecting that
1491 		 * do not match our key are not ZERO, this cannot be true.
1492 		 * Thus, find out where there is a mismatch (before cn->pos)
1493 		 * and verify that all the mismatching bits are zero in the
1494 		 * new tnode's key.
1495 		 */
1496 
1497 		/*
1498 		 * Note: We aren't very concerned about the piece of
1499 		 * the key that precede pn->pos+pn->bits, since these
1500 		 * have already been checked. The bits after cn->pos
1501 		 * aren't checked since these are by definition
1502 		 * "unknown" at this point. Thus, what we want to see
1503 		 * is if we are about to enter the "prefix matching"
1504 		 * state, and in that case verify that the skipped
1505 		 * bits that will prevail throughout this subtree are
1506 		 * zero, as they have to be if we are to find a
1507 		 * matching prefix.
1508 		 */
1509 
1510 		node_prefix = mask_pfx(cn->key, cn->pos);
1511 		key_prefix = mask_pfx(key, cn->pos);
1512 		pref_mismatch = key_prefix^node_prefix;
1513 		mp = 0;
1514 
1515 		/*
1516 		 * In short: If skipped bits in this node do not match
1517 		 * the search key, enter the "prefix matching"
1518 		 * state.directly.
1519 		 */
1520 		if (pref_mismatch) {
1521 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1522 				mp++;
1523 				pref_mismatch = pref_mismatch << 1;
1524 			}
1525 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526 
1527 			if (key_prefix != 0)
1528 				goto backtrace;
1529 
1530 			if (current_prefix_length >= cn->pos)
1531 				current_prefix_length = mp;
1532 		}
1533 
1534 		pn = (struct tnode *)n; /* Descend */
1535 		chopped_off = 0;
1536 		continue;
1537 
1538 backtrace:
1539 		chopped_off++;
1540 
1541 		/* As zero don't change the child key (cindex) */
1542 		while ((chopped_off <= pn->bits)
1543 		       && !(cindex & (1<<(chopped_off-1))))
1544 			chopped_off++;
1545 
1546 		/* Decrease current_... with bits chopped off */
1547 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1548 			current_prefix_length = pn->pos + pn->bits
1549 				- chopped_off;
1550 
1551 		/*
1552 		 * Either we do the actual chop off according or if we have
1553 		 * chopped off all bits in this tnode walk up to our parent.
1554 		 */
1555 
1556 		if (chopped_off <= pn->bits) {
1557 			cindex &= ~(1 << (chopped_off-1));
1558 		} else {
1559 			struct tnode *parent = node_parent_rcu((struct node *) pn);
1560 			if (!parent)
1561 				goto failed;
1562 
1563 			/* Get Child's index */
1564 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1565 			pn = parent;
1566 			chopped_off = 0;
1567 
1568 #ifdef CONFIG_IP_FIB_TRIE_STATS
1569 			t->stats.backtrack++;
1570 #endif
1571 			goto backtrace;
1572 		}
1573 	}
1574 failed:
1575 	ret = 1;
1576 found:
1577 	rcu_read_unlock();
1578 	return ret;
1579 }
1580 
1581 /*
1582  * Remove the leaf and return parent.
1583  */
1584 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1585 {
1586 	struct tnode *tp = node_parent((struct node *) l);
1587 
1588 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1589 
1590 	if (tp) {
1591 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1592 		put_child(t, (struct tnode *)tp, cindex, NULL);
1593 		trie_rebalance(t, tp);
1594 	} else
1595 		rcu_assign_pointer(t->trie, NULL);
1596 
1597 	free_leaf(l);
1598 }
1599 
1600 /*
1601  * Caller must hold RTNL.
1602  */
1603 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1604 {
1605 	struct trie *t = (struct trie *) tb->tb_data;
1606 	u32 key, mask;
1607 	int plen = cfg->fc_dst_len;
1608 	u8 tos = cfg->fc_tos;
1609 	struct fib_alias *fa, *fa_to_delete;
1610 	struct list_head *fa_head;
1611 	struct leaf *l;
1612 	struct leaf_info *li;
1613 
1614 	if (plen > 32)
1615 		return -EINVAL;
1616 
1617 	key = ntohl(cfg->fc_dst);
1618 	mask = ntohl(inet_make_mask(plen));
1619 
1620 	if (key & ~mask)
1621 		return -EINVAL;
1622 
1623 	key = key & mask;
1624 	l = fib_find_node(t, key);
1625 
1626 	if (!l)
1627 		return -ESRCH;
1628 
1629 	fa_head = get_fa_head(l, plen);
1630 	fa = fib_find_alias(fa_head, tos, 0);
1631 
1632 	if (!fa)
1633 		return -ESRCH;
1634 
1635 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1636 
1637 	fa_to_delete = NULL;
1638 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1639 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1640 		struct fib_info *fi = fa->fa_info;
1641 
1642 		if (fa->fa_tos != tos)
1643 			break;
1644 
1645 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1646 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1647 		     fa->fa_scope == cfg->fc_scope) &&
1648 		    (!cfg->fc_protocol ||
1649 		     fi->fib_protocol == cfg->fc_protocol) &&
1650 		    fib_nh_match(cfg, fi) == 0) {
1651 			fa_to_delete = fa;
1652 			break;
1653 		}
1654 	}
1655 
1656 	if (!fa_to_delete)
1657 		return -ESRCH;
1658 
1659 	fa = fa_to_delete;
1660 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1661 		  &cfg->fc_nlinfo, 0);
1662 
1663 	l = fib_find_node(t, key);
1664 	li = find_leaf_info(l, plen);
1665 
1666 	list_del_rcu(&fa->fa_list);
1667 
1668 	if (list_empty(fa_head)) {
1669 		hlist_del_rcu(&li->hlist);
1670 		free_leaf_info(li);
1671 	}
1672 
1673 	if (hlist_empty(&l->list))
1674 		trie_leaf_remove(t, l);
1675 
1676 	if (fa->fa_state & FA_S_ACCESSED)
1677 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1678 
1679 	fib_release_info(fa->fa_info);
1680 	alias_free_mem_rcu(fa);
1681 	return 0;
1682 }
1683 
1684 static int trie_flush_list(struct list_head *head)
1685 {
1686 	struct fib_alias *fa, *fa_node;
1687 	int found = 0;
1688 
1689 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1690 		struct fib_info *fi = fa->fa_info;
1691 
1692 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1693 			list_del_rcu(&fa->fa_list);
1694 			fib_release_info(fa->fa_info);
1695 			alias_free_mem_rcu(fa);
1696 			found++;
1697 		}
1698 	}
1699 	return found;
1700 }
1701 
1702 static int trie_flush_leaf(struct leaf *l)
1703 {
1704 	int found = 0;
1705 	struct hlist_head *lih = &l->list;
1706 	struct hlist_node *node, *tmp;
1707 	struct leaf_info *li = NULL;
1708 
1709 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1710 		found += trie_flush_list(&li->falh);
1711 
1712 		if (list_empty(&li->falh)) {
1713 			hlist_del_rcu(&li->hlist);
1714 			free_leaf_info(li);
1715 		}
1716 	}
1717 	return found;
1718 }
1719 
1720 /*
1721  * Scan for the next right leaf starting at node p->child[idx]
1722  * Since we have back pointer, no recursion necessary.
1723  */
1724 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1725 {
1726 	do {
1727 		t_key idx;
1728 
1729 		if (c)
1730 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1731 		else
1732 			idx = 0;
1733 
1734 		while (idx < 1u << p->bits) {
1735 			c = tnode_get_child_rcu(p, idx++);
1736 			if (!c)
1737 				continue;
1738 
1739 			if (IS_LEAF(c)) {
1740 				prefetch(p->child[idx]);
1741 				return (struct leaf *) c;
1742 			}
1743 
1744 			/* Rescan start scanning in new node */
1745 			p = (struct tnode *) c;
1746 			idx = 0;
1747 		}
1748 
1749 		/* Node empty, walk back up to parent */
1750 		c = (struct node *) p;
1751 	} while ( (p = node_parent_rcu(c)) != NULL);
1752 
1753 	return NULL; /* Root of trie */
1754 }
1755 
1756 static struct leaf *trie_firstleaf(struct trie *t)
1757 {
1758 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1759 
1760 	if (!n)
1761 		return NULL;
1762 
1763 	if (IS_LEAF(n))          /* trie is just a leaf */
1764 		return (struct leaf *) n;
1765 
1766 	return leaf_walk_rcu(n, NULL);
1767 }
1768 
1769 static struct leaf *trie_nextleaf(struct leaf *l)
1770 {
1771 	struct node *c = (struct node *) l;
1772 	struct tnode *p = node_parent_rcu(c);
1773 
1774 	if (!p)
1775 		return NULL;	/* trie with just one leaf */
1776 
1777 	return leaf_walk_rcu(p, c);
1778 }
1779 
1780 static struct leaf *trie_leafindex(struct trie *t, int index)
1781 {
1782 	struct leaf *l = trie_firstleaf(t);
1783 
1784 	while (l && index-- > 0)
1785 		l = trie_nextleaf(l);
1786 
1787 	return l;
1788 }
1789 
1790 
1791 /*
1792  * Caller must hold RTNL.
1793  */
1794 int fib_table_flush(struct fib_table *tb)
1795 {
1796 	struct trie *t = (struct trie *) tb->tb_data;
1797 	struct leaf *l, *ll = NULL;
1798 	int found = 0;
1799 
1800 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1801 		found += trie_flush_leaf(l);
1802 
1803 		if (ll && hlist_empty(&ll->list))
1804 			trie_leaf_remove(t, ll);
1805 		ll = l;
1806 	}
1807 
1808 	if (ll && hlist_empty(&ll->list))
1809 		trie_leaf_remove(t, ll);
1810 
1811 	pr_debug("trie_flush found=%d\n", found);
1812 	return found;
1813 }
1814 
1815 void fib_table_select_default(struct fib_table *tb,
1816 			      const struct flowi *flp,
1817 			      struct fib_result *res)
1818 {
1819 	struct trie *t = (struct trie *) tb->tb_data;
1820 	int order, last_idx;
1821 	struct fib_info *fi = NULL;
1822 	struct fib_info *last_resort;
1823 	struct fib_alias *fa = NULL;
1824 	struct list_head *fa_head;
1825 	struct leaf *l;
1826 
1827 	last_idx = -1;
1828 	last_resort = NULL;
1829 	order = -1;
1830 
1831 	rcu_read_lock();
1832 
1833 	l = fib_find_node(t, 0);
1834 	if (!l)
1835 		goto out;
1836 
1837 	fa_head = get_fa_head(l, 0);
1838 	if (!fa_head)
1839 		goto out;
1840 
1841 	if (list_empty(fa_head))
1842 		goto out;
1843 
1844 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1845 		struct fib_info *next_fi = fa->fa_info;
1846 
1847 		if (fa->fa_scope != res->scope ||
1848 		    fa->fa_type != RTN_UNICAST)
1849 			continue;
1850 
1851 		if (next_fi->fib_priority > res->fi->fib_priority)
1852 			break;
1853 		if (!next_fi->fib_nh[0].nh_gw ||
1854 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1855 			continue;
1856 		fa->fa_state |= FA_S_ACCESSED;
1857 
1858 		if (fi == NULL) {
1859 			if (next_fi != res->fi)
1860 				break;
1861 		} else if (!fib_detect_death(fi, order, &last_resort,
1862 					     &last_idx, tb->tb_default)) {
1863 			fib_result_assign(res, fi);
1864 			tb->tb_default = order;
1865 			goto out;
1866 		}
1867 		fi = next_fi;
1868 		order++;
1869 	}
1870 	if (order <= 0 || fi == NULL) {
1871 		tb->tb_default = -1;
1872 		goto out;
1873 	}
1874 
1875 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1876 				tb->tb_default)) {
1877 		fib_result_assign(res, fi);
1878 		tb->tb_default = order;
1879 		goto out;
1880 	}
1881 	if (last_idx >= 0)
1882 		fib_result_assign(res, last_resort);
1883 	tb->tb_default = last_idx;
1884 out:
1885 	rcu_read_unlock();
1886 }
1887 
1888 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1889 			   struct fib_table *tb,
1890 			   struct sk_buff *skb, struct netlink_callback *cb)
1891 {
1892 	int i, s_i;
1893 	struct fib_alias *fa;
1894 	__be32 xkey = htonl(key);
1895 
1896 	s_i = cb->args[5];
1897 	i = 0;
1898 
1899 	/* rcu_read_lock is hold by caller */
1900 
1901 	list_for_each_entry_rcu(fa, fah, fa_list) {
1902 		if (i < s_i) {
1903 			i++;
1904 			continue;
1905 		}
1906 
1907 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1908 				  cb->nlh->nlmsg_seq,
1909 				  RTM_NEWROUTE,
1910 				  tb->tb_id,
1911 				  fa->fa_type,
1912 				  fa->fa_scope,
1913 				  xkey,
1914 				  plen,
1915 				  fa->fa_tos,
1916 				  fa->fa_info, NLM_F_MULTI) < 0) {
1917 			cb->args[5] = i;
1918 			return -1;
1919 		}
1920 		i++;
1921 	}
1922 	cb->args[5] = i;
1923 	return skb->len;
1924 }
1925 
1926 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1927 			struct sk_buff *skb, struct netlink_callback *cb)
1928 {
1929 	struct leaf_info *li;
1930 	struct hlist_node *node;
1931 	int i, s_i;
1932 
1933 	s_i = cb->args[4];
1934 	i = 0;
1935 
1936 	/* rcu_read_lock is hold by caller */
1937 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1938 		if (i < s_i) {
1939 			i++;
1940 			continue;
1941 		}
1942 
1943 		if (i > s_i)
1944 			cb->args[5] = 0;
1945 
1946 		if (list_empty(&li->falh))
1947 			continue;
1948 
1949 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1950 			cb->args[4] = i;
1951 			return -1;
1952 		}
1953 		i++;
1954 	}
1955 
1956 	cb->args[4] = i;
1957 	return skb->len;
1958 }
1959 
1960 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1961 		   struct netlink_callback *cb)
1962 {
1963 	struct leaf *l;
1964 	struct trie *t = (struct trie *) tb->tb_data;
1965 	t_key key = cb->args[2];
1966 	int count = cb->args[3];
1967 
1968 	rcu_read_lock();
1969 	/* Dump starting at last key.
1970 	 * Note: 0.0.0.0/0 (ie default) is first key.
1971 	 */
1972 	if (count == 0)
1973 		l = trie_firstleaf(t);
1974 	else {
1975 		/* Normally, continue from last key, but if that is missing
1976 		 * fallback to using slow rescan
1977 		 */
1978 		l = fib_find_node(t, key);
1979 		if (!l)
1980 			l = trie_leafindex(t, count);
1981 	}
1982 
1983 	while (l) {
1984 		cb->args[2] = l->key;
1985 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1986 			cb->args[3] = count;
1987 			rcu_read_unlock();
1988 			return -1;
1989 		}
1990 
1991 		++count;
1992 		l = trie_nextleaf(l);
1993 		memset(&cb->args[4], 0,
1994 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1995 	}
1996 	cb->args[3] = count;
1997 	rcu_read_unlock();
1998 
1999 	return skb->len;
2000 }
2001 
2002 void __init fib_hash_init(void)
2003 {
2004 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2005 					  sizeof(struct fib_alias),
2006 					  0, SLAB_PANIC, NULL);
2007 
2008 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2009 					   max(sizeof(struct leaf),
2010 					       sizeof(struct leaf_info)),
2011 					   0, SLAB_PANIC, NULL);
2012 }
2013 
2014 
2015 /* Fix more generic FIB names for init later */
2016 struct fib_table *fib_hash_table(u32 id)
2017 {
2018 	struct fib_table *tb;
2019 	struct trie *t;
2020 
2021 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2022 		     GFP_KERNEL);
2023 	if (tb == NULL)
2024 		return NULL;
2025 
2026 	tb->tb_id = id;
2027 	tb->tb_default = -1;
2028 
2029 	t = (struct trie *) tb->tb_data;
2030 	memset(t, 0, sizeof(*t));
2031 
2032 	if (id == RT_TABLE_LOCAL)
2033 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2034 
2035 	return tb;
2036 }
2037 
2038 #ifdef CONFIG_PROC_FS
2039 /* Depth first Trie walk iterator */
2040 struct fib_trie_iter {
2041 	struct seq_net_private p;
2042 	struct fib_table *tb;
2043 	struct tnode *tnode;
2044 	unsigned index;
2045 	unsigned depth;
2046 };
2047 
2048 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2049 {
2050 	struct tnode *tn = iter->tnode;
2051 	unsigned cindex = iter->index;
2052 	struct tnode *p;
2053 
2054 	/* A single entry routing table */
2055 	if (!tn)
2056 		return NULL;
2057 
2058 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2059 		 iter->tnode, iter->index, iter->depth);
2060 rescan:
2061 	while (cindex < (1<<tn->bits)) {
2062 		struct node *n = tnode_get_child_rcu(tn, cindex);
2063 
2064 		if (n) {
2065 			if (IS_LEAF(n)) {
2066 				iter->tnode = tn;
2067 				iter->index = cindex + 1;
2068 			} else {
2069 				/* push down one level */
2070 				iter->tnode = (struct tnode *) n;
2071 				iter->index = 0;
2072 				++iter->depth;
2073 			}
2074 			return n;
2075 		}
2076 
2077 		++cindex;
2078 	}
2079 
2080 	/* Current node exhausted, pop back up */
2081 	p = node_parent_rcu((struct node *)tn);
2082 	if (p) {
2083 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2084 		tn = p;
2085 		--iter->depth;
2086 		goto rescan;
2087 	}
2088 
2089 	/* got root? */
2090 	return NULL;
2091 }
2092 
2093 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2094 				       struct trie *t)
2095 {
2096 	struct node *n;
2097 
2098 	if (!t)
2099 		return NULL;
2100 
2101 	n = rcu_dereference(t->trie);
2102 	if (!n)
2103 		return NULL;
2104 
2105 	if (IS_TNODE(n)) {
2106 		iter->tnode = (struct tnode *) n;
2107 		iter->index = 0;
2108 		iter->depth = 1;
2109 	} else {
2110 		iter->tnode = NULL;
2111 		iter->index = 0;
2112 		iter->depth = 0;
2113 	}
2114 
2115 	return n;
2116 }
2117 
2118 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2119 {
2120 	struct node *n;
2121 	struct fib_trie_iter iter;
2122 
2123 	memset(s, 0, sizeof(*s));
2124 
2125 	rcu_read_lock();
2126 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2127 		if (IS_LEAF(n)) {
2128 			struct leaf *l = (struct leaf *)n;
2129 			struct leaf_info *li;
2130 			struct hlist_node *tmp;
2131 
2132 			s->leaves++;
2133 			s->totdepth += iter.depth;
2134 			if (iter.depth > s->maxdepth)
2135 				s->maxdepth = iter.depth;
2136 
2137 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2138 				++s->prefixes;
2139 		} else {
2140 			const struct tnode *tn = (const struct tnode *) n;
2141 			int i;
2142 
2143 			s->tnodes++;
2144 			if (tn->bits < MAX_STAT_DEPTH)
2145 				s->nodesizes[tn->bits]++;
2146 
2147 			for (i = 0; i < (1<<tn->bits); i++)
2148 				if (!tn->child[i])
2149 					s->nullpointers++;
2150 		}
2151 	}
2152 	rcu_read_unlock();
2153 }
2154 
2155 /*
2156  *	This outputs /proc/net/fib_triestats
2157  */
2158 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2159 {
2160 	unsigned i, max, pointers, bytes, avdepth;
2161 
2162 	if (stat->leaves)
2163 		avdepth = stat->totdepth*100 / stat->leaves;
2164 	else
2165 		avdepth = 0;
2166 
2167 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2168 		   avdepth / 100, avdepth % 100);
2169 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2170 
2171 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2172 	bytes = sizeof(struct leaf) * stat->leaves;
2173 
2174 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2175 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2176 
2177 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2178 	bytes += sizeof(struct tnode) * stat->tnodes;
2179 
2180 	max = MAX_STAT_DEPTH;
2181 	while (max > 0 && stat->nodesizes[max-1] == 0)
2182 		max--;
2183 
2184 	pointers = 0;
2185 	for (i = 1; i <= max; i++)
2186 		if (stat->nodesizes[i] != 0) {
2187 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2188 			pointers += (1<<i) * stat->nodesizes[i];
2189 		}
2190 	seq_putc(seq, '\n');
2191 	seq_printf(seq, "\tPointers: %u\n", pointers);
2192 
2193 	bytes += sizeof(struct node *) * pointers;
2194 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2195 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2196 }
2197 
2198 #ifdef CONFIG_IP_FIB_TRIE_STATS
2199 static void trie_show_usage(struct seq_file *seq,
2200 			    const struct trie_use_stats *stats)
2201 {
2202 	seq_printf(seq, "\nCounters:\n---------\n");
2203 	seq_printf(seq, "gets = %u\n", stats->gets);
2204 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2205 	seq_printf(seq, "semantic match passed = %u\n",
2206 		   stats->semantic_match_passed);
2207 	seq_printf(seq, "semantic match miss = %u\n",
2208 		   stats->semantic_match_miss);
2209 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2210 	seq_printf(seq, "skipped node resize = %u\n\n",
2211 		   stats->resize_node_skipped);
2212 }
2213 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2214 
2215 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2216 {
2217 	if (tb->tb_id == RT_TABLE_LOCAL)
2218 		seq_puts(seq, "Local:\n");
2219 	else if (tb->tb_id == RT_TABLE_MAIN)
2220 		seq_puts(seq, "Main:\n");
2221 	else
2222 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2223 }
2224 
2225 
2226 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2227 {
2228 	struct net *net = (struct net *)seq->private;
2229 	unsigned int h;
2230 
2231 	seq_printf(seq,
2232 		   "Basic info: size of leaf:"
2233 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2234 		   sizeof(struct leaf), sizeof(struct tnode));
2235 
2236 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2237 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2238 		struct hlist_node *node;
2239 		struct fib_table *tb;
2240 
2241 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2242 			struct trie *t = (struct trie *) tb->tb_data;
2243 			struct trie_stat stat;
2244 
2245 			if (!t)
2246 				continue;
2247 
2248 			fib_table_print(seq, tb);
2249 
2250 			trie_collect_stats(t, &stat);
2251 			trie_show_stats(seq, &stat);
2252 #ifdef CONFIG_IP_FIB_TRIE_STATS
2253 			trie_show_usage(seq, &t->stats);
2254 #endif
2255 		}
2256 	}
2257 
2258 	return 0;
2259 }
2260 
2261 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2262 {
2263 	return single_open_net(inode, file, fib_triestat_seq_show);
2264 }
2265 
2266 static const struct file_operations fib_triestat_fops = {
2267 	.owner	= THIS_MODULE,
2268 	.open	= fib_triestat_seq_open,
2269 	.read	= seq_read,
2270 	.llseek	= seq_lseek,
2271 	.release = single_release_net,
2272 };
2273 
2274 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2275 {
2276 	struct fib_trie_iter *iter = seq->private;
2277 	struct net *net = seq_file_net(seq);
2278 	loff_t idx = 0;
2279 	unsigned int h;
2280 
2281 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2282 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2283 		struct hlist_node *node;
2284 		struct fib_table *tb;
2285 
2286 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2287 			struct node *n;
2288 
2289 			for (n = fib_trie_get_first(iter,
2290 						    (struct trie *) tb->tb_data);
2291 			     n; n = fib_trie_get_next(iter))
2292 				if (pos == idx++) {
2293 					iter->tb = tb;
2294 					return n;
2295 				}
2296 		}
2297 	}
2298 
2299 	return NULL;
2300 }
2301 
2302 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2303 	__acquires(RCU)
2304 {
2305 	rcu_read_lock();
2306 	return fib_trie_get_idx(seq, *pos);
2307 }
2308 
2309 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2310 {
2311 	struct fib_trie_iter *iter = seq->private;
2312 	struct net *net = seq_file_net(seq);
2313 	struct fib_table *tb = iter->tb;
2314 	struct hlist_node *tb_node;
2315 	unsigned int h;
2316 	struct node *n;
2317 
2318 	++*pos;
2319 	/* next node in same table */
2320 	n = fib_trie_get_next(iter);
2321 	if (n)
2322 		return n;
2323 
2324 	/* walk rest of this hash chain */
2325 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2326 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2327 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2328 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2329 		if (n)
2330 			goto found;
2331 	}
2332 
2333 	/* new hash chain */
2334 	while (++h < FIB_TABLE_HASHSZ) {
2335 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2336 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2337 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2338 			if (n)
2339 				goto found;
2340 		}
2341 	}
2342 	return NULL;
2343 
2344 found:
2345 	iter->tb = tb;
2346 	return n;
2347 }
2348 
2349 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2350 	__releases(RCU)
2351 {
2352 	rcu_read_unlock();
2353 }
2354 
2355 static void seq_indent(struct seq_file *seq, int n)
2356 {
2357 	while (n-- > 0) seq_puts(seq, "   ");
2358 }
2359 
2360 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2361 {
2362 	switch (s) {
2363 	case RT_SCOPE_UNIVERSE: return "universe";
2364 	case RT_SCOPE_SITE:	return "site";
2365 	case RT_SCOPE_LINK:	return "link";
2366 	case RT_SCOPE_HOST:	return "host";
2367 	case RT_SCOPE_NOWHERE:	return "nowhere";
2368 	default:
2369 		snprintf(buf, len, "scope=%d", s);
2370 		return buf;
2371 	}
2372 }
2373 
2374 static const char *const rtn_type_names[__RTN_MAX] = {
2375 	[RTN_UNSPEC] = "UNSPEC",
2376 	[RTN_UNICAST] = "UNICAST",
2377 	[RTN_LOCAL] = "LOCAL",
2378 	[RTN_BROADCAST] = "BROADCAST",
2379 	[RTN_ANYCAST] = "ANYCAST",
2380 	[RTN_MULTICAST] = "MULTICAST",
2381 	[RTN_BLACKHOLE] = "BLACKHOLE",
2382 	[RTN_UNREACHABLE] = "UNREACHABLE",
2383 	[RTN_PROHIBIT] = "PROHIBIT",
2384 	[RTN_THROW] = "THROW",
2385 	[RTN_NAT] = "NAT",
2386 	[RTN_XRESOLVE] = "XRESOLVE",
2387 };
2388 
2389 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2390 {
2391 	if (t < __RTN_MAX && rtn_type_names[t])
2392 		return rtn_type_names[t];
2393 	snprintf(buf, len, "type %u", t);
2394 	return buf;
2395 }
2396 
2397 /* Pretty print the trie */
2398 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2399 {
2400 	const struct fib_trie_iter *iter = seq->private;
2401 	struct node *n = v;
2402 
2403 	if (!node_parent_rcu(n))
2404 		fib_table_print(seq, iter->tb);
2405 
2406 	if (IS_TNODE(n)) {
2407 		struct tnode *tn = (struct tnode *) n;
2408 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2409 
2410 		seq_indent(seq, iter->depth-1);
2411 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2412 			   &prf, tn->pos, tn->bits, tn->full_children,
2413 			   tn->empty_children);
2414 
2415 	} else {
2416 		struct leaf *l = (struct leaf *) n;
2417 		struct leaf_info *li;
2418 		struct hlist_node *node;
2419 		__be32 val = htonl(l->key);
2420 
2421 		seq_indent(seq, iter->depth);
2422 		seq_printf(seq, "  |-- %pI4\n", &val);
2423 
2424 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2425 			struct fib_alias *fa;
2426 
2427 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2428 				char buf1[32], buf2[32];
2429 
2430 				seq_indent(seq, iter->depth+1);
2431 				seq_printf(seq, "  /%d %s %s", li->plen,
2432 					   rtn_scope(buf1, sizeof(buf1),
2433 						     fa->fa_scope),
2434 					   rtn_type(buf2, sizeof(buf2),
2435 						    fa->fa_type));
2436 				if (fa->fa_tos)
2437 					seq_printf(seq, " tos=%d", fa->fa_tos);
2438 				seq_putc(seq, '\n');
2439 			}
2440 		}
2441 	}
2442 
2443 	return 0;
2444 }
2445 
2446 static const struct seq_operations fib_trie_seq_ops = {
2447 	.start  = fib_trie_seq_start,
2448 	.next   = fib_trie_seq_next,
2449 	.stop   = fib_trie_seq_stop,
2450 	.show   = fib_trie_seq_show,
2451 };
2452 
2453 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2454 {
2455 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2456 			    sizeof(struct fib_trie_iter));
2457 }
2458 
2459 static const struct file_operations fib_trie_fops = {
2460 	.owner  = THIS_MODULE,
2461 	.open   = fib_trie_seq_open,
2462 	.read   = seq_read,
2463 	.llseek = seq_lseek,
2464 	.release = seq_release_net,
2465 };
2466 
2467 struct fib_route_iter {
2468 	struct seq_net_private p;
2469 	struct trie *main_trie;
2470 	loff_t	pos;
2471 	t_key	key;
2472 };
2473 
2474 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2475 {
2476 	struct leaf *l = NULL;
2477 	struct trie *t = iter->main_trie;
2478 
2479 	/* use cache location of last found key */
2480 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2481 		pos -= iter->pos;
2482 	else {
2483 		iter->pos = 0;
2484 		l = trie_firstleaf(t);
2485 	}
2486 
2487 	while (l && pos-- > 0) {
2488 		iter->pos++;
2489 		l = trie_nextleaf(l);
2490 	}
2491 
2492 	if (l)
2493 		iter->key = pos;	/* remember it */
2494 	else
2495 		iter->pos = 0;		/* forget it */
2496 
2497 	return l;
2498 }
2499 
2500 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2501 	__acquires(RCU)
2502 {
2503 	struct fib_route_iter *iter = seq->private;
2504 	struct fib_table *tb;
2505 
2506 	rcu_read_lock();
2507 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2508 	if (!tb)
2509 		return NULL;
2510 
2511 	iter->main_trie = (struct trie *) tb->tb_data;
2512 	if (*pos == 0)
2513 		return SEQ_START_TOKEN;
2514 	else
2515 		return fib_route_get_idx(iter, *pos - 1);
2516 }
2517 
2518 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2519 {
2520 	struct fib_route_iter *iter = seq->private;
2521 	struct leaf *l = v;
2522 
2523 	++*pos;
2524 	if (v == SEQ_START_TOKEN) {
2525 		iter->pos = 0;
2526 		l = trie_firstleaf(iter->main_trie);
2527 	} else {
2528 		iter->pos++;
2529 		l = trie_nextleaf(l);
2530 	}
2531 
2532 	if (l)
2533 		iter->key = l->key;
2534 	else
2535 		iter->pos = 0;
2536 	return l;
2537 }
2538 
2539 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2540 	__releases(RCU)
2541 {
2542 	rcu_read_unlock();
2543 }
2544 
2545 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2546 {
2547 	static unsigned type2flags[RTN_MAX + 1] = {
2548 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2549 	};
2550 	unsigned flags = type2flags[type];
2551 
2552 	if (fi && fi->fib_nh->nh_gw)
2553 		flags |= RTF_GATEWAY;
2554 	if (mask == htonl(0xFFFFFFFF))
2555 		flags |= RTF_HOST;
2556 	flags |= RTF_UP;
2557 	return flags;
2558 }
2559 
2560 /*
2561  *	This outputs /proc/net/route.
2562  *	The format of the file is not supposed to be changed
2563  * 	and needs to be same as fib_hash output to avoid breaking
2564  *	legacy utilities
2565  */
2566 static int fib_route_seq_show(struct seq_file *seq, void *v)
2567 {
2568 	struct leaf *l = v;
2569 	struct leaf_info *li;
2570 	struct hlist_node *node;
2571 
2572 	if (v == SEQ_START_TOKEN) {
2573 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2574 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2575 			   "\tWindow\tIRTT");
2576 		return 0;
2577 	}
2578 
2579 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2580 		struct fib_alias *fa;
2581 		__be32 mask, prefix;
2582 
2583 		mask = inet_make_mask(li->plen);
2584 		prefix = htonl(l->key);
2585 
2586 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2587 			const struct fib_info *fi = fa->fa_info;
2588 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2589 			int len;
2590 
2591 			if (fa->fa_type == RTN_BROADCAST
2592 			    || fa->fa_type == RTN_MULTICAST)
2593 				continue;
2594 
2595 			if (fi)
2596 				seq_printf(seq,
2597 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2598 					 "%d\t%08X\t%d\t%u\t%u%n",
2599 					 fi->fib_dev ? fi->fib_dev->name : "*",
2600 					 prefix,
2601 					 fi->fib_nh->nh_gw, flags, 0, 0,
2602 					 fi->fib_priority,
2603 					 mask,
2604 					 (fi->fib_advmss ?
2605 					  fi->fib_advmss + 40 : 0),
2606 					 fi->fib_window,
2607 					 fi->fib_rtt >> 3, &len);
2608 			else
2609 				seq_printf(seq,
2610 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2611 					 "%d\t%08X\t%d\t%u\t%u%n",
2612 					 prefix, 0, flags, 0, 0, 0,
2613 					 mask, 0, 0, 0, &len);
2614 
2615 			seq_printf(seq, "%*s\n", 127 - len, "");
2616 		}
2617 	}
2618 
2619 	return 0;
2620 }
2621 
2622 static const struct seq_operations fib_route_seq_ops = {
2623 	.start  = fib_route_seq_start,
2624 	.next   = fib_route_seq_next,
2625 	.stop   = fib_route_seq_stop,
2626 	.show   = fib_route_seq_show,
2627 };
2628 
2629 static int fib_route_seq_open(struct inode *inode, struct file *file)
2630 {
2631 	return seq_open_net(inode, file, &fib_route_seq_ops,
2632 			    sizeof(struct fib_route_iter));
2633 }
2634 
2635 static const struct file_operations fib_route_fops = {
2636 	.owner  = THIS_MODULE,
2637 	.open   = fib_route_seq_open,
2638 	.read   = seq_read,
2639 	.llseek = seq_lseek,
2640 	.release = seq_release_net,
2641 };
2642 
2643 int __net_init fib_proc_init(struct net *net)
2644 {
2645 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2646 		goto out1;
2647 
2648 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2649 				  &fib_triestat_fops))
2650 		goto out2;
2651 
2652 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2653 		goto out3;
2654 
2655 	return 0;
2656 
2657 out3:
2658 	proc_net_remove(net, "fib_triestat");
2659 out2:
2660 	proc_net_remove(net, "fib_trie");
2661 out1:
2662 	return -ENOMEM;
2663 }
2664 
2665 void __net_exit fib_proc_exit(struct net *net)
2666 {
2667 	proc_net_remove(net, "fib_trie");
2668 	proc_net_remove(net, "fib_triestat");
2669 	proc_net_remove(net, "route");
2670 }
2671 
2672 #endif /* CONFIG_PROC_FS */
2673