xref: /linux/net/ipv4/fib_trie.c (revision d39d0ed196aa1685bb24771e92f78633c66ac9cb)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <linux/slab.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #define MAX_STAT_DEPTH 32
85 
86 #define KEYLENGTH (8*sizeof(t_key))
87 
88 typedef unsigned int t_key;
89 
90 #define T_TNODE 0
91 #define T_LEAF  1
92 #define NODE_TYPE_MASK	0x1UL
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 
95 #define IS_TNODE(n) (!(n->parent & T_LEAF))
96 #define IS_LEAF(n) (n->parent & T_LEAF)
97 
98 struct node {
99 	unsigned long parent;
100 	t_key key;
101 };
102 
103 struct leaf {
104 	unsigned long parent;
105 	t_key key;
106 	struct hlist_head list;
107 	struct rcu_head rcu;
108 };
109 
110 struct leaf_info {
111 	struct hlist_node hlist;
112 	struct rcu_head rcu;
113 	int plen;
114 	struct list_head falh;
115 };
116 
117 struct tnode {
118 	unsigned long parent;
119 	t_key key;
120 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
122 	unsigned int full_children;	/* KEYLENGTH bits needed */
123 	unsigned int empty_children;	/* KEYLENGTH bits needed */
124 	union {
125 		struct rcu_head rcu;
126 		struct work_struct work;
127 		struct tnode *tnode_free;
128 	};
129 	struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int prefixes;
150 	unsigned int nodesizes[MAX_STAT_DEPTH];
151 };
152 
153 struct trie {
154 	struct node *trie;
155 #ifdef CONFIG_IP_FIB_TRIE_STATS
156 	struct trie_use_stats stats;
157 #endif
158 };
159 
160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 				  int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 /* tnodes to free after resize(); protected by RTNL */
167 static struct tnode *tnode_free_head;
168 static size_t tnode_free_size;
169 
170 /*
171  * synchronize_rcu after call_rcu for that many pages; it should be especially
172  * useful before resizing the root node with PREEMPT_NONE configs; the value was
173  * obtained experimentally, aiming to avoid visible slowdown.
174  */
175 static const int sync_pages = 128;
176 
177 static struct kmem_cache *fn_alias_kmem __read_mostly;
178 static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 
180 static inline struct tnode *node_parent(struct node *node)
181 {
182 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183 }
184 
185 static inline struct tnode *node_parent_rcu(struct node *node)
186 {
187 	struct tnode *ret = node_parent(node);
188 
189 	return rcu_dereference(ret);
190 }
191 
192 /* Same as rcu_assign_pointer
193  * but that macro() assumes that value is a pointer.
194  */
195 static inline void node_set_parent(struct node *node, struct tnode *ptr)
196 {
197 	smp_wmb();
198 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
199 }
200 
201 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
202 {
203 	BUG_ON(i >= 1U << tn->bits);
204 
205 	return tn->child[i];
206 }
207 
208 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209 {
210 	struct node *ret = tnode_get_child(tn, i);
211 
212 	return rcu_dereference_check(ret,
213 				     rcu_read_lock_held() ||
214 				     lockdep_rtnl_is_held());
215 }
216 
217 static inline int tnode_child_length(const struct tnode *tn)
218 {
219 	return 1 << tn->bits;
220 }
221 
222 static inline t_key mask_pfx(t_key k, unsigned short l)
223 {
224 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
225 }
226 
227 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
228 {
229 	if (offset < KEYLENGTH)
230 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
231 	else
232 		return 0;
233 }
234 
235 static inline int tkey_equals(t_key a, t_key b)
236 {
237 	return a == b;
238 }
239 
240 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
241 {
242 	if (bits == 0 || offset >= KEYLENGTH)
243 		return 1;
244 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
245 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
246 }
247 
248 static inline int tkey_mismatch(t_key a, int offset, t_key b)
249 {
250 	t_key diff = a ^ b;
251 	int i = offset;
252 
253 	if (!diff)
254 		return 0;
255 	while ((diff << i) >> (KEYLENGTH-1) == 0)
256 		i++;
257 	return i;
258 }
259 
260 /*
261   To understand this stuff, an understanding of keys and all their bits is
262   necessary. Every node in the trie has a key associated with it, but not
263   all of the bits in that key are significant.
264 
265   Consider a node 'n' and its parent 'tp'.
266 
267   If n is a leaf, every bit in its key is significant. Its presence is
268   necessitated by path compression, since during a tree traversal (when
269   searching for a leaf - unless we are doing an insertion) we will completely
270   ignore all skipped bits we encounter. Thus we need to verify, at the end of
271   a potentially successful search, that we have indeed been walking the
272   correct key path.
273 
274   Note that we can never "miss" the correct key in the tree if present by
275   following the wrong path. Path compression ensures that segments of the key
276   that are the same for all keys with a given prefix are skipped, but the
277   skipped part *is* identical for each node in the subtrie below the skipped
278   bit! trie_insert() in this implementation takes care of that - note the
279   call to tkey_sub_equals() in trie_insert().
280 
281   if n is an internal node - a 'tnode' here, the various parts of its key
282   have many different meanings.
283 
284   Example:
285   _________________________________________________________________
286   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287   -----------------------------------------------------------------
288     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
289 
290   _________________________________________________________________
291   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292   -----------------------------------------------------------------
293    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
294 
295   tp->pos = 7
296   tp->bits = 3
297   n->pos = 15
298   n->bits = 4
299 
300   First, let's just ignore the bits that come before the parent tp, that is
301   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302   not use them for anything.
303 
304   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305   index into the parent's child array. That is, they will be used to find
306   'n' among tp's children.
307 
308   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
309   for the node n.
310 
311   All the bits we have seen so far are significant to the node n. The rest
312   of the bits are really not needed or indeed known in n->key.
313 
314   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315   n's child array, and will of course be different for each child.
316 
317 
318   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
319   at this point.
320 
321 */
322 
323 static inline void check_tnode(const struct tnode *tn)
324 {
325 	WARN_ON(tn && tn->pos+tn->bits > 32);
326 }
327 
328 static const int halve_threshold = 25;
329 static const int inflate_threshold = 50;
330 static const int halve_threshold_root = 15;
331 static const int inflate_threshold_root = 30;
332 
333 static void __alias_free_mem(struct rcu_head *head)
334 {
335 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
336 	kmem_cache_free(fn_alias_kmem, fa);
337 }
338 
339 static inline void alias_free_mem_rcu(struct fib_alias *fa)
340 {
341 	call_rcu(&fa->rcu, __alias_free_mem);
342 }
343 
344 static void __leaf_free_rcu(struct rcu_head *head)
345 {
346 	struct leaf *l = container_of(head, struct leaf, rcu);
347 	kmem_cache_free(trie_leaf_kmem, l);
348 }
349 
350 static inline void free_leaf(struct leaf *l)
351 {
352 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
353 }
354 
355 static void __leaf_info_free_rcu(struct rcu_head *head)
356 {
357 	kfree(container_of(head, struct leaf_info, rcu));
358 }
359 
360 static inline void free_leaf_info(struct leaf_info *leaf)
361 {
362 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
363 }
364 
365 static struct tnode *tnode_alloc(size_t size)
366 {
367 	if (size <= PAGE_SIZE)
368 		return kzalloc(size, GFP_KERNEL);
369 	else
370 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
371 }
372 
373 static void __tnode_vfree(struct work_struct *arg)
374 {
375 	struct tnode *tn = container_of(arg, struct tnode, work);
376 	vfree(tn);
377 }
378 
379 static void __tnode_free_rcu(struct rcu_head *head)
380 {
381 	struct tnode *tn = container_of(head, struct tnode, rcu);
382 	size_t size = sizeof(struct tnode) +
383 		      (sizeof(struct node *) << tn->bits);
384 
385 	if (size <= PAGE_SIZE)
386 		kfree(tn);
387 	else {
388 		INIT_WORK(&tn->work, __tnode_vfree);
389 		schedule_work(&tn->work);
390 	}
391 }
392 
393 static inline void tnode_free(struct tnode *tn)
394 {
395 	if (IS_LEAF(tn))
396 		free_leaf((struct leaf *) tn);
397 	else
398 		call_rcu(&tn->rcu, __tnode_free_rcu);
399 }
400 
401 static void tnode_free_safe(struct tnode *tn)
402 {
403 	BUG_ON(IS_LEAF(tn));
404 	tn->tnode_free = tnode_free_head;
405 	tnode_free_head = tn;
406 	tnode_free_size += sizeof(struct tnode) +
407 			   (sizeof(struct node *) << tn->bits);
408 }
409 
410 static void tnode_free_flush(void)
411 {
412 	struct tnode *tn;
413 
414 	while ((tn = tnode_free_head)) {
415 		tnode_free_head = tn->tnode_free;
416 		tn->tnode_free = NULL;
417 		tnode_free(tn);
418 	}
419 
420 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
421 		tnode_free_size = 0;
422 		synchronize_rcu();
423 	}
424 }
425 
426 static struct leaf *leaf_new(void)
427 {
428 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
429 	if (l) {
430 		l->parent = T_LEAF;
431 		INIT_HLIST_HEAD(&l->list);
432 	}
433 	return l;
434 }
435 
436 static struct leaf_info *leaf_info_new(int plen)
437 {
438 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
439 	if (li) {
440 		li->plen = plen;
441 		INIT_LIST_HEAD(&li->falh);
442 	}
443 	return li;
444 }
445 
446 static struct tnode *tnode_new(t_key key, int pos, int bits)
447 {
448 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
449 	struct tnode *tn = tnode_alloc(sz);
450 
451 	if (tn) {
452 		tn->parent = T_TNODE;
453 		tn->pos = pos;
454 		tn->bits = bits;
455 		tn->key = key;
456 		tn->full_children = 0;
457 		tn->empty_children = 1<<bits;
458 	}
459 
460 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
461 		 (unsigned long) (sizeof(struct node) << bits));
462 	return tn;
463 }
464 
465 /*
466  * Check whether a tnode 'n' is "full", i.e. it is an internal node
467  * and no bits are skipped. See discussion in dyntree paper p. 6
468  */
469 
470 static inline int tnode_full(const struct tnode *tn, const struct node *n)
471 {
472 	if (n == NULL || IS_LEAF(n))
473 		return 0;
474 
475 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
476 }
477 
478 static inline void put_child(struct trie *t, struct tnode *tn, int i,
479 			     struct node *n)
480 {
481 	tnode_put_child_reorg(tn, i, n, -1);
482 }
483 
484  /*
485   * Add a child at position i overwriting the old value.
486   * Update the value of full_children and empty_children.
487   */
488 
489 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
490 				  int wasfull)
491 {
492 	struct node *chi = tn->child[i];
493 	int isfull;
494 
495 	BUG_ON(i >= 1<<tn->bits);
496 
497 	/* update emptyChildren */
498 	if (n == NULL && chi != NULL)
499 		tn->empty_children++;
500 	else if (n != NULL && chi == NULL)
501 		tn->empty_children--;
502 
503 	/* update fullChildren */
504 	if (wasfull == -1)
505 		wasfull = tnode_full(tn, chi);
506 
507 	isfull = tnode_full(tn, n);
508 	if (wasfull && !isfull)
509 		tn->full_children--;
510 	else if (!wasfull && isfull)
511 		tn->full_children++;
512 
513 	if (n)
514 		node_set_parent(n, tn);
515 
516 	rcu_assign_pointer(tn->child[i], n);
517 }
518 
519 #define MAX_WORK 10
520 static struct node *resize(struct trie *t, struct tnode *tn)
521 {
522 	int i;
523 	struct tnode *old_tn;
524 	int inflate_threshold_use;
525 	int halve_threshold_use;
526 	int max_work;
527 
528 	if (!tn)
529 		return NULL;
530 
531 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
532 		 tn, inflate_threshold, halve_threshold);
533 
534 	/* No children */
535 	if (tn->empty_children == tnode_child_length(tn)) {
536 		tnode_free_safe(tn);
537 		return NULL;
538 	}
539 	/* One child */
540 	if (tn->empty_children == tnode_child_length(tn) - 1)
541 		goto one_child;
542 	/*
543 	 * Double as long as the resulting node has a number of
544 	 * nonempty nodes that are above the threshold.
545 	 */
546 
547 	/*
548 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
549 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
550 	 * Telecommunications, page 6:
551 	 * "A node is doubled if the ratio of non-empty children to all
552 	 * children in the *doubled* node is at least 'high'."
553 	 *
554 	 * 'high' in this instance is the variable 'inflate_threshold'. It
555 	 * is expressed as a percentage, so we multiply it with
556 	 * tnode_child_length() and instead of multiplying by 2 (since the
557 	 * child array will be doubled by inflate()) and multiplying
558 	 * the left-hand side by 100 (to handle the percentage thing) we
559 	 * multiply the left-hand side by 50.
560 	 *
561 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
562 	 * - tn->empty_children is of course the number of non-null children
563 	 * in the current node. tn->full_children is the number of "full"
564 	 * children, that is non-null tnodes with a skip value of 0.
565 	 * All of those will be doubled in the resulting inflated tnode, so
566 	 * we just count them one extra time here.
567 	 *
568 	 * A clearer way to write this would be:
569 	 *
570 	 * to_be_doubled = tn->full_children;
571 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
572 	 *     tn->full_children;
573 	 *
574 	 * new_child_length = tnode_child_length(tn) * 2;
575 	 *
576 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
577 	 *      new_child_length;
578 	 * if (new_fill_factor >= inflate_threshold)
579 	 *
580 	 * ...and so on, tho it would mess up the while () loop.
581 	 *
582 	 * anyway,
583 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
584 	 *      inflate_threshold
585 	 *
586 	 * avoid a division:
587 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
588 	 *      inflate_threshold * new_child_length
589 	 *
590 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
591 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
592 	 *    tn->full_children) >= inflate_threshold * new_child_length
593 	 *
594 	 * expand new_child_length:
595 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
596 	 *    tn->full_children) >=
597 	 *      inflate_threshold * tnode_child_length(tn) * 2
598 	 *
599 	 * shorten again:
600 	 * 50 * (tn->full_children + tnode_child_length(tn) -
601 	 *    tn->empty_children) >= inflate_threshold *
602 	 *    tnode_child_length(tn)
603 	 *
604 	 */
605 
606 	check_tnode(tn);
607 
608 	/* Keep root node larger  */
609 
610 	if (!node_parent((struct node*) tn)) {
611 		inflate_threshold_use = inflate_threshold_root;
612 		halve_threshold_use = halve_threshold_root;
613 	}
614 	else {
615 		inflate_threshold_use = inflate_threshold;
616 		halve_threshold_use = halve_threshold;
617 	}
618 
619 	max_work = MAX_WORK;
620 	while ((tn->full_children > 0 &&  max_work-- &&
621 		50 * (tn->full_children + tnode_child_length(tn)
622 		      - tn->empty_children)
623 		>= inflate_threshold_use * tnode_child_length(tn))) {
624 
625 		old_tn = tn;
626 		tn = inflate(t, tn);
627 
628 		if (IS_ERR(tn)) {
629 			tn = old_tn;
630 #ifdef CONFIG_IP_FIB_TRIE_STATS
631 			t->stats.resize_node_skipped++;
632 #endif
633 			break;
634 		}
635 	}
636 
637 	check_tnode(tn);
638 
639 	/* Return if at least one inflate is run */
640 	if( max_work != MAX_WORK)
641 		return (struct node *) tn;
642 
643 	/*
644 	 * Halve as long as the number of empty children in this
645 	 * node is above threshold.
646 	 */
647 
648 	max_work = MAX_WORK;
649 	while (tn->bits > 1 &&  max_work-- &&
650 	       100 * (tnode_child_length(tn) - tn->empty_children) <
651 	       halve_threshold_use * tnode_child_length(tn)) {
652 
653 		old_tn = tn;
654 		tn = halve(t, tn);
655 		if (IS_ERR(tn)) {
656 			tn = old_tn;
657 #ifdef CONFIG_IP_FIB_TRIE_STATS
658 			t->stats.resize_node_skipped++;
659 #endif
660 			break;
661 		}
662 	}
663 
664 
665 	/* Only one child remains */
666 	if (tn->empty_children == tnode_child_length(tn) - 1) {
667 one_child:
668 		for (i = 0; i < tnode_child_length(tn); i++) {
669 			struct node *n;
670 
671 			n = tn->child[i];
672 			if (!n)
673 				continue;
674 
675 			/* compress one level */
676 
677 			node_set_parent(n, NULL);
678 			tnode_free_safe(tn);
679 			return n;
680 		}
681 	}
682 	return (struct node *) tn;
683 }
684 
685 static struct tnode *inflate(struct trie *t, struct tnode *tn)
686 {
687 	struct tnode *oldtnode = tn;
688 	int olen = tnode_child_length(tn);
689 	int i;
690 
691 	pr_debug("In inflate\n");
692 
693 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
694 
695 	if (!tn)
696 		return ERR_PTR(-ENOMEM);
697 
698 	/*
699 	 * Preallocate and store tnodes before the actual work so we
700 	 * don't get into an inconsistent state if memory allocation
701 	 * fails. In case of failure we return the oldnode and  inflate
702 	 * of tnode is ignored.
703 	 */
704 
705 	for (i = 0; i < olen; i++) {
706 		struct tnode *inode;
707 
708 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
709 		if (inode &&
710 		    IS_TNODE(inode) &&
711 		    inode->pos == oldtnode->pos + oldtnode->bits &&
712 		    inode->bits > 1) {
713 			struct tnode *left, *right;
714 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
715 
716 			left = tnode_new(inode->key&(~m), inode->pos + 1,
717 					 inode->bits - 1);
718 			if (!left)
719 				goto nomem;
720 
721 			right = tnode_new(inode->key|m, inode->pos + 1,
722 					  inode->bits - 1);
723 
724 			if (!right) {
725 				tnode_free(left);
726 				goto nomem;
727 			}
728 
729 			put_child(t, tn, 2*i, (struct node *) left);
730 			put_child(t, tn, 2*i+1, (struct node *) right);
731 		}
732 	}
733 
734 	for (i = 0; i < olen; i++) {
735 		struct tnode *inode;
736 		struct node *node = tnode_get_child(oldtnode, i);
737 		struct tnode *left, *right;
738 		int size, j;
739 
740 		/* An empty child */
741 		if (node == NULL)
742 			continue;
743 
744 		/* A leaf or an internal node with skipped bits */
745 
746 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
747 		   tn->pos + tn->bits - 1) {
748 			if (tkey_extract_bits(node->key,
749 					      oldtnode->pos + oldtnode->bits,
750 					      1) == 0)
751 				put_child(t, tn, 2*i, node);
752 			else
753 				put_child(t, tn, 2*i+1, node);
754 			continue;
755 		}
756 
757 		/* An internal node with two children */
758 		inode = (struct tnode *) node;
759 
760 		if (inode->bits == 1) {
761 			put_child(t, tn, 2*i, inode->child[0]);
762 			put_child(t, tn, 2*i+1, inode->child[1]);
763 
764 			tnode_free_safe(inode);
765 			continue;
766 		}
767 
768 		/* An internal node with more than two children */
769 
770 		/* We will replace this node 'inode' with two new
771 		 * ones, 'left' and 'right', each with half of the
772 		 * original children. The two new nodes will have
773 		 * a position one bit further down the key and this
774 		 * means that the "significant" part of their keys
775 		 * (see the discussion near the top of this file)
776 		 * will differ by one bit, which will be "0" in
777 		 * left's key and "1" in right's key. Since we are
778 		 * moving the key position by one step, the bit that
779 		 * we are moving away from - the bit at position
780 		 * (inode->pos) - is the one that will differ between
781 		 * left and right. So... we synthesize that bit in the
782 		 * two  new keys.
783 		 * The mask 'm' below will be a single "one" bit at
784 		 * the position (inode->pos)
785 		 */
786 
787 		/* Use the old key, but set the new significant
788 		 *   bit to zero.
789 		 */
790 
791 		left = (struct tnode *) tnode_get_child(tn, 2*i);
792 		put_child(t, tn, 2*i, NULL);
793 
794 		BUG_ON(!left);
795 
796 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
797 		put_child(t, tn, 2*i+1, NULL);
798 
799 		BUG_ON(!right);
800 
801 		size = tnode_child_length(left);
802 		for (j = 0; j < size; j++) {
803 			put_child(t, left, j, inode->child[j]);
804 			put_child(t, right, j, inode->child[j + size]);
805 		}
806 		put_child(t, tn, 2*i, resize(t, left));
807 		put_child(t, tn, 2*i+1, resize(t, right));
808 
809 		tnode_free_safe(inode);
810 	}
811 	tnode_free_safe(oldtnode);
812 	return tn;
813 nomem:
814 	{
815 		int size = tnode_child_length(tn);
816 		int j;
817 
818 		for (j = 0; j < size; j++)
819 			if (tn->child[j])
820 				tnode_free((struct tnode *)tn->child[j]);
821 
822 		tnode_free(tn);
823 
824 		return ERR_PTR(-ENOMEM);
825 	}
826 }
827 
828 static struct tnode *halve(struct trie *t, struct tnode *tn)
829 {
830 	struct tnode *oldtnode = tn;
831 	struct node *left, *right;
832 	int i;
833 	int olen = tnode_child_length(tn);
834 
835 	pr_debug("In halve\n");
836 
837 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
838 
839 	if (!tn)
840 		return ERR_PTR(-ENOMEM);
841 
842 	/*
843 	 * Preallocate and store tnodes before the actual work so we
844 	 * don't get into an inconsistent state if memory allocation
845 	 * fails. In case of failure we return the oldnode and halve
846 	 * of tnode is ignored.
847 	 */
848 
849 	for (i = 0; i < olen; i += 2) {
850 		left = tnode_get_child(oldtnode, i);
851 		right = tnode_get_child(oldtnode, i+1);
852 
853 		/* Two nonempty children */
854 		if (left && right) {
855 			struct tnode *newn;
856 
857 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
858 
859 			if (!newn)
860 				goto nomem;
861 
862 			put_child(t, tn, i/2, (struct node *)newn);
863 		}
864 
865 	}
866 
867 	for (i = 0; i < olen; i += 2) {
868 		struct tnode *newBinNode;
869 
870 		left = tnode_get_child(oldtnode, i);
871 		right = tnode_get_child(oldtnode, i+1);
872 
873 		/* At least one of the children is empty */
874 		if (left == NULL) {
875 			if (right == NULL)    /* Both are empty */
876 				continue;
877 			put_child(t, tn, i/2, right);
878 			continue;
879 		}
880 
881 		if (right == NULL) {
882 			put_child(t, tn, i/2, left);
883 			continue;
884 		}
885 
886 		/* Two nonempty children */
887 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
888 		put_child(t, tn, i/2, NULL);
889 		put_child(t, newBinNode, 0, left);
890 		put_child(t, newBinNode, 1, right);
891 		put_child(t, tn, i/2, resize(t, newBinNode));
892 	}
893 	tnode_free_safe(oldtnode);
894 	return tn;
895 nomem:
896 	{
897 		int size = tnode_child_length(tn);
898 		int j;
899 
900 		for (j = 0; j < size; j++)
901 			if (tn->child[j])
902 				tnode_free((struct tnode *)tn->child[j]);
903 
904 		tnode_free(tn);
905 
906 		return ERR_PTR(-ENOMEM);
907 	}
908 }
909 
910 /* readside must use rcu_read_lock currently dump routines
911  via get_fa_head and dump */
912 
913 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
914 {
915 	struct hlist_head *head = &l->list;
916 	struct hlist_node *node;
917 	struct leaf_info *li;
918 
919 	hlist_for_each_entry_rcu(li, node, head, hlist)
920 		if (li->plen == plen)
921 			return li;
922 
923 	return NULL;
924 }
925 
926 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
927 {
928 	struct leaf_info *li = find_leaf_info(l, plen);
929 
930 	if (!li)
931 		return NULL;
932 
933 	return &li->falh;
934 }
935 
936 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
937 {
938 	struct leaf_info *li = NULL, *last = NULL;
939 	struct hlist_node *node;
940 
941 	if (hlist_empty(head)) {
942 		hlist_add_head_rcu(&new->hlist, head);
943 	} else {
944 		hlist_for_each_entry(li, node, head, hlist) {
945 			if (new->plen > li->plen)
946 				break;
947 
948 			last = li;
949 		}
950 		if (last)
951 			hlist_add_after_rcu(&last->hlist, &new->hlist);
952 		else
953 			hlist_add_before_rcu(&new->hlist, &li->hlist);
954 	}
955 }
956 
957 /* rcu_read_lock needs to be hold by caller from readside */
958 
959 static struct leaf *
960 fib_find_node(struct trie *t, u32 key)
961 {
962 	int pos;
963 	struct tnode *tn;
964 	struct node *n;
965 
966 	pos = 0;
967 	n = rcu_dereference_check(t->trie,
968 				  rcu_read_lock_held() ||
969 				  lockdep_rtnl_is_held());
970 
971 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
972 		tn = (struct tnode *) n;
973 
974 		check_tnode(tn);
975 
976 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
977 			pos = tn->pos + tn->bits;
978 			n = tnode_get_child_rcu(tn,
979 						tkey_extract_bits(key,
980 								  tn->pos,
981 								  tn->bits));
982 		} else
983 			break;
984 	}
985 	/* Case we have found a leaf. Compare prefixes */
986 
987 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
988 		return (struct leaf *)n;
989 
990 	return NULL;
991 }
992 
993 static void trie_rebalance(struct trie *t, struct tnode *tn)
994 {
995 	int wasfull;
996 	t_key cindex, key;
997 	struct tnode *tp;
998 
999 	key = tn->key;
1000 
1001 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1002 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1003 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1004 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1005 
1006 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1007 				      (struct node *)tn, wasfull);
1008 
1009 		tp = node_parent((struct node *) tn);
1010 		if (!tp)
1011 			rcu_assign_pointer(t->trie, (struct node *)tn);
1012 
1013 		tnode_free_flush();
1014 		if (!tp)
1015 			break;
1016 		tn = tp;
1017 	}
1018 
1019 	/* Handle last (top) tnode */
1020 	if (IS_TNODE(tn))
1021 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1022 
1023 	rcu_assign_pointer(t->trie, (struct node *)tn);
1024 	tnode_free_flush();
1025 }
1026 
1027 /* only used from updater-side */
1028 
1029 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1030 {
1031 	int pos, newpos;
1032 	struct tnode *tp = NULL, *tn = NULL;
1033 	struct node *n;
1034 	struct leaf *l;
1035 	int missbit;
1036 	struct list_head *fa_head = NULL;
1037 	struct leaf_info *li;
1038 	t_key cindex;
1039 
1040 	pos = 0;
1041 	n = t->trie;
1042 
1043 	/* If we point to NULL, stop. Either the tree is empty and we should
1044 	 * just put a new leaf in if, or we have reached an empty child slot,
1045 	 * and we should just put our new leaf in that.
1046 	 * If we point to a T_TNODE, check if it matches our key. Note that
1047 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1048 	 * not be the parent's 'pos'+'bits'!
1049 	 *
1050 	 * If it does match the current key, get pos/bits from it, extract
1051 	 * the index from our key, push the T_TNODE and walk the tree.
1052 	 *
1053 	 * If it doesn't, we have to replace it with a new T_TNODE.
1054 	 *
1055 	 * If we point to a T_LEAF, it might or might not have the same key
1056 	 * as we do. If it does, just change the value, update the T_LEAF's
1057 	 * value, and return it.
1058 	 * If it doesn't, we need to replace it with a T_TNODE.
1059 	 */
1060 
1061 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1062 		tn = (struct tnode *) n;
1063 
1064 		check_tnode(tn);
1065 
1066 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1067 			tp = tn;
1068 			pos = tn->pos + tn->bits;
1069 			n = tnode_get_child(tn,
1070 					    tkey_extract_bits(key,
1071 							      tn->pos,
1072 							      tn->bits));
1073 
1074 			BUG_ON(n && node_parent(n) != tn);
1075 		} else
1076 			break;
1077 	}
1078 
1079 	/*
1080 	 * n  ----> NULL, LEAF or TNODE
1081 	 *
1082 	 * tp is n's (parent) ----> NULL or TNODE
1083 	 */
1084 
1085 	BUG_ON(tp && IS_LEAF(tp));
1086 
1087 	/* Case 1: n is a leaf. Compare prefixes */
1088 
1089 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1090 		l = (struct leaf *) n;
1091 		li = leaf_info_new(plen);
1092 
1093 		if (!li)
1094 			return NULL;
1095 
1096 		fa_head = &li->falh;
1097 		insert_leaf_info(&l->list, li);
1098 		goto done;
1099 	}
1100 	l = leaf_new();
1101 
1102 	if (!l)
1103 		return NULL;
1104 
1105 	l->key = key;
1106 	li = leaf_info_new(plen);
1107 
1108 	if (!li) {
1109 		free_leaf(l);
1110 		return NULL;
1111 	}
1112 
1113 	fa_head = &li->falh;
1114 	insert_leaf_info(&l->list, li);
1115 
1116 	if (t->trie && n == NULL) {
1117 		/* Case 2: n is NULL, and will just insert a new leaf */
1118 
1119 		node_set_parent((struct node *)l, tp);
1120 
1121 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1122 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1123 	} else {
1124 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1125 		/*
1126 		 *  Add a new tnode here
1127 		 *  first tnode need some special handling
1128 		 */
1129 
1130 		if (tp)
1131 			pos = tp->pos+tp->bits;
1132 		else
1133 			pos = 0;
1134 
1135 		if (n) {
1136 			newpos = tkey_mismatch(key, pos, n->key);
1137 			tn = tnode_new(n->key, newpos, 1);
1138 		} else {
1139 			newpos = 0;
1140 			tn = tnode_new(key, newpos, 1); /* First tnode */
1141 		}
1142 
1143 		if (!tn) {
1144 			free_leaf_info(li);
1145 			free_leaf(l);
1146 			return NULL;
1147 		}
1148 
1149 		node_set_parent((struct node *)tn, tp);
1150 
1151 		missbit = tkey_extract_bits(key, newpos, 1);
1152 		put_child(t, tn, missbit, (struct node *)l);
1153 		put_child(t, tn, 1-missbit, n);
1154 
1155 		if (tp) {
1156 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1157 			put_child(t, (struct tnode *)tp, cindex,
1158 				  (struct node *)tn);
1159 		} else {
1160 			rcu_assign_pointer(t->trie, (struct node *)tn);
1161 			tp = tn;
1162 		}
1163 	}
1164 
1165 	if (tp && tp->pos + tp->bits > 32)
1166 		pr_warning("fib_trie"
1167 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1168 			   tp, tp->pos, tp->bits, key, plen);
1169 
1170 	/* Rebalance the trie */
1171 
1172 	trie_rebalance(t, tp);
1173 done:
1174 	return fa_head;
1175 }
1176 
1177 /*
1178  * Caller must hold RTNL.
1179  */
1180 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1181 {
1182 	struct trie *t = (struct trie *) tb->tb_data;
1183 	struct fib_alias *fa, *new_fa;
1184 	struct list_head *fa_head = NULL;
1185 	struct fib_info *fi;
1186 	int plen = cfg->fc_dst_len;
1187 	u8 tos = cfg->fc_tos;
1188 	u32 key, mask;
1189 	int err;
1190 	struct leaf *l;
1191 
1192 	if (plen > 32)
1193 		return -EINVAL;
1194 
1195 	key = ntohl(cfg->fc_dst);
1196 
1197 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1198 
1199 	mask = ntohl(inet_make_mask(plen));
1200 
1201 	if (key & ~mask)
1202 		return -EINVAL;
1203 
1204 	key = key & mask;
1205 
1206 	fi = fib_create_info(cfg);
1207 	if (IS_ERR(fi)) {
1208 		err = PTR_ERR(fi);
1209 		goto err;
1210 	}
1211 
1212 	l = fib_find_node(t, key);
1213 	fa = NULL;
1214 
1215 	if (l) {
1216 		fa_head = get_fa_head(l, plen);
1217 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1218 	}
1219 
1220 	/* Now fa, if non-NULL, points to the first fib alias
1221 	 * with the same keys [prefix,tos,priority], if such key already
1222 	 * exists or to the node before which we will insert new one.
1223 	 *
1224 	 * If fa is NULL, we will need to allocate a new one and
1225 	 * insert to the head of f.
1226 	 *
1227 	 * If f is NULL, no fib node matched the destination key
1228 	 * and we need to allocate a new one of those as well.
1229 	 */
1230 
1231 	if (fa && fa->fa_tos == tos &&
1232 	    fa->fa_info->fib_priority == fi->fib_priority) {
1233 		struct fib_alias *fa_first, *fa_match;
1234 
1235 		err = -EEXIST;
1236 		if (cfg->fc_nlflags & NLM_F_EXCL)
1237 			goto out;
1238 
1239 		/* We have 2 goals:
1240 		 * 1. Find exact match for type, scope, fib_info to avoid
1241 		 * duplicate routes
1242 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1243 		 */
1244 		fa_match = NULL;
1245 		fa_first = fa;
1246 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1247 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1248 			if (fa->fa_tos != tos)
1249 				break;
1250 			if (fa->fa_info->fib_priority != fi->fib_priority)
1251 				break;
1252 			if (fa->fa_type == cfg->fc_type &&
1253 			    fa->fa_scope == cfg->fc_scope &&
1254 			    fa->fa_info == fi) {
1255 				fa_match = fa;
1256 				break;
1257 			}
1258 		}
1259 
1260 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1261 			struct fib_info *fi_drop;
1262 			u8 state;
1263 
1264 			fa = fa_first;
1265 			if (fa_match) {
1266 				if (fa == fa_match)
1267 					err = 0;
1268 				goto out;
1269 			}
1270 			err = -ENOBUFS;
1271 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1272 			if (new_fa == NULL)
1273 				goto out;
1274 
1275 			fi_drop = fa->fa_info;
1276 			new_fa->fa_tos = fa->fa_tos;
1277 			new_fa->fa_info = fi;
1278 			new_fa->fa_type = cfg->fc_type;
1279 			new_fa->fa_scope = cfg->fc_scope;
1280 			state = fa->fa_state;
1281 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1282 
1283 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1284 			alias_free_mem_rcu(fa);
1285 
1286 			fib_release_info(fi_drop);
1287 			if (state & FA_S_ACCESSED)
1288 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1289 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1290 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1291 
1292 			goto succeeded;
1293 		}
1294 		/* Error if we find a perfect match which
1295 		 * uses the same scope, type, and nexthop
1296 		 * information.
1297 		 */
1298 		if (fa_match)
1299 			goto out;
1300 
1301 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1302 			fa = fa_first;
1303 	}
1304 	err = -ENOENT;
1305 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1306 		goto out;
1307 
1308 	err = -ENOBUFS;
1309 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1310 	if (new_fa == NULL)
1311 		goto out;
1312 
1313 	new_fa->fa_info = fi;
1314 	new_fa->fa_tos = tos;
1315 	new_fa->fa_type = cfg->fc_type;
1316 	new_fa->fa_scope = cfg->fc_scope;
1317 	new_fa->fa_state = 0;
1318 	/*
1319 	 * Insert new entry to the list.
1320 	 */
1321 
1322 	if (!fa_head) {
1323 		fa_head = fib_insert_node(t, key, plen);
1324 		if (unlikely(!fa_head)) {
1325 			err = -ENOMEM;
1326 			goto out_free_new_fa;
1327 		}
1328 	}
1329 
1330 	list_add_tail_rcu(&new_fa->fa_list,
1331 			  (fa ? &fa->fa_list : fa_head));
1332 
1333 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1334 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1335 		  &cfg->fc_nlinfo, 0);
1336 succeeded:
1337 	return 0;
1338 
1339 out_free_new_fa:
1340 	kmem_cache_free(fn_alias_kmem, new_fa);
1341 out:
1342 	fib_release_info(fi);
1343 err:
1344 	return err;
1345 }
1346 
1347 /* should be called with rcu_read_lock */
1348 static int check_leaf(struct trie *t, struct leaf *l,
1349 		      t_key key,  const struct flowi *flp,
1350 		      struct fib_result *res)
1351 {
1352 	struct leaf_info *li;
1353 	struct hlist_head *hhead = &l->list;
1354 	struct hlist_node *node;
1355 
1356 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1357 		int err;
1358 		int plen = li->plen;
1359 		__be32 mask = inet_make_mask(plen);
1360 
1361 		if (l->key != (key & ntohl(mask)))
1362 			continue;
1363 
1364 		err = fib_semantic_match(&li->falh, flp, res, plen);
1365 
1366 #ifdef CONFIG_IP_FIB_TRIE_STATS
1367 		if (err <= 0)
1368 			t->stats.semantic_match_passed++;
1369 		else
1370 			t->stats.semantic_match_miss++;
1371 #endif
1372 		if (err <= 0)
1373 			return err;
1374 	}
1375 
1376 	return 1;
1377 }
1378 
1379 int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1380 		     struct fib_result *res)
1381 {
1382 	struct trie *t = (struct trie *) tb->tb_data;
1383 	int ret;
1384 	struct node *n;
1385 	struct tnode *pn;
1386 	int pos, bits;
1387 	t_key key = ntohl(flp->fl4_dst);
1388 	int chopped_off;
1389 	t_key cindex = 0;
1390 	int current_prefix_length = KEYLENGTH;
1391 	struct tnode *cn;
1392 	t_key node_prefix, key_prefix, pref_mismatch;
1393 	int mp;
1394 
1395 	rcu_read_lock();
1396 
1397 	n = rcu_dereference(t->trie);
1398 	if (!n)
1399 		goto failed;
1400 
1401 #ifdef CONFIG_IP_FIB_TRIE_STATS
1402 	t->stats.gets++;
1403 #endif
1404 
1405 	/* Just a leaf? */
1406 	if (IS_LEAF(n)) {
1407 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1408 		goto found;
1409 	}
1410 
1411 	pn = (struct tnode *) n;
1412 	chopped_off = 0;
1413 
1414 	while (pn) {
1415 		pos = pn->pos;
1416 		bits = pn->bits;
1417 
1418 		if (!chopped_off)
1419 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1420 						   pos, bits);
1421 
1422 		n = tnode_get_child_rcu(pn, cindex);
1423 
1424 		if (n == NULL) {
1425 #ifdef CONFIG_IP_FIB_TRIE_STATS
1426 			t->stats.null_node_hit++;
1427 #endif
1428 			goto backtrace;
1429 		}
1430 
1431 		if (IS_LEAF(n)) {
1432 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1433 			if (ret > 0)
1434 				goto backtrace;
1435 			goto found;
1436 		}
1437 
1438 		cn = (struct tnode *)n;
1439 
1440 		/*
1441 		 * It's a tnode, and we can do some extra checks here if we
1442 		 * like, to avoid descending into a dead-end branch.
1443 		 * This tnode is in the parent's child array at index
1444 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1445 		 * chopped off, so in reality the index may be just a
1446 		 * subprefix, padded with zero at the end.
1447 		 * We can also take a look at any skipped bits in this
1448 		 * tnode - everything up to p_pos is supposed to be ok,
1449 		 * and the non-chopped bits of the index (se previous
1450 		 * paragraph) are also guaranteed ok, but the rest is
1451 		 * considered unknown.
1452 		 *
1453 		 * The skipped bits are key[pos+bits..cn->pos].
1454 		 */
1455 
1456 		/* If current_prefix_length < pos+bits, we are already doing
1457 		 * actual prefix  matching, which means everything from
1458 		 * pos+(bits-chopped_off) onward must be zero along some
1459 		 * branch of this subtree - otherwise there is *no* valid
1460 		 * prefix present. Here we can only check the skipped
1461 		 * bits. Remember, since we have already indexed into the
1462 		 * parent's child array, we know that the bits we chopped of
1463 		 * *are* zero.
1464 		 */
1465 
1466 		/* NOTA BENE: Checking only skipped bits
1467 		   for the new node here */
1468 
1469 		if (current_prefix_length < pos+bits) {
1470 			if (tkey_extract_bits(cn->key, current_prefix_length,
1471 						cn->pos - current_prefix_length)
1472 			    || !(cn->child[0]))
1473 				goto backtrace;
1474 		}
1475 
1476 		/*
1477 		 * If chopped_off=0, the index is fully validated and we
1478 		 * only need to look at the skipped bits for this, the new,
1479 		 * tnode. What we actually want to do is to find out if
1480 		 * these skipped bits match our key perfectly, or if we will
1481 		 * have to count on finding a matching prefix further down,
1482 		 * because if we do, we would like to have some way of
1483 		 * verifying the existence of such a prefix at this point.
1484 		 */
1485 
1486 		/* The only thing we can do at this point is to verify that
1487 		 * any such matching prefix can indeed be a prefix to our
1488 		 * key, and if the bits in the node we are inspecting that
1489 		 * do not match our key are not ZERO, this cannot be true.
1490 		 * Thus, find out where there is a mismatch (before cn->pos)
1491 		 * and verify that all the mismatching bits are zero in the
1492 		 * new tnode's key.
1493 		 */
1494 
1495 		/*
1496 		 * Note: We aren't very concerned about the piece of
1497 		 * the key that precede pn->pos+pn->bits, since these
1498 		 * have already been checked. The bits after cn->pos
1499 		 * aren't checked since these are by definition
1500 		 * "unknown" at this point. Thus, what we want to see
1501 		 * is if we are about to enter the "prefix matching"
1502 		 * state, and in that case verify that the skipped
1503 		 * bits that will prevail throughout this subtree are
1504 		 * zero, as they have to be if we are to find a
1505 		 * matching prefix.
1506 		 */
1507 
1508 		node_prefix = mask_pfx(cn->key, cn->pos);
1509 		key_prefix = mask_pfx(key, cn->pos);
1510 		pref_mismatch = key_prefix^node_prefix;
1511 		mp = 0;
1512 
1513 		/*
1514 		 * In short: If skipped bits in this node do not match
1515 		 * the search key, enter the "prefix matching"
1516 		 * state.directly.
1517 		 */
1518 		if (pref_mismatch) {
1519 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1520 				mp++;
1521 				pref_mismatch = pref_mismatch << 1;
1522 			}
1523 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1524 
1525 			if (key_prefix != 0)
1526 				goto backtrace;
1527 
1528 			if (current_prefix_length >= cn->pos)
1529 				current_prefix_length = mp;
1530 		}
1531 
1532 		pn = (struct tnode *)n; /* Descend */
1533 		chopped_off = 0;
1534 		continue;
1535 
1536 backtrace:
1537 		chopped_off++;
1538 
1539 		/* As zero don't change the child key (cindex) */
1540 		while ((chopped_off <= pn->bits)
1541 		       && !(cindex & (1<<(chopped_off-1))))
1542 			chopped_off++;
1543 
1544 		/* Decrease current_... with bits chopped off */
1545 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1546 			current_prefix_length = pn->pos + pn->bits
1547 				- chopped_off;
1548 
1549 		/*
1550 		 * Either we do the actual chop off according or if we have
1551 		 * chopped off all bits in this tnode walk up to our parent.
1552 		 */
1553 
1554 		if (chopped_off <= pn->bits) {
1555 			cindex &= ~(1 << (chopped_off-1));
1556 		} else {
1557 			struct tnode *parent = node_parent_rcu((struct node *) pn);
1558 			if (!parent)
1559 				goto failed;
1560 
1561 			/* Get Child's index */
1562 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1563 			pn = parent;
1564 			chopped_off = 0;
1565 
1566 #ifdef CONFIG_IP_FIB_TRIE_STATS
1567 			t->stats.backtrack++;
1568 #endif
1569 			goto backtrace;
1570 		}
1571 	}
1572 failed:
1573 	ret = 1;
1574 found:
1575 	rcu_read_unlock();
1576 	return ret;
1577 }
1578 
1579 /*
1580  * Remove the leaf and return parent.
1581  */
1582 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1583 {
1584 	struct tnode *tp = node_parent((struct node *) l);
1585 
1586 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1587 
1588 	if (tp) {
1589 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1590 		put_child(t, (struct tnode *)tp, cindex, NULL);
1591 		trie_rebalance(t, tp);
1592 	} else
1593 		rcu_assign_pointer(t->trie, NULL);
1594 
1595 	free_leaf(l);
1596 }
1597 
1598 /*
1599  * Caller must hold RTNL.
1600  */
1601 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1602 {
1603 	struct trie *t = (struct trie *) tb->tb_data;
1604 	u32 key, mask;
1605 	int plen = cfg->fc_dst_len;
1606 	u8 tos = cfg->fc_tos;
1607 	struct fib_alias *fa, *fa_to_delete;
1608 	struct list_head *fa_head;
1609 	struct leaf *l;
1610 	struct leaf_info *li;
1611 
1612 	if (plen > 32)
1613 		return -EINVAL;
1614 
1615 	key = ntohl(cfg->fc_dst);
1616 	mask = ntohl(inet_make_mask(plen));
1617 
1618 	if (key & ~mask)
1619 		return -EINVAL;
1620 
1621 	key = key & mask;
1622 	l = fib_find_node(t, key);
1623 
1624 	if (!l)
1625 		return -ESRCH;
1626 
1627 	fa_head = get_fa_head(l, plen);
1628 	fa = fib_find_alias(fa_head, tos, 0);
1629 
1630 	if (!fa)
1631 		return -ESRCH;
1632 
1633 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1634 
1635 	fa_to_delete = NULL;
1636 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1637 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1638 		struct fib_info *fi = fa->fa_info;
1639 
1640 		if (fa->fa_tos != tos)
1641 			break;
1642 
1643 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1644 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1645 		     fa->fa_scope == cfg->fc_scope) &&
1646 		    (!cfg->fc_protocol ||
1647 		     fi->fib_protocol == cfg->fc_protocol) &&
1648 		    fib_nh_match(cfg, fi) == 0) {
1649 			fa_to_delete = fa;
1650 			break;
1651 		}
1652 	}
1653 
1654 	if (!fa_to_delete)
1655 		return -ESRCH;
1656 
1657 	fa = fa_to_delete;
1658 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1659 		  &cfg->fc_nlinfo, 0);
1660 
1661 	l = fib_find_node(t, key);
1662 	li = find_leaf_info(l, plen);
1663 
1664 	list_del_rcu(&fa->fa_list);
1665 
1666 	if (list_empty(fa_head)) {
1667 		hlist_del_rcu(&li->hlist);
1668 		free_leaf_info(li);
1669 	}
1670 
1671 	if (hlist_empty(&l->list))
1672 		trie_leaf_remove(t, l);
1673 
1674 	if (fa->fa_state & FA_S_ACCESSED)
1675 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1676 
1677 	fib_release_info(fa->fa_info);
1678 	alias_free_mem_rcu(fa);
1679 	return 0;
1680 }
1681 
1682 static int trie_flush_list(struct list_head *head)
1683 {
1684 	struct fib_alias *fa, *fa_node;
1685 	int found = 0;
1686 
1687 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1688 		struct fib_info *fi = fa->fa_info;
1689 
1690 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1691 			list_del_rcu(&fa->fa_list);
1692 			fib_release_info(fa->fa_info);
1693 			alias_free_mem_rcu(fa);
1694 			found++;
1695 		}
1696 	}
1697 	return found;
1698 }
1699 
1700 static int trie_flush_leaf(struct leaf *l)
1701 {
1702 	int found = 0;
1703 	struct hlist_head *lih = &l->list;
1704 	struct hlist_node *node, *tmp;
1705 	struct leaf_info *li = NULL;
1706 
1707 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1708 		found += trie_flush_list(&li->falh);
1709 
1710 		if (list_empty(&li->falh)) {
1711 			hlist_del_rcu(&li->hlist);
1712 			free_leaf_info(li);
1713 		}
1714 	}
1715 	return found;
1716 }
1717 
1718 /*
1719  * Scan for the next right leaf starting at node p->child[idx]
1720  * Since we have back pointer, no recursion necessary.
1721  */
1722 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1723 {
1724 	do {
1725 		t_key idx;
1726 
1727 		if (c)
1728 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1729 		else
1730 			idx = 0;
1731 
1732 		while (idx < 1u << p->bits) {
1733 			c = tnode_get_child_rcu(p, idx++);
1734 			if (!c)
1735 				continue;
1736 
1737 			if (IS_LEAF(c)) {
1738 				prefetch(p->child[idx]);
1739 				return (struct leaf *) c;
1740 			}
1741 
1742 			/* Rescan start scanning in new node */
1743 			p = (struct tnode *) c;
1744 			idx = 0;
1745 		}
1746 
1747 		/* Node empty, walk back up to parent */
1748 		c = (struct node *) p;
1749 	} while ( (p = node_parent_rcu(c)) != NULL);
1750 
1751 	return NULL; /* Root of trie */
1752 }
1753 
1754 static struct leaf *trie_firstleaf(struct trie *t)
1755 {
1756 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1757 
1758 	if (!n)
1759 		return NULL;
1760 
1761 	if (IS_LEAF(n))          /* trie is just a leaf */
1762 		return (struct leaf *) n;
1763 
1764 	return leaf_walk_rcu(n, NULL);
1765 }
1766 
1767 static struct leaf *trie_nextleaf(struct leaf *l)
1768 {
1769 	struct node *c = (struct node *) l;
1770 	struct tnode *p = node_parent_rcu(c);
1771 
1772 	if (!p)
1773 		return NULL;	/* trie with just one leaf */
1774 
1775 	return leaf_walk_rcu(p, c);
1776 }
1777 
1778 static struct leaf *trie_leafindex(struct trie *t, int index)
1779 {
1780 	struct leaf *l = trie_firstleaf(t);
1781 
1782 	while (l && index-- > 0)
1783 		l = trie_nextleaf(l);
1784 
1785 	return l;
1786 }
1787 
1788 
1789 /*
1790  * Caller must hold RTNL.
1791  */
1792 int fib_table_flush(struct fib_table *tb)
1793 {
1794 	struct trie *t = (struct trie *) tb->tb_data;
1795 	struct leaf *l, *ll = NULL;
1796 	int found = 0;
1797 
1798 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1799 		found += trie_flush_leaf(l);
1800 
1801 		if (ll && hlist_empty(&ll->list))
1802 			trie_leaf_remove(t, ll);
1803 		ll = l;
1804 	}
1805 
1806 	if (ll && hlist_empty(&ll->list))
1807 		trie_leaf_remove(t, ll);
1808 
1809 	pr_debug("trie_flush found=%d\n", found);
1810 	return found;
1811 }
1812 
1813 void fib_table_select_default(struct fib_table *tb,
1814 			      const struct flowi *flp,
1815 			      struct fib_result *res)
1816 {
1817 	struct trie *t = (struct trie *) tb->tb_data;
1818 	int order, last_idx;
1819 	struct fib_info *fi = NULL;
1820 	struct fib_info *last_resort;
1821 	struct fib_alias *fa = NULL;
1822 	struct list_head *fa_head;
1823 	struct leaf *l;
1824 
1825 	last_idx = -1;
1826 	last_resort = NULL;
1827 	order = -1;
1828 
1829 	rcu_read_lock();
1830 
1831 	l = fib_find_node(t, 0);
1832 	if (!l)
1833 		goto out;
1834 
1835 	fa_head = get_fa_head(l, 0);
1836 	if (!fa_head)
1837 		goto out;
1838 
1839 	if (list_empty(fa_head))
1840 		goto out;
1841 
1842 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1843 		struct fib_info *next_fi = fa->fa_info;
1844 
1845 		if (fa->fa_scope != res->scope ||
1846 		    fa->fa_type != RTN_UNICAST)
1847 			continue;
1848 
1849 		if (next_fi->fib_priority > res->fi->fib_priority)
1850 			break;
1851 		if (!next_fi->fib_nh[0].nh_gw ||
1852 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1853 			continue;
1854 		fa->fa_state |= FA_S_ACCESSED;
1855 
1856 		if (fi == NULL) {
1857 			if (next_fi != res->fi)
1858 				break;
1859 		} else if (!fib_detect_death(fi, order, &last_resort,
1860 					     &last_idx, tb->tb_default)) {
1861 			fib_result_assign(res, fi);
1862 			tb->tb_default = order;
1863 			goto out;
1864 		}
1865 		fi = next_fi;
1866 		order++;
1867 	}
1868 	if (order <= 0 || fi == NULL) {
1869 		tb->tb_default = -1;
1870 		goto out;
1871 	}
1872 
1873 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1874 				tb->tb_default)) {
1875 		fib_result_assign(res, fi);
1876 		tb->tb_default = order;
1877 		goto out;
1878 	}
1879 	if (last_idx >= 0)
1880 		fib_result_assign(res, last_resort);
1881 	tb->tb_default = last_idx;
1882 out:
1883 	rcu_read_unlock();
1884 }
1885 
1886 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1887 			   struct fib_table *tb,
1888 			   struct sk_buff *skb, struct netlink_callback *cb)
1889 {
1890 	int i, s_i;
1891 	struct fib_alias *fa;
1892 	__be32 xkey = htonl(key);
1893 
1894 	s_i = cb->args[5];
1895 	i = 0;
1896 
1897 	/* rcu_read_lock is hold by caller */
1898 
1899 	list_for_each_entry_rcu(fa, fah, fa_list) {
1900 		if (i < s_i) {
1901 			i++;
1902 			continue;
1903 		}
1904 
1905 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1906 				  cb->nlh->nlmsg_seq,
1907 				  RTM_NEWROUTE,
1908 				  tb->tb_id,
1909 				  fa->fa_type,
1910 				  fa->fa_scope,
1911 				  xkey,
1912 				  plen,
1913 				  fa->fa_tos,
1914 				  fa->fa_info, NLM_F_MULTI) < 0) {
1915 			cb->args[5] = i;
1916 			return -1;
1917 		}
1918 		i++;
1919 	}
1920 	cb->args[5] = i;
1921 	return skb->len;
1922 }
1923 
1924 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1925 			struct sk_buff *skb, struct netlink_callback *cb)
1926 {
1927 	struct leaf_info *li;
1928 	struct hlist_node *node;
1929 	int i, s_i;
1930 
1931 	s_i = cb->args[4];
1932 	i = 0;
1933 
1934 	/* rcu_read_lock is hold by caller */
1935 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1936 		if (i < s_i) {
1937 			i++;
1938 			continue;
1939 		}
1940 
1941 		if (i > s_i)
1942 			cb->args[5] = 0;
1943 
1944 		if (list_empty(&li->falh))
1945 			continue;
1946 
1947 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1948 			cb->args[4] = i;
1949 			return -1;
1950 		}
1951 		i++;
1952 	}
1953 
1954 	cb->args[4] = i;
1955 	return skb->len;
1956 }
1957 
1958 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1959 		   struct netlink_callback *cb)
1960 {
1961 	struct leaf *l;
1962 	struct trie *t = (struct trie *) tb->tb_data;
1963 	t_key key = cb->args[2];
1964 	int count = cb->args[3];
1965 
1966 	rcu_read_lock();
1967 	/* Dump starting at last key.
1968 	 * Note: 0.0.0.0/0 (ie default) is first key.
1969 	 */
1970 	if (count == 0)
1971 		l = trie_firstleaf(t);
1972 	else {
1973 		/* Normally, continue from last key, but if that is missing
1974 		 * fallback to using slow rescan
1975 		 */
1976 		l = fib_find_node(t, key);
1977 		if (!l)
1978 			l = trie_leafindex(t, count);
1979 	}
1980 
1981 	while (l) {
1982 		cb->args[2] = l->key;
1983 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1984 			cb->args[3] = count;
1985 			rcu_read_unlock();
1986 			return -1;
1987 		}
1988 
1989 		++count;
1990 		l = trie_nextleaf(l);
1991 		memset(&cb->args[4], 0,
1992 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1993 	}
1994 	cb->args[3] = count;
1995 	rcu_read_unlock();
1996 
1997 	return skb->len;
1998 }
1999 
2000 void __init fib_hash_init(void)
2001 {
2002 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2003 					  sizeof(struct fib_alias),
2004 					  0, SLAB_PANIC, NULL);
2005 
2006 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2007 					   max(sizeof(struct leaf),
2008 					       sizeof(struct leaf_info)),
2009 					   0, SLAB_PANIC, NULL);
2010 }
2011 
2012 
2013 /* Fix more generic FIB names for init later */
2014 struct fib_table *fib_hash_table(u32 id)
2015 {
2016 	struct fib_table *tb;
2017 	struct trie *t;
2018 
2019 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2020 		     GFP_KERNEL);
2021 	if (tb == NULL)
2022 		return NULL;
2023 
2024 	tb->tb_id = id;
2025 	tb->tb_default = -1;
2026 
2027 	t = (struct trie *) tb->tb_data;
2028 	memset(t, 0, sizeof(*t));
2029 
2030 	if (id == RT_TABLE_LOCAL)
2031 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2032 
2033 	return tb;
2034 }
2035 
2036 #ifdef CONFIG_PROC_FS
2037 /* Depth first Trie walk iterator */
2038 struct fib_trie_iter {
2039 	struct seq_net_private p;
2040 	struct fib_table *tb;
2041 	struct tnode *tnode;
2042 	unsigned index;
2043 	unsigned depth;
2044 };
2045 
2046 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2047 {
2048 	struct tnode *tn = iter->tnode;
2049 	unsigned cindex = iter->index;
2050 	struct tnode *p;
2051 
2052 	/* A single entry routing table */
2053 	if (!tn)
2054 		return NULL;
2055 
2056 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2057 		 iter->tnode, iter->index, iter->depth);
2058 rescan:
2059 	while (cindex < (1<<tn->bits)) {
2060 		struct node *n = tnode_get_child_rcu(tn, cindex);
2061 
2062 		if (n) {
2063 			if (IS_LEAF(n)) {
2064 				iter->tnode = tn;
2065 				iter->index = cindex + 1;
2066 			} else {
2067 				/* push down one level */
2068 				iter->tnode = (struct tnode *) n;
2069 				iter->index = 0;
2070 				++iter->depth;
2071 			}
2072 			return n;
2073 		}
2074 
2075 		++cindex;
2076 	}
2077 
2078 	/* Current node exhausted, pop back up */
2079 	p = node_parent_rcu((struct node *)tn);
2080 	if (p) {
2081 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2082 		tn = p;
2083 		--iter->depth;
2084 		goto rescan;
2085 	}
2086 
2087 	/* got root? */
2088 	return NULL;
2089 }
2090 
2091 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2092 				       struct trie *t)
2093 {
2094 	struct node *n;
2095 
2096 	if (!t)
2097 		return NULL;
2098 
2099 	n = rcu_dereference(t->trie);
2100 	if (!n)
2101 		return NULL;
2102 
2103 	if (IS_TNODE(n)) {
2104 		iter->tnode = (struct tnode *) n;
2105 		iter->index = 0;
2106 		iter->depth = 1;
2107 	} else {
2108 		iter->tnode = NULL;
2109 		iter->index = 0;
2110 		iter->depth = 0;
2111 	}
2112 
2113 	return n;
2114 }
2115 
2116 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2117 {
2118 	struct node *n;
2119 	struct fib_trie_iter iter;
2120 
2121 	memset(s, 0, sizeof(*s));
2122 
2123 	rcu_read_lock();
2124 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2125 		if (IS_LEAF(n)) {
2126 			struct leaf *l = (struct leaf *)n;
2127 			struct leaf_info *li;
2128 			struct hlist_node *tmp;
2129 
2130 			s->leaves++;
2131 			s->totdepth += iter.depth;
2132 			if (iter.depth > s->maxdepth)
2133 				s->maxdepth = iter.depth;
2134 
2135 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2136 				++s->prefixes;
2137 		} else {
2138 			const struct tnode *tn = (const struct tnode *) n;
2139 			int i;
2140 
2141 			s->tnodes++;
2142 			if (tn->bits < MAX_STAT_DEPTH)
2143 				s->nodesizes[tn->bits]++;
2144 
2145 			for (i = 0; i < (1<<tn->bits); i++)
2146 				if (!tn->child[i])
2147 					s->nullpointers++;
2148 		}
2149 	}
2150 	rcu_read_unlock();
2151 }
2152 
2153 /*
2154  *	This outputs /proc/net/fib_triestats
2155  */
2156 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2157 {
2158 	unsigned i, max, pointers, bytes, avdepth;
2159 
2160 	if (stat->leaves)
2161 		avdepth = stat->totdepth*100 / stat->leaves;
2162 	else
2163 		avdepth = 0;
2164 
2165 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2166 		   avdepth / 100, avdepth % 100);
2167 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2168 
2169 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2170 	bytes = sizeof(struct leaf) * stat->leaves;
2171 
2172 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2173 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2174 
2175 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2176 	bytes += sizeof(struct tnode) * stat->tnodes;
2177 
2178 	max = MAX_STAT_DEPTH;
2179 	while (max > 0 && stat->nodesizes[max-1] == 0)
2180 		max--;
2181 
2182 	pointers = 0;
2183 	for (i = 1; i <= max; i++)
2184 		if (stat->nodesizes[i] != 0) {
2185 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2186 			pointers += (1<<i) * stat->nodesizes[i];
2187 		}
2188 	seq_putc(seq, '\n');
2189 	seq_printf(seq, "\tPointers: %u\n", pointers);
2190 
2191 	bytes += sizeof(struct node *) * pointers;
2192 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2193 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2194 }
2195 
2196 #ifdef CONFIG_IP_FIB_TRIE_STATS
2197 static void trie_show_usage(struct seq_file *seq,
2198 			    const struct trie_use_stats *stats)
2199 {
2200 	seq_printf(seq, "\nCounters:\n---------\n");
2201 	seq_printf(seq, "gets = %u\n", stats->gets);
2202 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2203 	seq_printf(seq, "semantic match passed = %u\n",
2204 		   stats->semantic_match_passed);
2205 	seq_printf(seq, "semantic match miss = %u\n",
2206 		   stats->semantic_match_miss);
2207 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2208 	seq_printf(seq, "skipped node resize = %u\n\n",
2209 		   stats->resize_node_skipped);
2210 }
2211 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2212 
2213 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2214 {
2215 	if (tb->tb_id == RT_TABLE_LOCAL)
2216 		seq_puts(seq, "Local:\n");
2217 	else if (tb->tb_id == RT_TABLE_MAIN)
2218 		seq_puts(seq, "Main:\n");
2219 	else
2220 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2221 }
2222 
2223 
2224 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2225 {
2226 	struct net *net = (struct net *)seq->private;
2227 	unsigned int h;
2228 
2229 	seq_printf(seq,
2230 		   "Basic info: size of leaf:"
2231 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2232 		   sizeof(struct leaf), sizeof(struct tnode));
2233 
2234 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2235 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2236 		struct hlist_node *node;
2237 		struct fib_table *tb;
2238 
2239 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2240 			struct trie *t = (struct trie *) tb->tb_data;
2241 			struct trie_stat stat;
2242 
2243 			if (!t)
2244 				continue;
2245 
2246 			fib_table_print(seq, tb);
2247 
2248 			trie_collect_stats(t, &stat);
2249 			trie_show_stats(seq, &stat);
2250 #ifdef CONFIG_IP_FIB_TRIE_STATS
2251 			trie_show_usage(seq, &t->stats);
2252 #endif
2253 		}
2254 	}
2255 
2256 	return 0;
2257 }
2258 
2259 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2260 {
2261 	return single_open_net(inode, file, fib_triestat_seq_show);
2262 }
2263 
2264 static const struct file_operations fib_triestat_fops = {
2265 	.owner	= THIS_MODULE,
2266 	.open	= fib_triestat_seq_open,
2267 	.read	= seq_read,
2268 	.llseek	= seq_lseek,
2269 	.release = single_release_net,
2270 };
2271 
2272 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2273 {
2274 	struct fib_trie_iter *iter = seq->private;
2275 	struct net *net = seq_file_net(seq);
2276 	loff_t idx = 0;
2277 	unsigned int h;
2278 
2279 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2280 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2281 		struct hlist_node *node;
2282 		struct fib_table *tb;
2283 
2284 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2285 			struct node *n;
2286 
2287 			for (n = fib_trie_get_first(iter,
2288 						    (struct trie *) tb->tb_data);
2289 			     n; n = fib_trie_get_next(iter))
2290 				if (pos == idx++) {
2291 					iter->tb = tb;
2292 					return n;
2293 				}
2294 		}
2295 	}
2296 
2297 	return NULL;
2298 }
2299 
2300 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2301 	__acquires(RCU)
2302 {
2303 	rcu_read_lock();
2304 	return fib_trie_get_idx(seq, *pos);
2305 }
2306 
2307 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308 {
2309 	struct fib_trie_iter *iter = seq->private;
2310 	struct net *net = seq_file_net(seq);
2311 	struct fib_table *tb = iter->tb;
2312 	struct hlist_node *tb_node;
2313 	unsigned int h;
2314 	struct node *n;
2315 
2316 	++*pos;
2317 	/* next node in same table */
2318 	n = fib_trie_get_next(iter);
2319 	if (n)
2320 		return n;
2321 
2322 	/* walk rest of this hash chain */
2323 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2324 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2325 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2326 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2327 		if (n)
2328 			goto found;
2329 	}
2330 
2331 	/* new hash chain */
2332 	while (++h < FIB_TABLE_HASHSZ) {
2333 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2334 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2335 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2336 			if (n)
2337 				goto found;
2338 		}
2339 	}
2340 	return NULL;
2341 
2342 found:
2343 	iter->tb = tb;
2344 	return n;
2345 }
2346 
2347 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2348 	__releases(RCU)
2349 {
2350 	rcu_read_unlock();
2351 }
2352 
2353 static void seq_indent(struct seq_file *seq, int n)
2354 {
2355 	while (n-- > 0) seq_puts(seq, "   ");
2356 }
2357 
2358 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2359 {
2360 	switch (s) {
2361 	case RT_SCOPE_UNIVERSE: return "universe";
2362 	case RT_SCOPE_SITE:	return "site";
2363 	case RT_SCOPE_LINK:	return "link";
2364 	case RT_SCOPE_HOST:	return "host";
2365 	case RT_SCOPE_NOWHERE:	return "nowhere";
2366 	default:
2367 		snprintf(buf, len, "scope=%d", s);
2368 		return buf;
2369 	}
2370 }
2371 
2372 static const char *const rtn_type_names[__RTN_MAX] = {
2373 	[RTN_UNSPEC] = "UNSPEC",
2374 	[RTN_UNICAST] = "UNICAST",
2375 	[RTN_LOCAL] = "LOCAL",
2376 	[RTN_BROADCAST] = "BROADCAST",
2377 	[RTN_ANYCAST] = "ANYCAST",
2378 	[RTN_MULTICAST] = "MULTICAST",
2379 	[RTN_BLACKHOLE] = "BLACKHOLE",
2380 	[RTN_UNREACHABLE] = "UNREACHABLE",
2381 	[RTN_PROHIBIT] = "PROHIBIT",
2382 	[RTN_THROW] = "THROW",
2383 	[RTN_NAT] = "NAT",
2384 	[RTN_XRESOLVE] = "XRESOLVE",
2385 };
2386 
2387 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2388 {
2389 	if (t < __RTN_MAX && rtn_type_names[t])
2390 		return rtn_type_names[t];
2391 	snprintf(buf, len, "type %u", t);
2392 	return buf;
2393 }
2394 
2395 /* Pretty print the trie */
2396 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2397 {
2398 	const struct fib_trie_iter *iter = seq->private;
2399 	struct node *n = v;
2400 
2401 	if (!node_parent_rcu(n))
2402 		fib_table_print(seq, iter->tb);
2403 
2404 	if (IS_TNODE(n)) {
2405 		struct tnode *tn = (struct tnode *) n;
2406 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2407 
2408 		seq_indent(seq, iter->depth-1);
2409 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2410 			   &prf, tn->pos, tn->bits, tn->full_children,
2411 			   tn->empty_children);
2412 
2413 	} else {
2414 		struct leaf *l = (struct leaf *) n;
2415 		struct leaf_info *li;
2416 		struct hlist_node *node;
2417 		__be32 val = htonl(l->key);
2418 
2419 		seq_indent(seq, iter->depth);
2420 		seq_printf(seq, "  |-- %pI4\n", &val);
2421 
2422 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2423 			struct fib_alias *fa;
2424 
2425 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2426 				char buf1[32], buf2[32];
2427 
2428 				seq_indent(seq, iter->depth+1);
2429 				seq_printf(seq, "  /%d %s %s", li->plen,
2430 					   rtn_scope(buf1, sizeof(buf1),
2431 						     fa->fa_scope),
2432 					   rtn_type(buf2, sizeof(buf2),
2433 						    fa->fa_type));
2434 				if (fa->fa_tos)
2435 					seq_printf(seq, " tos=%d", fa->fa_tos);
2436 				seq_putc(seq, '\n');
2437 			}
2438 		}
2439 	}
2440 
2441 	return 0;
2442 }
2443 
2444 static const struct seq_operations fib_trie_seq_ops = {
2445 	.start  = fib_trie_seq_start,
2446 	.next   = fib_trie_seq_next,
2447 	.stop   = fib_trie_seq_stop,
2448 	.show   = fib_trie_seq_show,
2449 };
2450 
2451 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2452 {
2453 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2454 			    sizeof(struct fib_trie_iter));
2455 }
2456 
2457 static const struct file_operations fib_trie_fops = {
2458 	.owner  = THIS_MODULE,
2459 	.open   = fib_trie_seq_open,
2460 	.read   = seq_read,
2461 	.llseek = seq_lseek,
2462 	.release = seq_release_net,
2463 };
2464 
2465 struct fib_route_iter {
2466 	struct seq_net_private p;
2467 	struct trie *main_trie;
2468 	loff_t	pos;
2469 	t_key	key;
2470 };
2471 
2472 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2473 {
2474 	struct leaf *l = NULL;
2475 	struct trie *t = iter->main_trie;
2476 
2477 	/* use cache location of last found key */
2478 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2479 		pos -= iter->pos;
2480 	else {
2481 		iter->pos = 0;
2482 		l = trie_firstleaf(t);
2483 	}
2484 
2485 	while (l && pos-- > 0) {
2486 		iter->pos++;
2487 		l = trie_nextleaf(l);
2488 	}
2489 
2490 	if (l)
2491 		iter->key = pos;	/* remember it */
2492 	else
2493 		iter->pos = 0;		/* forget it */
2494 
2495 	return l;
2496 }
2497 
2498 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2499 	__acquires(RCU)
2500 {
2501 	struct fib_route_iter *iter = seq->private;
2502 	struct fib_table *tb;
2503 
2504 	rcu_read_lock();
2505 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2506 	if (!tb)
2507 		return NULL;
2508 
2509 	iter->main_trie = (struct trie *) tb->tb_data;
2510 	if (*pos == 0)
2511 		return SEQ_START_TOKEN;
2512 	else
2513 		return fib_route_get_idx(iter, *pos - 1);
2514 }
2515 
2516 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2517 {
2518 	struct fib_route_iter *iter = seq->private;
2519 	struct leaf *l = v;
2520 
2521 	++*pos;
2522 	if (v == SEQ_START_TOKEN) {
2523 		iter->pos = 0;
2524 		l = trie_firstleaf(iter->main_trie);
2525 	} else {
2526 		iter->pos++;
2527 		l = trie_nextleaf(l);
2528 	}
2529 
2530 	if (l)
2531 		iter->key = l->key;
2532 	else
2533 		iter->pos = 0;
2534 	return l;
2535 }
2536 
2537 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2538 	__releases(RCU)
2539 {
2540 	rcu_read_unlock();
2541 }
2542 
2543 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2544 {
2545 	static unsigned type2flags[RTN_MAX + 1] = {
2546 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2547 	};
2548 	unsigned flags = type2flags[type];
2549 
2550 	if (fi && fi->fib_nh->nh_gw)
2551 		flags |= RTF_GATEWAY;
2552 	if (mask == htonl(0xFFFFFFFF))
2553 		flags |= RTF_HOST;
2554 	flags |= RTF_UP;
2555 	return flags;
2556 }
2557 
2558 /*
2559  *	This outputs /proc/net/route.
2560  *	The format of the file is not supposed to be changed
2561  * 	and needs to be same as fib_hash output to avoid breaking
2562  *	legacy utilities
2563  */
2564 static int fib_route_seq_show(struct seq_file *seq, void *v)
2565 {
2566 	struct leaf *l = v;
2567 	struct leaf_info *li;
2568 	struct hlist_node *node;
2569 
2570 	if (v == SEQ_START_TOKEN) {
2571 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2572 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2573 			   "\tWindow\tIRTT");
2574 		return 0;
2575 	}
2576 
2577 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2578 		struct fib_alias *fa;
2579 		__be32 mask, prefix;
2580 
2581 		mask = inet_make_mask(li->plen);
2582 		prefix = htonl(l->key);
2583 
2584 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2585 			const struct fib_info *fi = fa->fa_info;
2586 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2587 			int len;
2588 
2589 			if (fa->fa_type == RTN_BROADCAST
2590 			    || fa->fa_type == RTN_MULTICAST)
2591 				continue;
2592 
2593 			if (fi)
2594 				seq_printf(seq,
2595 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2596 					 "%d\t%08X\t%d\t%u\t%u%n",
2597 					 fi->fib_dev ? fi->fib_dev->name : "*",
2598 					 prefix,
2599 					 fi->fib_nh->nh_gw, flags, 0, 0,
2600 					 fi->fib_priority,
2601 					 mask,
2602 					 (fi->fib_advmss ?
2603 					  fi->fib_advmss + 40 : 0),
2604 					 fi->fib_window,
2605 					 fi->fib_rtt >> 3, &len);
2606 			else
2607 				seq_printf(seq,
2608 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2609 					 "%d\t%08X\t%d\t%u\t%u%n",
2610 					 prefix, 0, flags, 0, 0, 0,
2611 					 mask, 0, 0, 0, &len);
2612 
2613 			seq_printf(seq, "%*s\n", 127 - len, "");
2614 		}
2615 	}
2616 
2617 	return 0;
2618 }
2619 
2620 static const struct seq_operations fib_route_seq_ops = {
2621 	.start  = fib_route_seq_start,
2622 	.next   = fib_route_seq_next,
2623 	.stop   = fib_route_seq_stop,
2624 	.show   = fib_route_seq_show,
2625 };
2626 
2627 static int fib_route_seq_open(struct inode *inode, struct file *file)
2628 {
2629 	return seq_open_net(inode, file, &fib_route_seq_ops,
2630 			    sizeof(struct fib_route_iter));
2631 }
2632 
2633 static const struct file_operations fib_route_fops = {
2634 	.owner  = THIS_MODULE,
2635 	.open   = fib_route_seq_open,
2636 	.read   = seq_read,
2637 	.llseek = seq_lseek,
2638 	.release = seq_release_net,
2639 };
2640 
2641 int __net_init fib_proc_init(struct net *net)
2642 {
2643 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2644 		goto out1;
2645 
2646 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2647 				  &fib_triestat_fops))
2648 		goto out2;
2649 
2650 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2651 		goto out3;
2652 
2653 	return 0;
2654 
2655 out3:
2656 	proc_net_remove(net, "fib_triestat");
2657 out2:
2658 	proc_net_remove(net, "fib_trie");
2659 out1:
2660 	return -ENOMEM;
2661 }
2662 
2663 void __net_exit fib_proc_exit(struct net *net)
2664 {
2665 	proc_net_remove(net, "fib_trie");
2666 	proc_net_remove(net, "fib_triestat");
2667 	proc_net_remove(net, "route");
2668 }
2669 
2670 #endif /* CONFIG_PROC_FS */
2671