xref: /linux/net/ipv4/fib_trie.c (revision 6000fc4d6f3e55ad52cce8d76317187fe01af2aa)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.408"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
82 
83 #define MAX_STAT_DEPTH 32
84 
85 #define KEYLENGTH (8*sizeof(t_key))
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93 
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
96 
97 struct node {
98 	unsigned long parent;
99 	t_key key;
100 };
101 
102 struct leaf {
103 	unsigned long parent;
104 	t_key key;
105 	struct hlist_head list;
106 	struct rcu_head rcu;
107 };
108 
109 struct leaf_info {
110 	struct hlist_node hlist;
111 	struct rcu_head rcu;
112 	int plen;
113 	struct list_head falh;
114 };
115 
116 struct tnode {
117 	unsigned long parent;
118 	t_key key;
119 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
120 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned int full_children;	/* KEYLENGTH bits needed */
122 	unsigned int empty_children;	/* KEYLENGTH bits needed */
123 	union {
124 		struct rcu_head rcu;
125 		struct work_struct work;
126 		struct tnode *tnode_free;
127 	};
128 	struct node *child[0];
129 };
130 
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
133 	unsigned int gets;
134 	unsigned int backtrack;
135 	unsigned int semantic_match_passed;
136 	unsigned int semantic_match_miss;
137 	unsigned int null_node_hit;
138 	unsigned int resize_node_skipped;
139 };
140 #endif
141 
142 struct trie_stat {
143 	unsigned int totdepth;
144 	unsigned int maxdepth;
145 	unsigned int tnodes;
146 	unsigned int leaves;
147 	unsigned int nullpointers;
148 	unsigned int prefixes;
149 	unsigned int nodesizes[MAX_STAT_DEPTH];
150 };
151 
152 struct trie {
153 	struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 	struct trie_use_stats stats;
156 #endif
157 };
158 
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 				  int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 
168 static struct kmem_cache *fn_alias_kmem __read_mostly;
169 static struct kmem_cache *trie_leaf_kmem __read_mostly;
170 
171 static inline struct tnode *node_parent(struct node *node)
172 {
173 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174 }
175 
176 static inline struct tnode *node_parent_rcu(struct node *node)
177 {
178 	struct tnode *ret = node_parent(node);
179 
180 	return rcu_dereference(ret);
181 }
182 
183 /* Same as rcu_assign_pointer
184  * but that macro() assumes that value is a pointer.
185  */
186 static inline void node_set_parent(struct node *node, struct tnode *ptr)
187 {
188 	smp_wmb();
189 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
190 }
191 
192 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
193 {
194 	BUG_ON(i >= 1U << tn->bits);
195 
196 	return tn->child[i];
197 }
198 
199 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
200 {
201 	struct node *ret = tnode_get_child(tn, i);
202 
203 	return rcu_dereference(ret);
204 }
205 
206 static inline int tnode_child_length(const struct tnode *tn)
207 {
208 	return 1 << tn->bits;
209 }
210 
211 static inline t_key mask_pfx(t_key k, unsigned short l)
212 {
213 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
214 }
215 
216 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
217 {
218 	if (offset < KEYLENGTH)
219 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
220 	else
221 		return 0;
222 }
223 
224 static inline int tkey_equals(t_key a, t_key b)
225 {
226 	return a == b;
227 }
228 
229 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
230 {
231 	if (bits == 0 || offset >= KEYLENGTH)
232 		return 1;
233 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
234 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
235 }
236 
237 static inline int tkey_mismatch(t_key a, int offset, t_key b)
238 {
239 	t_key diff = a ^ b;
240 	int i = offset;
241 
242 	if (!diff)
243 		return 0;
244 	while ((diff << i) >> (KEYLENGTH-1) == 0)
245 		i++;
246 	return i;
247 }
248 
249 /*
250   To understand this stuff, an understanding of keys and all their bits is
251   necessary. Every node in the trie has a key associated with it, but not
252   all of the bits in that key are significant.
253 
254   Consider a node 'n' and its parent 'tp'.
255 
256   If n is a leaf, every bit in its key is significant. Its presence is
257   necessitated by path compression, since during a tree traversal (when
258   searching for a leaf - unless we are doing an insertion) we will completely
259   ignore all skipped bits we encounter. Thus we need to verify, at the end of
260   a potentially successful search, that we have indeed been walking the
261   correct key path.
262 
263   Note that we can never "miss" the correct key in the tree if present by
264   following the wrong path. Path compression ensures that segments of the key
265   that are the same for all keys with a given prefix are skipped, but the
266   skipped part *is* identical for each node in the subtrie below the skipped
267   bit! trie_insert() in this implementation takes care of that - note the
268   call to tkey_sub_equals() in trie_insert().
269 
270   if n is an internal node - a 'tnode' here, the various parts of its key
271   have many different meanings.
272 
273   Example:
274   _________________________________________________________________
275   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
276   -----------------------------------------------------------------
277     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
278 
279   _________________________________________________________________
280   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
281   -----------------------------------------------------------------
282    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
283 
284   tp->pos = 7
285   tp->bits = 3
286   n->pos = 15
287   n->bits = 4
288 
289   First, let's just ignore the bits that come before the parent tp, that is
290   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
291   not use them for anything.
292 
293   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
294   index into the parent's child array. That is, they will be used to find
295   'n' among tp's children.
296 
297   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
298   for the node n.
299 
300   All the bits we have seen so far are significant to the node n. The rest
301   of the bits are really not needed or indeed known in n->key.
302 
303   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
304   n's child array, and will of course be different for each child.
305 
306 
307   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
308   at this point.
309 
310 */
311 
312 static inline void check_tnode(const struct tnode *tn)
313 {
314 	WARN_ON(tn && tn->pos+tn->bits > 32);
315 }
316 
317 static const int halve_threshold = 25;
318 static const int inflate_threshold = 50;
319 static const int halve_threshold_root = 15;
320 static const int inflate_threshold_root = 25;
321 
322 
323 static void __alias_free_mem(struct rcu_head *head)
324 {
325 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
326 	kmem_cache_free(fn_alias_kmem, fa);
327 }
328 
329 static inline void alias_free_mem_rcu(struct fib_alias *fa)
330 {
331 	call_rcu(&fa->rcu, __alias_free_mem);
332 }
333 
334 static void __leaf_free_rcu(struct rcu_head *head)
335 {
336 	struct leaf *l = container_of(head, struct leaf, rcu);
337 	kmem_cache_free(trie_leaf_kmem, l);
338 }
339 
340 static inline void free_leaf(struct leaf *l)
341 {
342 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
343 }
344 
345 static void __leaf_info_free_rcu(struct rcu_head *head)
346 {
347 	kfree(container_of(head, struct leaf_info, rcu));
348 }
349 
350 static inline void free_leaf_info(struct leaf_info *leaf)
351 {
352 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
353 }
354 
355 static struct tnode *tnode_alloc(size_t size)
356 {
357 	if (size <= PAGE_SIZE)
358 		return kzalloc(size, GFP_KERNEL);
359 	else
360 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
361 }
362 
363 static void __tnode_vfree(struct work_struct *arg)
364 {
365 	struct tnode *tn = container_of(arg, struct tnode, work);
366 	vfree(tn);
367 }
368 
369 static void __tnode_free_rcu(struct rcu_head *head)
370 {
371 	struct tnode *tn = container_of(head, struct tnode, rcu);
372 	size_t size = sizeof(struct tnode) +
373 		      (sizeof(struct node *) << tn->bits);
374 
375 	if (size <= PAGE_SIZE)
376 		kfree(tn);
377 	else {
378 		INIT_WORK(&tn->work, __tnode_vfree);
379 		schedule_work(&tn->work);
380 	}
381 }
382 
383 static inline void tnode_free(struct tnode *tn)
384 {
385 	if (IS_LEAF(tn))
386 		free_leaf((struct leaf *) tn);
387 	else
388 		call_rcu(&tn->rcu, __tnode_free_rcu);
389 }
390 
391 static void tnode_free_safe(struct tnode *tn)
392 {
393 	BUG_ON(IS_LEAF(tn));
394 	tn->tnode_free = tnode_free_head;
395 	tnode_free_head = tn;
396 }
397 
398 static void tnode_free_flush(void)
399 {
400 	struct tnode *tn;
401 
402 	while ((tn = tnode_free_head)) {
403 		tnode_free_head = tn->tnode_free;
404 		tn->tnode_free = NULL;
405 		tnode_free(tn);
406 	}
407 }
408 
409 static struct leaf *leaf_new(void)
410 {
411 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
412 	if (l) {
413 		l->parent = T_LEAF;
414 		INIT_HLIST_HEAD(&l->list);
415 	}
416 	return l;
417 }
418 
419 static struct leaf_info *leaf_info_new(int plen)
420 {
421 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
422 	if (li) {
423 		li->plen = plen;
424 		INIT_LIST_HEAD(&li->falh);
425 	}
426 	return li;
427 }
428 
429 static struct tnode *tnode_new(t_key key, int pos, int bits)
430 {
431 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
432 	struct tnode *tn = tnode_alloc(sz);
433 
434 	if (tn) {
435 		tn->parent = T_TNODE;
436 		tn->pos = pos;
437 		tn->bits = bits;
438 		tn->key = key;
439 		tn->full_children = 0;
440 		tn->empty_children = 1<<bits;
441 	}
442 
443 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
444 		 (unsigned long) (sizeof(struct node) << bits));
445 	return tn;
446 }
447 
448 /*
449  * Check whether a tnode 'n' is "full", i.e. it is an internal node
450  * and no bits are skipped. See discussion in dyntree paper p. 6
451  */
452 
453 static inline int tnode_full(const struct tnode *tn, const struct node *n)
454 {
455 	if (n == NULL || IS_LEAF(n))
456 		return 0;
457 
458 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
459 }
460 
461 static inline void put_child(struct trie *t, struct tnode *tn, int i,
462 			     struct node *n)
463 {
464 	tnode_put_child_reorg(tn, i, n, -1);
465 }
466 
467  /*
468   * Add a child at position i overwriting the old value.
469   * Update the value of full_children and empty_children.
470   */
471 
472 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
473 				  int wasfull)
474 {
475 	struct node *chi = tn->child[i];
476 	int isfull;
477 
478 	BUG_ON(i >= 1<<tn->bits);
479 
480 	/* update emptyChildren */
481 	if (n == NULL && chi != NULL)
482 		tn->empty_children++;
483 	else if (n != NULL && chi == NULL)
484 		tn->empty_children--;
485 
486 	/* update fullChildren */
487 	if (wasfull == -1)
488 		wasfull = tnode_full(tn, chi);
489 
490 	isfull = tnode_full(tn, n);
491 	if (wasfull && !isfull)
492 		tn->full_children--;
493 	else if (!wasfull && isfull)
494 		tn->full_children++;
495 
496 	if (n)
497 		node_set_parent(n, tn);
498 
499 	rcu_assign_pointer(tn->child[i], n);
500 }
501 
502 static struct node *resize(struct trie *t, struct tnode *tn)
503 {
504 	int i;
505 	int err = 0;
506 	struct tnode *old_tn;
507 	int inflate_threshold_use;
508 	int halve_threshold_use;
509 	int max_resize;
510 
511 	if (!tn)
512 		return NULL;
513 
514 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
515 		 tn, inflate_threshold, halve_threshold);
516 
517 	/* No children */
518 	if (tn->empty_children == tnode_child_length(tn)) {
519 		tnode_free_safe(tn);
520 		return NULL;
521 	}
522 	/* One child */
523 	if (tn->empty_children == tnode_child_length(tn) - 1)
524 		for (i = 0; i < tnode_child_length(tn); i++) {
525 			struct node *n;
526 
527 			n = tn->child[i];
528 			if (!n)
529 				continue;
530 
531 			/* compress one level */
532 			node_set_parent(n, NULL);
533 			tnode_free_safe(tn);
534 			return n;
535 		}
536 	/*
537 	 * Double as long as the resulting node has a number of
538 	 * nonempty nodes that are above the threshold.
539 	 */
540 
541 	/*
542 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
543 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
544 	 * Telecommunications, page 6:
545 	 * "A node is doubled if the ratio of non-empty children to all
546 	 * children in the *doubled* node is at least 'high'."
547 	 *
548 	 * 'high' in this instance is the variable 'inflate_threshold'. It
549 	 * is expressed as a percentage, so we multiply it with
550 	 * tnode_child_length() and instead of multiplying by 2 (since the
551 	 * child array will be doubled by inflate()) and multiplying
552 	 * the left-hand side by 100 (to handle the percentage thing) we
553 	 * multiply the left-hand side by 50.
554 	 *
555 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
556 	 * - tn->empty_children is of course the number of non-null children
557 	 * in the current node. tn->full_children is the number of "full"
558 	 * children, that is non-null tnodes with a skip value of 0.
559 	 * All of those will be doubled in the resulting inflated tnode, so
560 	 * we just count them one extra time here.
561 	 *
562 	 * A clearer way to write this would be:
563 	 *
564 	 * to_be_doubled = tn->full_children;
565 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
566 	 *     tn->full_children;
567 	 *
568 	 * new_child_length = tnode_child_length(tn) * 2;
569 	 *
570 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
571 	 *      new_child_length;
572 	 * if (new_fill_factor >= inflate_threshold)
573 	 *
574 	 * ...and so on, tho it would mess up the while () loop.
575 	 *
576 	 * anyway,
577 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
578 	 *      inflate_threshold
579 	 *
580 	 * avoid a division:
581 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
582 	 *      inflate_threshold * new_child_length
583 	 *
584 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
585 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
586 	 *    tn->full_children) >= inflate_threshold * new_child_length
587 	 *
588 	 * expand new_child_length:
589 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
590 	 *    tn->full_children) >=
591 	 *      inflate_threshold * tnode_child_length(tn) * 2
592 	 *
593 	 * shorten again:
594 	 * 50 * (tn->full_children + tnode_child_length(tn) -
595 	 *    tn->empty_children) >= inflate_threshold *
596 	 *    tnode_child_length(tn)
597 	 *
598 	 */
599 
600 	check_tnode(tn);
601 
602 	/* Keep root node larger  */
603 
604 	if (!tn->parent)
605 		inflate_threshold_use = inflate_threshold_root;
606 	else
607 		inflate_threshold_use = inflate_threshold;
608 
609 	err = 0;
610 	max_resize = 10;
611 	while ((tn->full_children > 0 &&  max_resize-- &&
612 		50 * (tn->full_children + tnode_child_length(tn)
613 		      - tn->empty_children)
614 		>= inflate_threshold_use * tnode_child_length(tn))) {
615 
616 		old_tn = tn;
617 		tn = inflate(t, tn);
618 
619 		if (IS_ERR(tn)) {
620 			tn = old_tn;
621 #ifdef CONFIG_IP_FIB_TRIE_STATS
622 			t->stats.resize_node_skipped++;
623 #endif
624 			break;
625 		}
626 	}
627 
628 	if (max_resize < 0) {
629 		if (!tn->parent)
630 			pr_warning("Fix inflate_threshold_root."
631 				   " Now=%d size=%d bits\n",
632 				   inflate_threshold_root, tn->bits);
633 		else
634 			pr_warning("Fix inflate_threshold."
635 				   " Now=%d size=%d bits\n",
636 				   inflate_threshold, tn->bits);
637 	}
638 
639 	check_tnode(tn);
640 
641 	/*
642 	 * Halve as long as the number of empty children in this
643 	 * node is above threshold.
644 	 */
645 
646 
647 	/* Keep root node larger  */
648 
649 	if (!tn->parent)
650 		halve_threshold_use = halve_threshold_root;
651 	else
652 		halve_threshold_use = halve_threshold;
653 
654 	err = 0;
655 	max_resize = 10;
656 	while (tn->bits > 1 &&  max_resize-- &&
657 	       100 * (tnode_child_length(tn) - tn->empty_children) <
658 	       halve_threshold_use * tnode_child_length(tn)) {
659 
660 		old_tn = tn;
661 		tn = halve(t, tn);
662 		if (IS_ERR(tn)) {
663 			tn = old_tn;
664 #ifdef CONFIG_IP_FIB_TRIE_STATS
665 			t->stats.resize_node_skipped++;
666 #endif
667 			break;
668 		}
669 	}
670 
671 	if (max_resize < 0) {
672 		if (!tn->parent)
673 			pr_warning("Fix halve_threshold_root."
674 				   " Now=%d size=%d bits\n",
675 				   halve_threshold_root, tn->bits);
676 		else
677 			pr_warning("Fix halve_threshold."
678 				   " Now=%d size=%d bits\n",
679 				   halve_threshold, tn->bits);
680 	}
681 
682 	/* Only one child remains */
683 	if (tn->empty_children == tnode_child_length(tn) - 1)
684 		for (i = 0; i < tnode_child_length(tn); i++) {
685 			struct node *n;
686 
687 			n = tn->child[i];
688 			if (!n)
689 				continue;
690 
691 			/* compress one level */
692 
693 			node_set_parent(n, NULL);
694 			tnode_free_safe(tn);
695 			return n;
696 		}
697 
698 	return (struct node *) tn;
699 }
700 
701 static struct tnode *inflate(struct trie *t, struct tnode *tn)
702 {
703 	struct tnode *oldtnode = tn;
704 	int olen = tnode_child_length(tn);
705 	int i;
706 
707 	pr_debug("In inflate\n");
708 
709 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
710 
711 	if (!tn)
712 		return ERR_PTR(-ENOMEM);
713 
714 	/*
715 	 * Preallocate and store tnodes before the actual work so we
716 	 * don't get into an inconsistent state if memory allocation
717 	 * fails. In case of failure we return the oldnode and  inflate
718 	 * of tnode is ignored.
719 	 */
720 
721 	for (i = 0; i < olen; i++) {
722 		struct tnode *inode;
723 
724 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
725 		if (inode &&
726 		    IS_TNODE(inode) &&
727 		    inode->pos == oldtnode->pos + oldtnode->bits &&
728 		    inode->bits > 1) {
729 			struct tnode *left, *right;
730 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
731 
732 			left = tnode_new(inode->key&(~m), inode->pos + 1,
733 					 inode->bits - 1);
734 			if (!left)
735 				goto nomem;
736 
737 			right = tnode_new(inode->key|m, inode->pos + 1,
738 					  inode->bits - 1);
739 
740 			if (!right) {
741 				tnode_free(left);
742 				goto nomem;
743 			}
744 
745 			put_child(t, tn, 2*i, (struct node *) left);
746 			put_child(t, tn, 2*i+1, (struct node *) right);
747 		}
748 	}
749 
750 	for (i = 0; i < olen; i++) {
751 		struct tnode *inode;
752 		struct node *node = tnode_get_child(oldtnode, i);
753 		struct tnode *left, *right;
754 		int size, j;
755 
756 		/* An empty child */
757 		if (node == NULL)
758 			continue;
759 
760 		/* A leaf or an internal node with skipped bits */
761 
762 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
763 		   tn->pos + tn->bits - 1) {
764 			if (tkey_extract_bits(node->key,
765 					      oldtnode->pos + oldtnode->bits,
766 					      1) == 0)
767 				put_child(t, tn, 2*i, node);
768 			else
769 				put_child(t, tn, 2*i+1, node);
770 			continue;
771 		}
772 
773 		/* An internal node with two children */
774 		inode = (struct tnode *) node;
775 
776 		if (inode->bits == 1) {
777 			put_child(t, tn, 2*i, inode->child[0]);
778 			put_child(t, tn, 2*i+1, inode->child[1]);
779 
780 			tnode_free_safe(inode);
781 			continue;
782 		}
783 
784 		/* An internal node with more than two children */
785 
786 		/* We will replace this node 'inode' with two new
787 		 * ones, 'left' and 'right', each with half of the
788 		 * original children. The two new nodes will have
789 		 * a position one bit further down the key and this
790 		 * means that the "significant" part of their keys
791 		 * (see the discussion near the top of this file)
792 		 * will differ by one bit, which will be "0" in
793 		 * left's key and "1" in right's key. Since we are
794 		 * moving the key position by one step, the bit that
795 		 * we are moving away from - the bit at position
796 		 * (inode->pos) - is the one that will differ between
797 		 * left and right. So... we synthesize that bit in the
798 		 * two  new keys.
799 		 * The mask 'm' below will be a single "one" bit at
800 		 * the position (inode->pos)
801 		 */
802 
803 		/* Use the old key, but set the new significant
804 		 *   bit to zero.
805 		 */
806 
807 		left = (struct tnode *) tnode_get_child(tn, 2*i);
808 		put_child(t, tn, 2*i, NULL);
809 
810 		BUG_ON(!left);
811 
812 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
813 		put_child(t, tn, 2*i+1, NULL);
814 
815 		BUG_ON(!right);
816 
817 		size = tnode_child_length(left);
818 		for (j = 0; j < size; j++) {
819 			put_child(t, left, j, inode->child[j]);
820 			put_child(t, right, j, inode->child[j + size]);
821 		}
822 		put_child(t, tn, 2*i, resize(t, left));
823 		put_child(t, tn, 2*i+1, resize(t, right));
824 
825 		tnode_free_safe(inode);
826 	}
827 	tnode_free_safe(oldtnode);
828 	return tn;
829 nomem:
830 	{
831 		int size = tnode_child_length(tn);
832 		int j;
833 
834 		for (j = 0; j < size; j++)
835 			if (tn->child[j])
836 				tnode_free((struct tnode *)tn->child[j]);
837 
838 		tnode_free(tn);
839 
840 		return ERR_PTR(-ENOMEM);
841 	}
842 }
843 
844 static struct tnode *halve(struct trie *t, struct tnode *tn)
845 {
846 	struct tnode *oldtnode = tn;
847 	struct node *left, *right;
848 	int i;
849 	int olen = tnode_child_length(tn);
850 
851 	pr_debug("In halve\n");
852 
853 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
854 
855 	if (!tn)
856 		return ERR_PTR(-ENOMEM);
857 
858 	/*
859 	 * Preallocate and store tnodes before the actual work so we
860 	 * don't get into an inconsistent state if memory allocation
861 	 * fails. In case of failure we return the oldnode and halve
862 	 * of tnode is ignored.
863 	 */
864 
865 	for (i = 0; i < olen; i += 2) {
866 		left = tnode_get_child(oldtnode, i);
867 		right = tnode_get_child(oldtnode, i+1);
868 
869 		/* Two nonempty children */
870 		if (left && right) {
871 			struct tnode *newn;
872 
873 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
874 
875 			if (!newn)
876 				goto nomem;
877 
878 			put_child(t, tn, i/2, (struct node *)newn);
879 		}
880 
881 	}
882 
883 	for (i = 0; i < olen; i += 2) {
884 		struct tnode *newBinNode;
885 
886 		left = tnode_get_child(oldtnode, i);
887 		right = tnode_get_child(oldtnode, i+1);
888 
889 		/* At least one of the children is empty */
890 		if (left == NULL) {
891 			if (right == NULL)    /* Both are empty */
892 				continue;
893 			put_child(t, tn, i/2, right);
894 			continue;
895 		}
896 
897 		if (right == NULL) {
898 			put_child(t, tn, i/2, left);
899 			continue;
900 		}
901 
902 		/* Two nonempty children */
903 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
904 		put_child(t, tn, i/2, NULL);
905 		put_child(t, newBinNode, 0, left);
906 		put_child(t, newBinNode, 1, right);
907 		put_child(t, tn, i/2, resize(t, newBinNode));
908 	}
909 	tnode_free_safe(oldtnode);
910 	return tn;
911 nomem:
912 	{
913 		int size = tnode_child_length(tn);
914 		int j;
915 
916 		for (j = 0; j < size; j++)
917 			if (tn->child[j])
918 				tnode_free((struct tnode *)tn->child[j]);
919 
920 		tnode_free(tn);
921 
922 		return ERR_PTR(-ENOMEM);
923 	}
924 }
925 
926 /* readside must use rcu_read_lock currently dump routines
927  via get_fa_head and dump */
928 
929 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
930 {
931 	struct hlist_head *head = &l->list;
932 	struct hlist_node *node;
933 	struct leaf_info *li;
934 
935 	hlist_for_each_entry_rcu(li, node, head, hlist)
936 		if (li->plen == plen)
937 			return li;
938 
939 	return NULL;
940 }
941 
942 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
943 {
944 	struct leaf_info *li = find_leaf_info(l, plen);
945 
946 	if (!li)
947 		return NULL;
948 
949 	return &li->falh;
950 }
951 
952 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
953 {
954 	struct leaf_info *li = NULL, *last = NULL;
955 	struct hlist_node *node;
956 
957 	if (hlist_empty(head)) {
958 		hlist_add_head_rcu(&new->hlist, head);
959 	} else {
960 		hlist_for_each_entry(li, node, head, hlist) {
961 			if (new->plen > li->plen)
962 				break;
963 
964 			last = li;
965 		}
966 		if (last)
967 			hlist_add_after_rcu(&last->hlist, &new->hlist);
968 		else
969 			hlist_add_before_rcu(&new->hlist, &li->hlist);
970 	}
971 }
972 
973 /* rcu_read_lock needs to be hold by caller from readside */
974 
975 static struct leaf *
976 fib_find_node(struct trie *t, u32 key)
977 {
978 	int pos;
979 	struct tnode *tn;
980 	struct node *n;
981 
982 	pos = 0;
983 	n = rcu_dereference(t->trie);
984 
985 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
986 		tn = (struct tnode *) n;
987 
988 		check_tnode(tn);
989 
990 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
991 			pos = tn->pos + tn->bits;
992 			n = tnode_get_child_rcu(tn,
993 						tkey_extract_bits(key,
994 								  tn->pos,
995 								  tn->bits));
996 		} else
997 			break;
998 	}
999 	/* Case we have found a leaf. Compare prefixes */
1000 
1001 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1002 		return (struct leaf *)n;
1003 
1004 	return NULL;
1005 }
1006 
1007 static void trie_rebalance(struct trie *t, struct tnode *tn)
1008 {
1009 	int wasfull;
1010 	t_key cindex, key;
1011 	struct tnode *tp;
1012 
1013 	key = tn->key;
1014 
1015 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1016 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1017 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1018 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1019 
1020 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1021 				      (struct node *)tn, wasfull);
1022 
1023 		tp = node_parent((struct node *) tn);
1024 		if (!tp)
1025 			rcu_assign_pointer(t->trie, (struct node *)tn);
1026 
1027 		tnode_free_flush();
1028 		if (!tp)
1029 			break;
1030 		tn = tp;
1031 	}
1032 
1033 	/* Handle last (top) tnode */
1034 	if (IS_TNODE(tn))
1035 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1036 
1037 	rcu_assign_pointer(t->trie, (struct node *)tn);
1038 	tnode_free_flush();
1039 
1040 	return;
1041 }
1042 
1043 /* only used from updater-side */
1044 
1045 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1046 {
1047 	int pos, newpos;
1048 	struct tnode *tp = NULL, *tn = NULL;
1049 	struct node *n;
1050 	struct leaf *l;
1051 	int missbit;
1052 	struct list_head *fa_head = NULL;
1053 	struct leaf_info *li;
1054 	t_key cindex;
1055 
1056 	pos = 0;
1057 	n = t->trie;
1058 
1059 	/* If we point to NULL, stop. Either the tree is empty and we should
1060 	 * just put a new leaf in if, or we have reached an empty child slot,
1061 	 * and we should just put our new leaf in that.
1062 	 * If we point to a T_TNODE, check if it matches our key. Note that
1063 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1064 	 * not be the parent's 'pos'+'bits'!
1065 	 *
1066 	 * If it does match the current key, get pos/bits from it, extract
1067 	 * the index from our key, push the T_TNODE and walk the tree.
1068 	 *
1069 	 * If it doesn't, we have to replace it with a new T_TNODE.
1070 	 *
1071 	 * If we point to a T_LEAF, it might or might not have the same key
1072 	 * as we do. If it does, just change the value, update the T_LEAF's
1073 	 * value, and return it.
1074 	 * If it doesn't, we need to replace it with a T_TNODE.
1075 	 */
1076 
1077 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1078 		tn = (struct tnode *) n;
1079 
1080 		check_tnode(tn);
1081 
1082 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1083 			tp = tn;
1084 			pos = tn->pos + tn->bits;
1085 			n = tnode_get_child(tn,
1086 					    tkey_extract_bits(key,
1087 							      tn->pos,
1088 							      tn->bits));
1089 
1090 			BUG_ON(n && node_parent(n) != tn);
1091 		} else
1092 			break;
1093 	}
1094 
1095 	/*
1096 	 * n  ----> NULL, LEAF or TNODE
1097 	 *
1098 	 * tp is n's (parent) ----> NULL or TNODE
1099 	 */
1100 
1101 	BUG_ON(tp && IS_LEAF(tp));
1102 
1103 	/* Case 1: n is a leaf. Compare prefixes */
1104 
1105 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1106 		l = (struct leaf *) n;
1107 		li = leaf_info_new(plen);
1108 
1109 		if (!li)
1110 			return NULL;
1111 
1112 		fa_head = &li->falh;
1113 		insert_leaf_info(&l->list, li);
1114 		goto done;
1115 	}
1116 	l = leaf_new();
1117 
1118 	if (!l)
1119 		return NULL;
1120 
1121 	l->key = key;
1122 	li = leaf_info_new(plen);
1123 
1124 	if (!li) {
1125 		free_leaf(l);
1126 		return NULL;
1127 	}
1128 
1129 	fa_head = &li->falh;
1130 	insert_leaf_info(&l->list, li);
1131 
1132 	if (t->trie && n == NULL) {
1133 		/* Case 2: n is NULL, and will just insert a new leaf */
1134 
1135 		node_set_parent((struct node *)l, tp);
1136 
1137 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1138 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1139 	} else {
1140 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1141 		/*
1142 		 *  Add a new tnode here
1143 		 *  first tnode need some special handling
1144 		 */
1145 
1146 		if (tp)
1147 			pos = tp->pos+tp->bits;
1148 		else
1149 			pos = 0;
1150 
1151 		if (n) {
1152 			newpos = tkey_mismatch(key, pos, n->key);
1153 			tn = tnode_new(n->key, newpos, 1);
1154 		} else {
1155 			newpos = 0;
1156 			tn = tnode_new(key, newpos, 1); /* First tnode */
1157 		}
1158 
1159 		if (!tn) {
1160 			free_leaf_info(li);
1161 			free_leaf(l);
1162 			return NULL;
1163 		}
1164 
1165 		node_set_parent((struct node *)tn, tp);
1166 
1167 		missbit = tkey_extract_bits(key, newpos, 1);
1168 		put_child(t, tn, missbit, (struct node *)l);
1169 		put_child(t, tn, 1-missbit, n);
1170 
1171 		if (tp) {
1172 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1173 			put_child(t, (struct tnode *)tp, cindex,
1174 				  (struct node *)tn);
1175 		} else {
1176 			rcu_assign_pointer(t->trie, (struct node *)tn);
1177 			tp = tn;
1178 		}
1179 	}
1180 
1181 	if (tp && tp->pos + tp->bits > 32)
1182 		pr_warning("fib_trie"
1183 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1184 			   tp, tp->pos, tp->bits, key, plen);
1185 
1186 	/* Rebalance the trie */
1187 
1188 	trie_rebalance(t, tp);
1189 done:
1190 	return fa_head;
1191 }
1192 
1193 /*
1194  * Caller must hold RTNL.
1195  */
1196 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1197 {
1198 	struct trie *t = (struct trie *) tb->tb_data;
1199 	struct fib_alias *fa, *new_fa;
1200 	struct list_head *fa_head = NULL;
1201 	struct fib_info *fi;
1202 	int plen = cfg->fc_dst_len;
1203 	u8 tos = cfg->fc_tos;
1204 	u32 key, mask;
1205 	int err;
1206 	struct leaf *l;
1207 
1208 	if (plen > 32)
1209 		return -EINVAL;
1210 
1211 	key = ntohl(cfg->fc_dst);
1212 
1213 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1214 
1215 	mask = ntohl(inet_make_mask(plen));
1216 
1217 	if (key & ~mask)
1218 		return -EINVAL;
1219 
1220 	key = key & mask;
1221 
1222 	fi = fib_create_info(cfg);
1223 	if (IS_ERR(fi)) {
1224 		err = PTR_ERR(fi);
1225 		goto err;
1226 	}
1227 
1228 	l = fib_find_node(t, key);
1229 	fa = NULL;
1230 
1231 	if (l) {
1232 		fa_head = get_fa_head(l, plen);
1233 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1234 	}
1235 
1236 	/* Now fa, if non-NULL, points to the first fib alias
1237 	 * with the same keys [prefix,tos,priority], if such key already
1238 	 * exists or to the node before which we will insert new one.
1239 	 *
1240 	 * If fa is NULL, we will need to allocate a new one and
1241 	 * insert to the head of f.
1242 	 *
1243 	 * If f is NULL, no fib node matched the destination key
1244 	 * and we need to allocate a new one of those as well.
1245 	 */
1246 
1247 	if (fa && fa->fa_tos == tos &&
1248 	    fa->fa_info->fib_priority == fi->fib_priority) {
1249 		struct fib_alias *fa_first, *fa_match;
1250 
1251 		err = -EEXIST;
1252 		if (cfg->fc_nlflags & NLM_F_EXCL)
1253 			goto out;
1254 
1255 		/* We have 2 goals:
1256 		 * 1. Find exact match for type, scope, fib_info to avoid
1257 		 * duplicate routes
1258 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1259 		 */
1260 		fa_match = NULL;
1261 		fa_first = fa;
1262 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1263 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1264 			if (fa->fa_tos != tos)
1265 				break;
1266 			if (fa->fa_info->fib_priority != fi->fib_priority)
1267 				break;
1268 			if (fa->fa_type == cfg->fc_type &&
1269 			    fa->fa_scope == cfg->fc_scope &&
1270 			    fa->fa_info == fi) {
1271 				fa_match = fa;
1272 				break;
1273 			}
1274 		}
1275 
1276 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1277 			struct fib_info *fi_drop;
1278 			u8 state;
1279 
1280 			fa = fa_first;
1281 			if (fa_match) {
1282 				if (fa == fa_match)
1283 					err = 0;
1284 				goto out;
1285 			}
1286 			err = -ENOBUFS;
1287 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1288 			if (new_fa == NULL)
1289 				goto out;
1290 
1291 			fi_drop = fa->fa_info;
1292 			new_fa->fa_tos = fa->fa_tos;
1293 			new_fa->fa_info = fi;
1294 			new_fa->fa_type = cfg->fc_type;
1295 			new_fa->fa_scope = cfg->fc_scope;
1296 			state = fa->fa_state;
1297 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1298 
1299 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1300 			alias_free_mem_rcu(fa);
1301 
1302 			fib_release_info(fi_drop);
1303 			if (state & FA_S_ACCESSED)
1304 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1305 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1306 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1307 
1308 			goto succeeded;
1309 		}
1310 		/* Error if we find a perfect match which
1311 		 * uses the same scope, type, and nexthop
1312 		 * information.
1313 		 */
1314 		if (fa_match)
1315 			goto out;
1316 
1317 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1318 			fa = fa_first;
1319 	}
1320 	err = -ENOENT;
1321 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1322 		goto out;
1323 
1324 	err = -ENOBUFS;
1325 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1326 	if (new_fa == NULL)
1327 		goto out;
1328 
1329 	new_fa->fa_info = fi;
1330 	new_fa->fa_tos = tos;
1331 	new_fa->fa_type = cfg->fc_type;
1332 	new_fa->fa_scope = cfg->fc_scope;
1333 	new_fa->fa_state = 0;
1334 	/*
1335 	 * Insert new entry to the list.
1336 	 */
1337 
1338 	if (!fa_head) {
1339 		fa_head = fib_insert_node(t, key, plen);
1340 		if (unlikely(!fa_head)) {
1341 			err = -ENOMEM;
1342 			goto out_free_new_fa;
1343 		}
1344 	}
1345 
1346 	list_add_tail_rcu(&new_fa->fa_list,
1347 			  (fa ? &fa->fa_list : fa_head));
1348 
1349 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1350 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1351 		  &cfg->fc_nlinfo, 0);
1352 succeeded:
1353 	return 0;
1354 
1355 out_free_new_fa:
1356 	kmem_cache_free(fn_alias_kmem, new_fa);
1357 out:
1358 	fib_release_info(fi);
1359 err:
1360 	return err;
1361 }
1362 
1363 /* should be called with rcu_read_lock */
1364 static int check_leaf(struct trie *t, struct leaf *l,
1365 		      t_key key,  const struct flowi *flp,
1366 		      struct fib_result *res)
1367 {
1368 	struct leaf_info *li;
1369 	struct hlist_head *hhead = &l->list;
1370 	struct hlist_node *node;
1371 
1372 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1373 		int err;
1374 		int plen = li->plen;
1375 		__be32 mask = inet_make_mask(plen);
1376 
1377 		if (l->key != (key & ntohl(mask)))
1378 			continue;
1379 
1380 		err = fib_semantic_match(&li->falh, flp, res, plen);
1381 
1382 #ifdef CONFIG_IP_FIB_TRIE_STATS
1383 		if (err <= 0)
1384 			t->stats.semantic_match_passed++;
1385 		else
1386 			t->stats.semantic_match_miss++;
1387 #endif
1388 		if (err <= 0)
1389 			return err;
1390 	}
1391 
1392 	return 1;
1393 }
1394 
1395 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1396 			  struct fib_result *res)
1397 {
1398 	struct trie *t = (struct trie *) tb->tb_data;
1399 	int ret;
1400 	struct node *n;
1401 	struct tnode *pn;
1402 	int pos, bits;
1403 	t_key key = ntohl(flp->fl4_dst);
1404 	int chopped_off;
1405 	t_key cindex = 0;
1406 	int current_prefix_length = KEYLENGTH;
1407 	struct tnode *cn;
1408 	t_key node_prefix, key_prefix, pref_mismatch;
1409 	int mp;
1410 
1411 	rcu_read_lock();
1412 
1413 	n = rcu_dereference(t->trie);
1414 	if (!n)
1415 		goto failed;
1416 
1417 #ifdef CONFIG_IP_FIB_TRIE_STATS
1418 	t->stats.gets++;
1419 #endif
1420 
1421 	/* Just a leaf? */
1422 	if (IS_LEAF(n)) {
1423 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1424 		goto found;
1425 	}
1426 
1427 	pn = (struct tnode *) n;
1428 	chopped_off = 0;
1429 
1430 	while (pn) {
1431 		pos = pn->pos;
1432 		bits = pn->bits;
1433 
1434 		if (!chopped_off)
1435 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1436 						   pos, bits);
1437 
1438 		n = tnode_get_child(pn, cindex);
1439 
1440 		if (n == NULL) {
1441 #ifdef CONFIG_IP_FIB_TRIE_STATS
1442 			t->stats.null_node_hit++;
1443 #endif
1444 			goto backtrace;
1445 		}
1446 
1447 		if (IS_LEAF(n)) {
1448 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1449 			if (ret > 0)
1450 				goto backtrace;
1451 			goto found;
1452 		}
1453 
1454 		cn = (struct tnode *)n;
1455 
1456 		/*
1457 		 * It's a tnode, and we can do some extra checks here if we
1458 		 * like, to avoid descending into a dead-end branch.
1459 		 * This tnode is in the parent's child array at index
1460 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1461 		 * chopped off, so in reality the index may be just a
1462 		 * subprefix, padded with zero at the end.
1463 		 * We can also take a look at any skipped bits in this
1464 		 * tnode - everything up to p_pos is supposed to be ok,
1465 		 * and the non-chopped bits of the index (se previous
1466 		 * paragraph) are also guaranteed ok, but the rest is
1467 		 * considered unknown.
1468 		 *
1469 		 * The skipped bits are key[pos+bits..cn->pos].
1470 		 */
1471 
1472 		/* If current_prefix_length < pos+bits, we are already doing
1473 		 * actual prefix  matching, which means everything from
1474 		 * pos+(bits-chopped_off) onward must be zero along some
1475 		 * branch of this subtree - otherwise there is *no* valid
1476 		 * prefix present. Here we can only check the skipped
1477 		 * bits. Remember, since we have already indexed into the
1478 		 * parent's child array, we know that the bits we chopped of
1479 		 * *are* zero.
1480 		 */
1481 
1482 		/* NOTA BENE: Checking only skipped bits
1483 		   for the new node here */
1484 
1485 		if (current_prefix_length < pos+bits) {
1486 			if (tkey_extract_bits(cn->key, current_prefix_length,
1487 						cn->pos - current_prefix_length)
1488 			    || !(cn->child[0]))
1489 				goto backtrace;
1490 		}
1491 
1492 		/*
1493 		 * If chopped_off=0, the index is fully validated and we
1494 		 * only need to look at the skipped bits for this, the new,
1495 		 * tnode. What we actually want to do is to find out if
1496 		 * these skipped bits match our key perfectly, or if we will
1497 		 * have to count on finding a matching prefix further down,
1498 		 * because if we do, we would like to have some way of
1499 		 * verifying the existence of such a prefix at this point.
1500 		 */
1501 
1502 		/* The only thing we can do at this point is to verify that
1503 		 * any such matching prefix can indeed be a prefix to our
1504 		 * key, and if the bits in the node we are inspecting that
1505 		 * do not match our key are not ZERO, this cannot be true.
1506 		 * Thus, find out where there is a mismatch (before cn->pos)
1507 		 * and verify that all the mismatching bits are zero in the
1508 		 * new tnode's key.
1509 		 */
1510 
1511 		/*
1512 		 * Note: We aren't very concerned about the piece of
1513 		 * the key that precede pn->pos+pn->bits, since these
1514 		 * have already been checked. The bits after cn->pos
1515 		 * aren't checked since these are by definition
1516 		 * "unknown" at this point. Thus, what we want to see
1517 		 * is if we are about to enter the "prefix matching"
1518 		 * state, and in that case verify that the skipped
1519 		 * bits that will prevail throughout this subtree are
1520 		 * zero, as they have to be if we are to find a
1521 		 * matching prefix.
1522 		 */
1523 
1524 		node_prefix = mask_pfx(cn->key, cn->pos);
1525 		key_prefix = mask_pfx(key, cn->pos);
1526 		pref_mismatch = key_prefix^node_prefix;
1527 		mp = 0;
1528 
1529 		/*
1530 		 * In short: If skipped bits in this node do not match
1531 		 * the search key, enter the "prefix matching"
1532 		 * state.directly.
1533 		 */
1534 		if (pref_mismatch) {
1535 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1536 				mp++;
1537 				pref_mismatch = pref_mismatch << 1;
1538 			}
1539 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1540 
1541 			if (key_prefix != 0)
1542 				goto backtrace;
1543 
1544 			if (current_prefix_length >= cn->pos)
1545 				current_prefix_length = mp;
1546 		}
1547 
1548 		pn = (struct tnode *)n; /* Descend */
1549 		chopped_off = 0;
1550 		continue;
1551 
1552 backtrace:
1553 		chopped_off++;
1554 
1555 		/* As zero don't change the child key (cindex) */
1556 		while ((chopped_off <= pn->bits)
1557 		       && !(cindex & (1<<(chopped_off-1))))
1558 			chopped_off++;
1559 
1560 		/* Decrease current_... with bits chopped off */
1561 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1562 			current_prefix_length = pn->pos + pn->bits
1563 				- chopped_off;
1564 
1565 		/*
1566 		 * Either we do the actual chop off according or if we have
1567 		 * chopped off all bits in this tnode walk up to our parent.
1568 		 */
1569 
1570 		if (chopped_off <= pn->bits) {
1571 			cindex &= ~(1 << (chopped_off-1));
1572 		} else {
1573 			struct tnode *parent = node_parent((struct node *) pn);
1574 			if (!parent)
1575 				goto failed;
1576 
1577 			/* Get Child's index */
1578 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1579 			pn = parent;
1580 			chopped_off = 0;
1581 
1582 #ifdef CONFIG_IP_FIB_TRIE_STATS
1583 			t->stats.backtrack++;
1584 #endif
1585 			goto backtrace;
1586 		}
1587 	}
1588 failed:
1589 	ret = 1;
1590 found:
1591 	rcu_read_unlock();
1592 	return ret;
1593 }
1594 
1595 /*
1596  * Remove the leaf and return parent.
1597  */
1598 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1599 {
1600 	struct tnode *tp = node_parent((struct node *) l);
1601 
1602 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1603 
1604 	if (tp) {
1605 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1606 		put_child(t, (struct tnode *)tp, cindex, NULL);
1607 		trie_rebalance(t, tp);
1608 	} else
1609 		rcu_assign_pointer(t->trie, NULL);
1610 
1611 	free_leaf(l);
1612 }
1613 
1614 /*
1615  * Caller must hold RTNL.
1616  */
1617 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1618 {
1619 	struct trie *t = (struct trie *) tb->tb_data;
1620 	u32 key, mask;
1621 	int plen = cfg->fc_dst_len;
1622 	u8 tos = cfg->fc_tos;
1623 	struct fib_alias *fa, *fa_to_delete;
1624 	struct list_head *fa_head;
1625 	struct leaf *l;
1626 	struct leaf_info *li;
1627 
1628 	if (plen > 32)
1629 		return -EINVAL;
1630 
1631 	key = ntohl(cfg->fc_dst);
1632 	mask = ntohl(inet_make_mask(plen));
1633 
1634 	if (key & ~mask)
1635 		return -EINVAL;
1636 
1637 	key = key & mask;
1638 	l = fib_find_node(t, key);
1639 
1640 	if (!l)
1641 		return -ESRCH;
1642 
1643 	fa_head = get_fa_head(l, plen);
1644 	fa = fib_find_alias(fa_head, tos, 0);
1645 
1646 	if (!fa)
1647 		return -ESRCH;
1648 
1649 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1650 
1651 	fa_to_delete = NULL;
1652 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1653 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1654 		struct fib_info *fi = fa->fa_info;
1655 
1656 		if (fa->fa_tos != tos)
1657 			break;
1658 
1659 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1660 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1661 		     fa->fa_scope == cfg->fc_scope) &&
1662 		    (!cfg->fc_protocol ||
1663 		     fi->fib_protocol == cfg->fc_protocol) &&
1664 		    fib_nh_match(cfg, fi) == 0) {
1665 			fa_to_delete = fa;
1666 			break;
1667 		}
1668 	}
1669 
1670 	if (!fa_to_delete)
1671 		return -ESRCH;
1672 
1673 	fa = fa_to_delete;
1674 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1675 		  &cfg->fc_nlinfo, 0);
1676 
1677 	l = fib_find_node(t, key);
1678 	li = find_leaf_info(l, plen);
1679 
1680 	list_del_rcu(&fa->fa_list);
1681 
1682 	if (list_empty(fa_head)) {
1683 		hlist_del_rcu(&li->hlist);
1684 		free_leaf_info(li);
1685 	}
1686 
1687 	if (hlist_empty(&l->list))
1688 		trie_leaf_remove(t, l);
1689 
1690 	if (fa->fa_state & FA_S_ACCESSED)
1691 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1692 
1693 	fib_release_info(fa->fa_info);
1694 	alias_free_mem_rcu(fa);
1695 	return 0;
1696 }
1697 
1698 static int trie_flush_list(struct list_head *head)
1699 {
1700 	struct fib_alias *fa, *fa_node;
1701 	int found = 0;
1702 
1703 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1704 		struct fib_info *fi = fa->fa_info;
1705 
1706 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1707 			list_del_rcu(&fa->fa_list);
1708 			fib_release_info(fa->fa_info);
1709 			alias_free_mem_rcu(fa);
1710 			found++;
1711 		}
1712 	}
1713 	return found;
1714 }
1715 
1716 static int trie_flush_leaf(struct leaf *l)
1717 {
1718 	int found = 0;
1719 	struct hlist_head *lih = &l->list;
1720 	struct hlist_node *node, *tmp;
1721 	struct leaf_info *li = NULL;
1722 
1723 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1724 		found += trie_flush_list(&li->falh);
1725 
1726 		if (list_empty(&li->falh)) {
1727 			hlist_del_rcu(&li->hlist);
1728 			free_leaf_info(li);
1729 		}
1730 	}
1731 	return found;
1732 }
1733 
1734 /*
1735  * Scan for the next right leaf starting at node p->child[idx]
1736  * Since we have back pointer, no recursion necessary.
1737  */
1738 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1739 {
1740 	do {
1741 		t_key idx;
1742 
1743 		if (c)
1744 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1745 		else
1746 			idx = 0;
1747 
1748 		while (idx < 1u << p->bits) {
1749 			c = tnode_get_child_rcu(p, idx++);
1750 			if (!c)
1751 				continue;
1752 
1753 			if (IS_LEAF(c)) {
1754 				prefetch(p->child[idx]);
1755 				return (struct leaf *) c;
1756 			}
1757 
1758 			/* Rescan start scanning in new node */
1759 			p = (struct tnode *) c;
1760 			idx = 0;
1761 		}
1762 
1763 		/* Node empty, walk back up to parent */
1764 		c = (struct node *) p;
1765 	} while ( (p = node_parent_rcu(c)) != NULL);
1766 
1767 	return NULL; /* Root of trie */
1768 }
1769 
1770 static struct leaf *trie_firstleaf(struct trie *t)
1771 {
1772 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1773 
1774 	if (!n)
1775 		return NULL;
1776 
1777 	if (IS_LEAF(n))          /* trie is just a leaf */
1778 		return (struct leaf *) n;
1779 
1780 	return leaf_walk_rcu(n, NULL);
1781 }
1782 
1783 static struct leaf *trie_nextleaf(struct leaf *l)
1784 {
1785 	struct node *c = (struct node *) l;
1786 	struct tnode *p = node_parent(c);
1787 
1788 	if (!p)
1789 		return NULL;	/* trie with just one leaf */
1790 
1791 	return leaf_walk_rcu(p, c);
1792 }
1793 
1794 static struct leaf *trie_leafindex(struct trie *t, int index)
1795 {
1796 	struct leaf *l = trie_firstleaf(t);
1797 
1798 	while (l && index-- > 0)
1799 		l = trie_nextleaf(l);
1800 
1801 	return l;
1802 }
1803 
1804 
1805 /*
1806  * Caller must hold RTNL.
1807  */
1808 static int fn_trie_flush(struct fib_table *tb)
1809 {
1810 	struct trie *t = (struct trie *) tb->tb_data;
1811 	struct leaf *l, *ll = NULL;
1812 	int found = 0;
1813 
1814 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1815 		found += trie_flush_leaf(l);
1816 
1817 		if (ll && hlist_empty(&ll->list))
1818 			trie_leaf_remove(t, ll);
1819 		ll = l;
1820 	}
1821 
1822 	if (ll && hlist_empty(&ll->list))
1823 		trie_leaf_remove(t, ll);
1824 
1825 	pr_debug("trie_flush found=%d\n", found);
1826 	return found;
1827 }
1828 
1829 static void fn_trie_select_default(struct fib_table *tb,
1830 				   const struct flowi *flp,
1831 				   struct fib_result *res)
1832 {
1833 	struct trie *t = (struct trie *) tb->tb_data;
1834 	int order, last_idx;
1835 	struct fib_info *fi = NULL;
1836 	struct fib_info *last_resort;
1837 	struct fib_alias *fa = NULL;
1838 	struct list_head *fa_head;
1839 	struct leaf *l;
1840 
1841 	last_idx = -1;
1842 	last_resort = NULL;
1843 	order = -1;
1844 
1845 	rcu_read_lock();
1846 
1847 	l = fib_find_node(t, 0);
1848 	if (!l)
1849 		goto out;
1850 
1851 	fa_head = get_fa_head(l, 0);
1852 	if (!fa_head)
1853 		goto out;
1854 
1855 	if (list_empty(fa_head))
1856 		goto out;
1857 
1858 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1859 		struct fib_info *next_fi = fa->fa_info;
1860 
1861 		if (fa->fa_scope != res->scope ||
1862 		    fa->fa_type != RTN_UNICAST)
1863 			continue;
1864 
1865 		if (next_fi->fib_priority > res->fi->fib_priority)
1866 			break;
1867 		if (!next_fi->fib_nh[0].nh_gw ||
1868 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1869 			continue;
1870 		fa->fa_state |= FA_S_ACCESSED;
1871 
1872 		if (fi == NULL) {
1873 			if (next_fi != res->fi)
1874 				break;
1875 		} else if (!fib_detect_death(fi, order, &last_resort,
1876 					     &last_idx, tb->tb_default)) {
1877 			fib_result_assign(res, fi);
1878 			tb->tb_default = order;
1879 			goto out;
1880 		}
1881 		fi = next_fi;
1882 		order++;
1883 	}
1884 	if (order <= 0 || fi == NULL) {
1885 		tb->tb_default = -1;
1886 		goto out;
1887 	}
1888 
1889 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1890 				tb->tb_default)) {
1891 		fib_result_assign(res, fi);
1892 		tb->tb_default = order;
1893 		goto out;
1894 	}
1895 	if (last_idx >= 0)
1896 		fib_result_assign(res, last_resort);
1897 	tb->tb_default = last_idx;
1898 out:
1899 	rcu_read_unlock();
1900 }
1901 
1902 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1903 			   struct fib_table *tb,
1904 			   struct sk_buff *skb, struct netlink_callback *cb)
1905 {
1906 	int i, s_i;
1907 	struct fib_alias *fa;
1908 	__be32 xkey = htonl(key);
1909 
1910 	s_i = cb->args[5];
1911 	i = 0;
1912 
1913 	/* rcu_read_lock is hold by caller */
1914 
1915 	list_for_each_entry_rcu(fa, fah, fa_list) {
1916 		if (i < s_i) {
1917 			i++;
1918 			continue;
1919 		}
1920 
1921 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1922 				  cb->nlh->nlmsg_seq,
1923 				  RTM_NEWROUTE,
1924 				  tb->tb_id,
1925 				  fa->fa_type,
1926 				  fa->fa_scope,
1927 				  xkey,
1928 				  plen,
1929 				  fa->fa_tos,
1930 				  fa->fa_info, NLM_F_MULTI) < 0) {
1931 			cb->args[5] = i;
1932 			return -1;
1933 		}
1934 		i++;
1935 	}
1936 	cb->args[5] = i;
1937 	return skb->len;
1938 }
1939 
1940 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1941 			struct sk_buff *skb, struct netlink_callback *cb)
1942 {
1943 	struct leaf_info *li;
1944 	struct hlist_node *node;
1945 	int i, s_i;
1946 
1947 	s_i = cb->args[4];
1948 	i = 0;
1949 
1950 	/* rcu_read_lock is hold by caller */
1951 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1952 		if (i < s_i) {
1953 			i++;
1954 			continue;
1955 		}
1956 
1957 		if (i > s_i)
1958 			cb->args[5] = 0;
1959 
1960 		if (list_empty(&li->falh))
1961 			continue;
1962 
1963 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1964 			cb->args[4] = i;
1965 			return -1;
1966 		}
1967 		i++;
1968 	}
1969 
1970 	cb->args[4] = i;
1971 	return skb->len;
1972 }
1973 
1974 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1975 			struct netlink_callback *cb)
1976 {
1977 	struct leaf *l;
1978 	struct trie *t = (struct trie *) tb->tb_data;
1979 	t_key key = cb->args[2];
1980 	int count = cb->args[3];
1981 
1982 	rcu_read_lock();
1983 	/* Dump starting at last key.
1984 	 * Note: 0.0.0.0/0 (ie default) is first key.
1985 	 */
1986 	if (count == 0)
1987 		l = trie_firstleaf(t);
1988 	else {
1989 		/* Normally, continue from last key, but if that is missing
1990 		 * fallback to using slow rescan
1991 		 */
1992 		l = fib_find_node(t, key);
1993 		if (!l)
1994 			l = trie_leafindex(t, count);
1995 	}
1996 
1997 	while (l) {
1998 		cb->args[2] = l->key;
1999 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2000 			cb->args[3] = count;
2001 			rcu_read_unlock();
2002 			return -1;
2003 		}
2004 
2005 		++count;
2006 		l = trie_nextleaf(l);
2007 		memset(&cb->args[4], 0,
2008 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2009 	}
2010 	cb->args[3] = count;
2011 	rcu_read_unlock();
2012 
2013 	return skb->len;
2014 }
2015 
2016 void __init fib_hash_init(void)
2017 {
2018 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2019 					  sizeof(struct fib_alias),
2020 					  0, SLAB_PANIC, NULL);
2021 
2022 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2023 					   max(sizeof(struct leaf),
2024 					       sizeof(struct leaf_info)),
2025 					   0, SLAB_PANIC, NULL);
2026 }
2027 
2028 
2029 /* Fix more generic FIB names for init later */
2030 struct fib_table *fib_hash_table(u32 id)
2031 {
2032 	struct fib_table *tb;
2033 	struct trie *t;
2034 
2035 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2036 		     GFP_KERNEL);
2037 	if (tb == NULL)
2038 		return NULL;
2039 
2040 	tb->tb_id = id;
2041 	tb->tb_default = -1;
2042 	tb->tb_lookup = fn_trie_lookup;
2043 	tb->tb_insert = fn_trie_insert;
2044 	tb->tb_delete = fn_trie_delete;
2045 	tb->tb_flush = fn_trie_flush;
2046 	tb->tb_select_default = fn_trie_select_default;
2047 	tb->tb_dump = fn_trie_dump;
2048 
2049 	t = (struct trie *) tb->tb_data;
2050 	memset(t, 0, sizeof(*t));
2051 
2052 	if (id == RT_TABLE_LOCAL)
2053 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2054 
2055 	return tb;
2056 }
2057 
2058 #ifdef CONFIG_PROC_FS
2059 /* Depth first Trie walk iterator */
2060 struct fib_trie_iter {
2061 	struct seq_net_private p;
2062 	struct fib_table *tb;
2063 	struct tnode *tnode;
2064 	unsigned index;
2065 	unsigned depth;
2066 };
2067 
2068 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2069 {
2070 	struct tnode *tn = iter->tnode;
2071 	unsigned cindex = iter->index;
2072 	struct tnode *p;
2073 
2074 	/* A single entry routing table */
2075 	if (!tn)
2076 		return NULL;
2077 
2078 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2079 		 iter->tnode, iter->index, iter->depth);
2080 rescan:
2081 	while (cindex < (1<<tn->bits)) {
2082 		struct node *n = tnode_get_child_rcu(tn, cindex);
2083 
2084 		if (n) {
2085 			if (IS_LEAF(n)) {
2086 				iter->tnode = tn;
2087 				iter->index = cindex + 1;
2088 			} else {
2089 				/* push down one level */
2090 				iter->tnode = (struct tnode *) n;
2091 				iter->index = 0;
2092 				++iter->depth;
2093 			}
2094 			return n;
2095 		}
2096 
2097 		++cindex;
2098 	}
2099 
2100 	/* Current node exhausted, pop back up */
2101 	p = node_parent_rcu((struct node *)tn);
2102 	if (p) {
2103 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2104 		tn = p;
2105 		--iter->depth;
2106 		goto rescan;
2107 	}
2108 
2109 	/* got root? */
2110 	return NULL;
2111 }
2112 
2113 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2114 				       struct trie *t)
2115 {
2116 	struct node *n;
2117 
2118 	if (!t)
2119 		return NULL;
2120 
2121 	n = rcu_dereference(t->trie);
2122 	if (!n)
2123 		return NULL;
2124 
2125 	if (IS_TNODE(n)) {
2126 		iter->tnode = (struct tnode *) n;
2127 		iter->index = 0;
2128 		iter->depth = 1;
2129 	} else {
2130 		iter->tnode = NULL;
2131 		iter->index = 0;
2132 		iter->depth = 0;
2133 	}
2134 
2135 	return n;
2136 }
2137 
2138 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2139 {
2140 	struct node *n;
2141 	struct fib_trie_iter iter;
2142 
2143 	memset(s, 0, sizeof(*s));
2144 
2145 	rcu_read_lock();
2146 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2147 		if (IS_LEAF(n)) {
2148 			struct leaf *l = (struct leaf *)n;
2149 			struct leaf_info *li;
2150 			struct hlist_node *tmp;
2151 
2152 			s->leaves++;
2153 			s->totdepth += iter.depth;
2154 			if (iter.depth > s->maxdepth)
2155 				s->maxdepth = iter.depth;
2156 
2157 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2158 				++s->prefixes;
2159 		} else {
2160 			const struct tnode *tn = (const struct tnode *) n;
2161 			int i;
2162 
2163 			s->tnodes++;
2164 			if (tn->bits < MAX_STAT_DEPTH)
2165 				s->nodesizes[tn->bits]++;
2166 
2167 			for (i = 0; i < (1<<tn->bits); i++)
2168 				if (!tn->child[i])
2169 					s->nullpointers++;
2170 		}
2171 	}
2172 	rcu_read_unlock();
2173 }
2174 
2175 /*
2176  *	This outputs /proc/net/fib_triestats
2177  */
2178 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2179 {
2180 	unsigned i, max, pointers, bytes, avdepth;
2181 
2182 	if (stat->leaves)
2183 		avdepth = stat->totdepth*100 / stat->leaves;
2184 	else
2185 		avdepth = 0;
2186 
2187 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2188 		   avdepth / 100, avdepth % 100);
2189 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2190 
2191 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2192 	bytes = sizeof(struct leaf) * stat->leaves;
2193 
2194 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2195 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2196 
2197 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2198 	bytes += sizeof(struct tnode) * stat->tnodes;
2199 
2200 	max = MAX_STAT_DEPTH;
2201 	while (max > 0 && stat->nodesizes[max-1] == 0)
2202 		max--;
2203 
2204 	pointers = 0;
2205 	for (i = 1; i <= max; i++)
2206 		if (stat->nodesizes[i] != 0) {
2207 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2208 			pointers += (1<<i) * stat->nodesizes[i];
2209 		}
2210 	seq_putc(seq, '\n');
2211 	seq_printf(seq, "\tPointers: %u\n", pointers);
2212 
2213 	bytes += sizeof(struct node *) * pointers;
2214 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2215 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2216 }
2217 
2218 #ifdef CONFIG_IP_FIB_TRIE_STATS
2219 static void trie_show_usage(struct seq_file *seq,
2220 			    const struct trie_use_stats *stats)
2221 {
2222 	seq_printf(seq, "\nCounters:\n---------\n");
2223 	seq_printf(seq, "gets = %u\n", stats->gets);
2224 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2225 	seq_printf(seq, "semantic match passed = %u\n",
2226 		   stats->semantic_match_passed);
2227 	seq_printf(seq, "semantic match miss = %u\n",
2228 		   stats->semantic_match_miss);
2229 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2230 	seq_printf(seq, "skipped node resize = %u\n\n",
2231 		   stats->resize_node_skipped);
2232 }
2233 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2234 
2235 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2236 {
2237 	if (tb->tb_id == RT_TABLE_LOCAL)
2238 		seq_puts(seq, "Local:\n");
2239 	else if (tb->tb_id == RT_TABLE_MAIN)
2240 		seq_puts(seq, "Main:\n");
2241 	else
2242 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2243 }
2244 
2245 
2246 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2247 {
2248 	struct net *net = (struct net *)seq->private;
2249 	unsigned int h;
2250 
2251 	seq_printf(seq,
2252 		   "Basic info: size of leaf:"
2253 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2254 		   sizeof(struct leaf), sizeof(struct tnode));
2255 
2256 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2257 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2258 		struct hlist_node *node;
2259 		struct fib_table *tb;
2260 
2261 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2262 			struct trie *t = (struct trie *) tb->tb_data;
2263 			struct trie_stat stat;
2264 
2265 			if (!t)
2266 				continue;
2267 
2268 			fib_table_print(seq, tb);
2269 
2270 			trie_collect_stats(t, &stat);
2271 			trie_show_stats(seq, &stat);
2272 #ifdef CONFIG_IP_FIB_TRIE_STATS
2273 			trie_show_usage(seq, &t->stats);
2274 #endif
2275 		}
2276 	}
2277 
2278 	return 0;
2279 }
2280 
2281 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2282 {
2283 	return single_open_net(inode, file, fib_triestat_seq_show);
2284 }
2285 
2286 static const struct file_operations fib_triestat_fops = {
2287 	.owner	= THIS_MODULE,
2288 	.open	= fib_triestat_seq_open,
2289 	.read	= seq_read,
2290 	.llseek	= seq_lseek,
2291 	.release = single_release_net,
2292 };
2293 
2294 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2295 {
2296 	struct fib_trie_iter *iter = seq->private;
2297 	struct net *net = seq_file_net(seq);
2298 	loff_t idx = 0;
2299 	unsigned int h;
2300 
2301 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2302 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2303 		struct hlist_node *node;
2304 		struct fib_table *tb;
2305 
2306 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2307 			struct node *n;
2308 
2309 			for (n = fib_trie_get_first(iter,
2310 						    (struct trie *) tb->tb_data);
2311 			     n; n = fib_trie_get_next(iter))
2312 				if (pos == idx++) {
2313 					iter->tb = tb;
2314 					return n;
2315 				}
2316 		}
2317 	}
2318 
2319 	return NULL;
2320 }
2321 
2322 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2323 	__acquires(RCU)
2324 {
2325 	rcu_read_lock();
2326 	return fib_trie_get_idx(seq, *pos);
2327 }
2328 
2329 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2330 {
2331 	struct fib_trie_iter *iter = seq->private;
2332 	struct net *net = seq_file_net(seq);
2333 	struct fib_table *tb = iter->tb;
2334 	struct hlist_node *tb_node;
2335 	unsigned int h;
2336 	struct node *n;
2337 
2338 	++*pos;
2339 	/* next node in same table */
2340 	n = fib_trie_get_next(iter);
2341 	if (n)
2342 		return n;
2343 
2344 	/* walk rest of this hash chain */
2345 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2346 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2347 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2348 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2349 		if (n)
2350 			goto found;
2351 	}
2352 
2353 	/* new hash chain */
2354 	while (++h < FIB_TABLE_HASHSZ) {
2355 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2356 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2357 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2358 			if (n)
2359 				goto found;
2360 		}
2361 	}
2362 	return NULL;
2363 
2364 found:
2365 	iter->tb = tb;
2366 	return n;
2367 }
2368 
2369 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2370 	__releases(RCU)
2371 {
2372 	rcu_read_unlock();
2373 }
2374 
2375 static void seq_indent(struct seq_file *seq, int n)
2376 {
2377 	while (n-- > 0) seq_puts(seq, "   ");
2378 }
2379 
2380 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2381 {
2382 	switch (s) {
2383 	case RT_SCOPE_UNIVERSE: return "universe";
2384 	case RT_SCOPE_SITE:	return "site";
2385 	case RT_SCOPE_LINK:	return "link";
2386 	case RT_SCOPE_HOST:	return "host";
2387 	case RT_SCOPE_NOWHERE:	return "nowhere";
2388 	default:
2389 		snprintf(buf, len, "scope=%d", s);
2390 		return buf;
2391 	}
2392 }
2393 
2394 static const char *rtn_type_names[__RTN_MAX] = {
2395 	[RTN_UNSPEC] = "UNSPEC",
2396 	[RTN_UNICAST] = "UNICAST",
2397 	[RTN_LOCAL] = "LOCAL",
2398 	[RTN_BROADCAST] = "BROADCAST",
2399 	[RTN_ANYCAST] = "ANYCAST",
2400 	[RTN_MULTICAST] = "MULTICAST",
2401 	[RTN_BLACKHOLE] = "BLACKHOLE",
2402 	[RTN_UNREACHABLE] = "UNREACHABLE",
2403 	[RTN_PROHIBIT] = "PROHIBIT",
2404 	[RTN_THROW] = "THROW",
2405 	[RTN_NAT] = "NAT",
2406 	[RTN_XRESOLVE] = "XRESOLVE",
2407 };
2408 
2409 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2410 {
2411 	if (t < __RTN_MAX && rtn_type_names[t])
2412 		return rtn_type_names[t];
2413 	snprintf(buf, len, "type %u", t);
2414 	return buf;
2415 }
2416 
2417 /* Pretty print the trie */
2418 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2419 {
2420 	const struct fib_trie_iter *iter = seq->private;
2421 	struct node *n = v;
2422 
2423 	if (!node_parent_rcu(n))
2424 		fib_table_print(seq, iter->tb);
2425 
2426 	if (IS_TNODE(n)) {
2427 		struct tnode *tn = (struct tnode *) n;
2428 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2429 
2430 		seq_indent(seq, iter->depth-1);
2431 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2432 			   &prf, tn->pos, tn->bits, tn->full_children,
2433 			   tn->empty_children);
2434 
2435 	} else {
2436 		struct leaf *l = (struct leaf *) n;
2437 		struct leaf_info *li;
2438 		struct hlist_node *node;
2439 		__be32 val = htonl(l->key);
2440 
2441 		seq_indent(seq, iter->depth);
2442 		seq_printf(seq, "  |-- %pI4\n", &val);
2443 
2444 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2445 			struct fib_alias *fa;
2446 
2447 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2448 				char buf1[32], buf2[32];
2449 
2450 				seq_indent(seq, iter->depth+1);
2451 				seq_printf(seq, "  /%d %s %s", li->plen,
2452 					   rtn_scope(buf1, sizeof(buf1),
2453 						     fa->fa_scope),
2454 					   rtn_type(buf2, sizeof(buf2),
2455 						    fa->fa_type));
2456 				if (fa->fa_tos)
2457 					seq_printf(seq, " tos=%d", fa->fa_tos);
2458 				seq_putc(seq, '\n');
2459 			}
2460 		}
2461 	}
2462 
2463 	return 0;
2464 }
2465 
2466 static const struct seq_operations fib_trie_seq_ops = {
2467 	.start  = fib_trie_seq_start,
2468 	.next   = fib_trie_seq_next,
2469 	.stop   = fib_trie_seq_stop,
2470 	.show   = fib_trie_seq_show,
2471 };
2472 
2473 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2474 {
2475 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2476 			    sizeof(struct fib_trie_iter));
2477 }
2478 
2479 static const struct file_operations fib_trie_fops = {
2480 	.owner  = THIS_MODULE,
2481 	.open   = fib_trie_seq_open,
2482 	.read   = seq_read,
2483 	.llseek = seq_lseek,
2484 	.release = seq_release_net,
2485 };
2486 
2487 struct fib_route_iter {
2488 	struct seq_net_private p;
2489 	struct trie *main_trie;
2490 	loff_t	pos;
2491 	t_key	key;
2492 };
2493 
2494 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2495 {
2496 	struct leaf *l = NULL;
2497 	struct trie *t = iter->main_trie;
2498 
2499 	/* use cache location of last found key */
2500 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2501 		pos -= iter->pos;
2502 	else {
2503 		iter->pos = 0;
2504 		l = trie_firstleaf(t);
2505 	}
2506 
2507 	while (l && pos-- > 0) {
2508 		iter->pos++;
2509 		l = trie_nextleaf(l);
2510 	}
2511 
2512 	if (l)
2513 		iter->key = pos;	/* remember it */
2514 	else
2515 		iter->pos = 0;		/* forget it */
2516 
2517 	return l;
2518 }
2519 
2520 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2521 	__acquires(RCU)
2522 {
2523 	struct fib_route_iter *iter = seq->private;
2524 	struct fib_table *tb;
2525 
2526 	rcu_read_lock();
2527 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2528 	if (!tb)
2529 		return NULL;
2530 
2531 	iter->main_trie = (struct trie *) tb->tb_data;
2532 	if (*pos == 0)
2533 		return SEQ_START_TOKEN;
2534 	else
2535 		return fib_route_get_idx(iter, *pos - 1);
2536 }
2537 
2538 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2539 {
2540 	struct fib_route_iter *iter = seq->private;
2541 	struct leaf *l = v;
2542 
2543 	++*pos;
2544 	if (v == SEQ_START_TOKEN) {
2545 		iter->pos = 0;
2546 		l = trie_firstleaf(iter->main_trie);
2547 	} else {
2548 		iter->pos++;
2549 		l = trie_nextleaf(l);
2550 	}
2551 
2552 	if (l)
2553 		iter->key = l->key;
2554 	else
2555 		iter->pos = 0;
2556 	return l;
2557 }
2558 
2559 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2560 	__releases(RCU)
2561 {
2562 	rcu_read_unlock();
2563 }
2564 
2565 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2566 {
2567 	static unsigned type2flags[RTN_MAX + 1] = {
2568 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2569 	};
2570 	unsigned flags = type2flags[type];
2571 
2572 	if (fi && fi->fib_nh->nh_gw)
2573 		flags |= RTF_GATEWAY;
2574 	if (mask == htonl(0xFFFFFFFF))
2575 		flags |= RTF_HOST;
2576 	flags |= RTF_UP;
2577 	return flags;
2578 }
2579 
2580 /*
2581  *	This outputs /proc/net/route.
2582  *	The format of the file is not supposed to be changed
2583  * 	and needs to be same as fib_hash output to avoid breaking
2584  *	legacy utilities
2585  */
2586 static int fib_route_seq_show(struct seq_file *seq, void *v)
2587 {
2588 	struct leaf *l = v;
2589 	struct leaf_info *li;
2590 	struct hlist_node *node;
2591 
2592 	if (v == SEQ_START_TOKEN) {
2593 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2594 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2595 			   "\tWindow\tIRTT");
2596 		return 0;
2597 	}
2598 
2599 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2600 		struct fib_alias *fa;
2601 		__be32 mask, prefix;
2602 
2603 		mask = inet_make_mask(li->plen);
2604 		prefix = htonl(l->key);
2605 
2606 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2607 			const struct fib_info *fi = fa->fa_info;
2608 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2609 			int len;
2610 
2611 			if (fa->fa_type == RTN_BROADCAST
2612 			    || fa->fa_type == RTN_MULTICAST)
2613 				continue;
2614 
2615 			if (fi)
2616 				seq_printf(seq,
2617 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2618 					 "%d\t%08X\t%d\t%u\t%u%n",
2619 					 fi->fib_dev ? fi->fib_dev->name : "*",
2620 					 prefix,
2621 					 fi->fib_nh->nh_gw, flags, 0, 0,
2622 					 fi->fib_priority,
2623 					 mask,
2624 					 (fi->fib_advmss ?
2625 					  fi->fib_advmss + 40 : 0),
2626 					 fi->fib_window,
2627 					 fi->fib_rtt >> 3, &len);
2628 			else
2629 				seq_printf(seq,
2630 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2631 					 "%d\t%08X\t%d\t%u\t%u%n",
2632 					 prefix, 0, flags, 0, 0, 0,
2633 					 mask, 0, 0, 0, &len);
2634 
2635 			seq_printf(seq, "%*s\n", 127 - len, "");
2636 		}
2637 	}
2638 
2639 	return 0;
2640 }
2641 
2642 static const struct seq_operations fib_route_seq_ops = {
2643 	.start  = fib_route_seq_start,
2644 	.next   = fib_route_seq_next,
2645 	.stop   = fib_route_seq_stop,
2646 	.show   = fib_route_seq_show,
2647 };
2648 
2649 static int fib_route_seq_open(struct inode *inode, struct file *file)
2650 {
2651 	return seq_open_net(inode, file, &fib_route_seq_ops,
2652 			    sizeof(struct fib_route_iter));
2653 }
2654 
2655 static const struct file_operations fib_route_fops = {
2656 	.owner  = THIS_MODULE,
2657 	.open   = fib_route_seq_open,
2658 	.read   = seq_read,
2659 	.llseek = seq_lseek,
2660 	.release = seq_release_net,
2661 };
2662 
2663 int __net_init fib_proc_init(struct net *net)
2664 {
2665 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2666 		goto out1;
2667 
2668 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2669 				  &fib_triestat_fops))
2670 		goto out2;
2671 
2672 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2673 		goto out3;
2674 
2675 	return 0;
2676 
2677 out3:
2678 	proc_net_remove(net, "fib_triestat");
2679 out2:
2680 	proc_net_remove(net, "fib_trie");
2681 out1:
2682 	return -ENOMEM;
2683 }
2684 
2685 void __net_exit fib_proc_exit(struct net *net)
2686 {
2687 	proc_net_remove(net, "fib_trie");
2688 	proc_net_remove(net, "fib_triestat");
2689 	proc_net_remove(net, "route");
2690 }
2691 
2692 #endif /* CONFIG_PROC_FS */
2693