xref: /linux/net/ipv4/fib_trie.c (revision 2ba9268dd603d23e17643437b2246acb6844953b)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #define MAX_STAT_DEPTH 32
85 
86 #define KEYLENGTH	(8*sizeof(t_key))
87 #define KEY_MAX		((t_key)~0)
88 
89 typedef unsigned int t_key;
90 
91 #define IS_TNODE(n) ((n)->bits)
92 #define IS_LEAF(n) (!(n)->bits)
93 
94 #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
95 
96 struct tnode {
97 	t_key key;
98 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
99 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
100 	unsigned char slen;
101 	struct tnode __rcu *parent;
102 	struct rcu_head rcu;
103 	union {
104 		/* The fields in this struct are valid if bits > 0 (TNODE) */
105 		struct {
106 			t_key empty_children; /* KEYLENGTH bits needed */
107 			t_key full_children;  /* KEYLENGTH bits needed */
108 			struct tnode __rcu *child[0];
109 		};
110 		/* This list pointer if valid if bits == 0 (LEAF) */
111 		struct hlist_head list;
112 	};
113 };
114 
115 struct leaf_info {
116 	struct hlist_node hlist;
117 	int plen;
118 	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
119 	struct list_head falh;
120 	struct rcu_head rcu;
121 };
122 
123 #ifdef CONFIG_IP_FIB_TRIE_STATS
124 struct trie_use_stats {
125 	unsigned int gets;
126 	unsigned int backtrack;
127 	unsigned int semantic_match_passed;
128 	unsigned int semantic_match_miss;
129 	unsigned int null_node_hit;
130 	unsigned int resize_node_skipped;
131 };
132 #endif
133 
134 struct trie_stat {
135 	unsigned int totdepth;
136 	unsigned int maxdepth;
137 	unsigned int tnodes;
138 	unsigned int leaves;
139 	unsigned int nullpointers;
140 	unsigned int prefixes;
141 	unsigned int nodesizes[MAX_STAT_DEPTH];
142 };
143 
144 struct trie {
145 	struct tnode __rcu *trie;
146 #ifdef CONFIG_IP_FIB_TRIE_STATS
147 	struct trie_use_stats __percpu *stats;
148 #endif
149 };
150 
151 static void resize(struct trie *t, struct tnode *tn);
152 static size_t tnode_free_size;
153 
154 /*
155  * synchronize_rcu after call_rcu for that many pages; it should be especially
156  * useful before resizing the root node with PREEMPT_NONE configs; the value was
157  * obtained experimentally, aiming to avoid visible slowdown.
158  */
159 static const int sync_pages = 128;
160 
161 static struct kmem_cache *fn_alias_kmem __read_mostly;
162 static struct kmem_cache *trie_leaf_kmem __read_mostly;
163 
164 /* caller must hold RTNL */
165 #define node_parent(n) rtnl_dereference((n)->parent)
166 
167 /* caller must hold RCU read lock or RTNL */
168 #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
169 
170 /* wrapper for rcu_assign_pointer */
171 static inline void node_set_parent(struct tnode *n, struct tnode *tp)
172 {
173 	if (n)
174 		rcu_assign_pointer(n->parent, tp);
175 }
176 
177 #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
178 
179 /* This provides us with the number of children in this node, in the case of a
180  * leaf this will return 0 meaning none of the children are accessible.
181  */
182 static inline unsigned long tnode_child_length(const struct tnode *tn)
183 {
184 	return (1ul << tn->bits) & ~(1ul);
185 }
186 
187 /* caller must hold RTNL */
188 static inline struct tnode *tnode_get_child(const struct tnode *tn,
189 					    unsigned long i)
190 {
191 	return rtnl_dereference(tn->child[i]);
192 }
193 
194 /* caller must hold RCU read lock or RTNL */
195 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
196 						unsigned long i)
197 {
198 	return rcu_dereference_rtnl(tn->child[i]);
199 }
200 
201 /* To understand this stuff, an understanding of keys and all their bits is
202  * necessary. Every node in the trie has a key associated with it, but not
203  * all of the bits in that key are significant.
204  *
205  * Consider a node 'n' and its parent 'tp'.
206  *
207  * If n is a leaf, every bit in its key is significant. Its presence is
208  * necessitated by path compression, since during a tree traversal (when
209  * searching for a leaf - unless we are doing an insertion) we will completely
210  * ignore all skipped bits we encounter. Thus we need to verify, at the end of
211  * a potentially successful search, that we have indeed been walking the
212  * correct key path.
213  *
214  * Note that we can never "miss" the correct key in the tree if present by
215  * following the wrong path. Path compression ensures that segments of the key
216  * that are the same for all keys with a given prefix are skipped, but the
217  * skipped part *is* identical for each node in the subtrie below the skipped
218  * bit! trie_insert() in this implementation takes care of that.
219  *
220  * if n is an internal node - a 'tnode' here, the various parts of its key
221  * have many different meanings.
222  *
223  * Example:
224  * _________________________________________________________________
225  * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
226  * -----------------------------------------------------------------
227  *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
228  *
229  * _________________________________________________________________
230  * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
231  * -----------------------------------------------------------------
232  *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
233  *
234  * tp->pos = 22
235  * tp->bits = 3
236  * n->pos = 13
237  * n->bits = 4
238  *
239  * First, let's just ignore the bits that come before the parent tp, that is
240  * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
241  * point we do not use them for anything.
242  *
243  * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
244  * index into the parent's child array. That is, they will be used to find
245  * 'n' among tp's children.
246  *
247  * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
248  * for the node n.
249  *
250  * All the bits we have seen so far are significant to the node n. The rest
251  * of the bits are really not needed or indeed known in n->key.
252  *
253  * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
254  * n's child array, and will of course be different for each child.
255  *
256  * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
257  * at this point.
258  */
259 
260 static const int halve_threshold = 25;
261 static const int inflate_threshold = 50;
262 static const int halve_threshold_root = 15;
263 static const int inflate_threshold_root = 30;
264 
265 static void __alias_free_mem(struct rcu_head *head)
266 {
267 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
268 	kmem_cache_free(fn_alias_kmem, fa);
269 }
270 
271 static inline void alias_free_mem_rcu(struct fib_alias *fa)
272 {
273 	call_rcu(&fa->rcu, __alias_free_mem);
274 }
275 
276 #define TNODE_KMALLOC_MAX \
277 	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
278 
279 static void __node_free_rcu(struct rcu_head *head)
280 {
281 	struct tnode *n = container_of(head, struct tnode, rcu);
282 
283 	if (IS_LEAF(n))
284 		kmem_cache_free(trie_leaf_kmem, n);
285 	else if (n->bits <= TNODE_KMALLOC_MAX)
286 		kfree(n);
287 	else
288 		vfree(n);
289 }
290 
291 #define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
292 
293 static inline void free_leaf_info(struct leaf_info *leaf)
294 {
295 	kfree_rcu(leaf, rcu);
296 }
297 
298 static struct tnode *tnode_alloc(size_t size)
299 {
300 	if (size <= PAGE_SIZE)
301 		return kzalloc(size, GFP_KERNEL);
302 	else
303 		return vzalloc(size);
304 }
305 
306 static inline void empty_child_inc(struct tnode *n)
307 {
308 	++n->empty_children ? : ++n->full_children;
309 }
310 
311 static inline void empty_child_dec(struct tnode *n)
312 {
313 	n->empty_children-- ? : n->full_children--;
314 }
315 
316 static struct tnode *leaf_new(t_key key)
317 {
318 	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
319 	if (l) {
320 		l->parent = NULL;
321 		/* set key and pos to reflect full key value
322 		 * any trailing zeros in the key should be ignored
323 		 * as the nodes are searched
324 		 */
325 		l->key = key;
326 		l->slen = 0;
327 		l->pos = 0;
328 		/* set bits to 0 indicating we are not a tnode */
329 		l->bits = 0;
330 
331 		INIT_HLIST_HEAD(&l->list);
332 	}
333 	return l;
334 }
335 
336 static struct leaf_info *leaf_info_new(int plen)
337 {
338 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
339 	if (li) {
340 		li->plen = plen;
341 		li->mask_plen = ntohl(inet_make_mask(plen));
342 		INIT_LIST_HEAD(&li->falh);
343 	}
344 	return li;
345 }
346 
347 static struct tnode *tnode_new(t_key key, int pos, int bits)
348 {
349 	size_t sz = offsetof(struct tnode, child[1ul << bits]);
350 	struct tnode *tn = tnode_alloc(sz);
351 	unsigned int shift = pos + bits;
352 
353 	/* verify bits and pos their msb bits clear and values are valid */
354 	BUG_ON(!bits || (shift > KEYLENGTH));
355 
356 	if (tn) {
357 		tn->parent = NULL;
358 		tn->slen = pos;
359 		tn->pos = pos;
360 		tn->bits = bits;
361 		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
362 		if (bits == KEYLENGTH)
363 			tn->full_children = 1;
364 		else
365 			tn->empty_children = 1ul << bits;
366 	}
367 
368 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
369 		 sizeof(struct tnode *) << bits);
370 	return tn;
371 }
372 
373 /* Check whether a tnode 'n' is "full", i.e. it is an internal node
374  * and no bits are skipped. See discussion in dyntree paper p. 6
375  */
376 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
377 {
378 	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
379 }
380 
381 /* Add a child at position i overwriting the old value.
382  * Update the value of full_children and empty_children.
383  */
384 static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
385 {
386 	struct tnode *chi = tnode_get_child(tn, i);
387 	int isfull, wasfull;
388 
389 	BUG_ON(i >= tnode_child_length(tn));
390 
391 	/* update emptyChildren, overflow into fullChildren */
392 	if (n == NULL && chi != NULL)
393 		empty_child_inc(tn);
394 	if (n != NULL && chi == NULL)
395 		empty_child_dec(tn);
396 
397 	/* update fullChildren */
398 	wasfull = tnode_full(tn, chi);
399 	isfull = tnode_full(tn, n);
400 
401 	if (wasfull && !isfull)
402 		tn->full_children--;
403 	else if (!wasfull && isfull)
404 		tn->full_children++;
405 
406 	if (n && (tn->slen < n->slen))
407 		tn->slen = n->slen;
408 
409 	rcu_assign_pointer(tn->child[i], n);
410 }
411 
412 static void update_children(struct tnode *tn)
413 {
414 	unsigned long i;
415 
416 	/* update all of the child parent pointers */
417 	for (i = tnode_child_length(tn); i;) {
418 		struct tnode *inode = tnode_get_child(tn, --i);
419 
420 		if (!inode)
421 			continue;
422 
423 		/* Either update the children of a tnode that
424 		 * already belongs to us or update the child
425 		 * to point to ourselves.
426 		 */
427 		if (node_parent(inode) == tn)
428 			update_children(inode);
429 		else
430 			node_set_parent(inode, tn);
431 	}
432 }
433 
434 static inline void put_child_root(struct tnode *tp, struct trie *t,
435 				  t_key key, struct tnode *n)
436 {
437 	if (tp)
438 		put_child(tp, get_index(key, tp), n);
439 	else
440 		rcu_assign_pointer(t->trie, n);
441 }
442 
443 static inline void tnode_free_init(struct tnode *tn)
444 {
445 	tn->rcu.next = NULL;
446 }
447 
448 static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
449 {
450 	n->rcu.next = tn->rcu.next;
451 	tn->rcu.next = &n->rcu;
452 }
453 
454 static void tnode_free(struct tnode *tn)
455 {
456 	struct callback_head *head = &tn->rcu;
457 
458 	while (head) {
459 		head = head->next;
460 		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
461 		node_free(tn);
462 
463 		tn = container_of(head, struct tnode, rcu);
464 	}
465 
466 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
467 		tnode_free_size = 0;
468 		synchronize_rcu();
469 	}
470 }
471 
472 static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
473 {
474 	struct tnode *tp = node_parent(oldtnode);
475 	unsigned long i;
476 
477 	/* setup the parent pointer out of and back into this node */
478 	NODE_INIT_PARENT(tn, tp);
479 	put_child_root(tp, t, tn->key, tn);
480 
481 	/* update all of the child parent pointers */
482 	update_children(tn);
483 
484 	/* all pointers should be clean so we are done */
485 	tnode_free(oldtnode);
486 
487 	/* resize children now that oldtnode is freed */
488 	for (i = tnode_child_length(tn); i;) {
489 		struct tnode *inode = tnode_get_child(tn, --i);
490 
491 		/* resize child node */
492 		if (tnode_full(tn, inode))
493 			resize(t, inode);
494 	}
495 }
496 
497 static int inflate(struct trie *t, struct tnode *oldtnode)
498 {
499 	struct tnode *tn;
500 	unsigned long i;
501 	t_key m;
502 
503 	pr_debug("In inflate\n");
504 
505 	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
506 	if (!tn)
507 		return -ENOMEM;
508 
509 	/* prepare oldtnode to be freed */
510 	tnode_free_init(oldtnode);
511 
512 	/* Assemble all of the pointers in our cluster, in this case that
513 	 * represents all of the pointers out of our allocated nodes that
514 	 * point to existing tnodes and the links between our allocated
515 	 * nodes.
516 	 */
517 	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
518 		struct tnode *inode = tnode_get_child(oldtnode, --i);
519 		struct tnode *node0, *node1;
520 		unsigned long j, k;
521 
522 		/* An empty child */
523 		if (inode == NULL)
524 			continue;
525 
526 		/* A leaf or an internal node with skipped bits */
527 		if (!tnode_full(oldtnode, inode)) {
528 			put_child(tn, get_index(inode->key, tn), inode);
529 			continue;
530 		}
531 
532 		/* drop the node in the old tnode free list */
533 		tnode_free_append(oldtnode, inode);
534 
535 		/* An internal node with two children */
536 		if (inode->bits == 1) {
537 			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
538 			put_child(tn, 2 * i, tnode_get_child(inode, 0));
539 			continue;
540 		}
541 
542 		/* We will replace this node 'inode' with two new
543 		 * ones, 'node0' and 'node1', each with half of the
544 		 * original children. The two new nodes will have
545 		 * a position one bit further down the key and this
546 		 * means that the "significant" part of their keys
547 		 * (see the discussion near the top of this file)
548 		 * will differ by one bit, which will be "0" in
549 		 * node0's key and "1" in node1's key. Since we are
550 		 * moving the key position by one step, the bit that
551 		 * we are moving away from - the bit at position
552 		 * (tn->pos) - is the one that will differ between
553 		 * node0 and node1. So... we synthesize that bit in the
554 		 * two new keys.
555 		 */
556 		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
557 		if (!node1)
558 			goto nomem;
559 		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
560 
561 		tnode_free_append(tn, node1);
562 		if (!node0)
563 			goto nomem;
564 		tnode_free_append(tn, node0);
565 
566 		/* populate child pointers in new nodes */
567 		for (k = tnode_child_length(inode), j = k / 2; j;) {
568 			put_child(node1, --j, tnode_get_child(inode, --k));
569 			put_child(node0, j, tnode_get_child(inode, j));
570 			put_child(node1, --j, tnode_get_child(inode, --k));
571 			put_child(node0, j, tnode_get_child(inode, j));
572 		}
573 
574 		/* link new nodes to parent */
575 		NODE_INIT_PARENT(node1, tn);
576 		NODE_INIT_PARENT(node0, tn);
577 
578 		/* link parent to nodes */
579 		put_child(tn, 2 * i + 1, node1);
580 		put_child(tn, 2 * i, node0);
581 	}
582 
583 	/* setup the parent pointers into and out of this node */
584 	replace(t, oldtnode, tn);
585 
586 	return 0;
587 nomem:
588 	/* all pointers should be clean so we are done */
589 	tnode_free(tn);
590 	return -ENOMEM;
591 }
592 
593 static int halve(struct trie *t, struct tnode *oldtnode)
594 {
595 	struct tnode *tn;
596 	unsigned long i;
597 
598 	pr_debug("In halve\n");
599 
600 	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
601 	if (!tn)
602 		return -ENOMEM;
603 
604 	/* prepare oldtnode to be freed */
605 	tnode_free_init(oldtnode);
606 
607 	/* Assemble all of the pointers in our cluster, in this case that
608 	 * represents all of the pointers out of our allocated nodes that
609 	 * point to existing tnodes and the links between our allocated
610 	 * nodes.
611 	 */
612 	for (i = tnode_child_length(oldtnode); i;) {
613 		struct tnode *node1 = tnode_get_child(oldtnode, --i);
614 		struct tnode *node0 = tnode_get_child(oldtnode, --i);
615 		struct tnode *inode;
616 
617 		/* At least one of the children is empty */
618 		if (!node1 || !node0) {
619 			put_child(tn, i / 2, node1 ? : node0);
620 			continue;
621 		}
622 
623 		/* Two nonempty children */
624 		inode = tnode_new(node0->key, oldtnode->pos, 1);
625 		if (!inode) {
626 			tnode_free(tn);
627 			return -ENOMEM;
628 		}
629 		tnode_free_append(tn, inode);
630 
631 		/* initialize pointers out of node */
632 		put_child(inode, 1, node1);
633 		put_child(inode, 0, node0);
634 		NODE_INIT_PARENT(inode, tn);
635 
636 		/* link parent to node */
637 		put_child(tn, i / 2, inode);
638 	}
639 
640 	/* setup the parent pointers into and out of this node */
641 	replace(t, oldtnode, tn);
642 
643 	return 0;
644 }
645 
646 static void collapse(struct trie *t, struct tnode *oldtnode)
647 {
648 	struct tnode *n, *tp;
649 	unsigned long i;
650 
651 	/* scan the tnode looking for that one child that might still exist */
652 	for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
653 		n = tnode_get_child(oldtnode, --i);
654 
655 	/* compress one level */
656 	tp = node_parent(oldtnode);
657 	put_child_root(tp, t, oldtnode->key, n);
658 	node_set_parent(n, tp);
659 
660 	/* drop dead node */
661 	node_free(oldtnode);
662 }
663 
664 static unsigned char update_suffix(struct tnode *tn)
665 {
666 	unsigned char slen = tn->pos;
667 	unsigned long stride, i;
668 
669 	/* search though the list of children looking for nodes that might
670 	 * have a suffix greater than the one we currently have.  This is
671 	 * why we start with a stride of 2 since a stride of 1 would
672 	 * represent the nodes with suffix length equal to tn->pos
673 	 */
674 	for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
675 		struct tnode *n = tnode_get_child(tn, i);
676 
677 		if (!n || (n->slen <= slen))
678 			continue;
679 
680 		/* update stride and slen based on new value */
681 		stride <<= (n->slen - slen);
682 		slen = n->slen;
683 		i &= ~(stride - 1);
684 
685 		/* if slen covers all but the last bit we can stop here
686 		 * there will be nothing longer than that since only node
687 		 * 0 and 1 << (bits - 1) could have that as their suffix
688 		 * length.
689 		 */
690 		if ((slen + 1) >= (tn->pos + tn->bits))
691 			break;
692 	}
693 
694 	tn->slen = slen;
695 
696 	return slen;
697 }
698 
699 /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
700  * the Helsinki University of Technology and Matti Tikkanen of Nokia
701  * Telecommunications, page 6:
702  * "A node is doubled if the ratio of non-empty children to all
703  * children in the *doubled* node is at least 'high'."
704  *
705  * 'high' in this instance is the variable 'inflate_threshold'. It
706  * is expressed as a percentage, so we multiply it with
707  * tnode_child_length() and instead of multiplying by 2 (since the
708  * child array will be doubled by inflate()) and multiplying
709  * the left-hand side by 100 (to handle the percentage thing) we
710  * multiply the left-hand side by 50.
711  *
712  * The left-hand side may look a bit weird: tnode_child_length(tn)
713  * - tn->empty_children is of course the number of non-null children
714  * in the current node. tn->full_children is the number of "full"
715  * children, that is non-null tnodes with a skip value of 0.
716  * All of those will be doubled in the resulting inflated tnode, so
717  * we just count them one extra time here.
718  *
719  * A clearer way to write this would be:
720  *
721  * to_be_doubled = tn->full_children;
722  * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
723  *     tn->full_children;
724  *
725  * new_child_length = tnode_child_length(tn) * 2;
726  *
727  * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
728  *      new_child_length;
729  * if (new_fill_factor >= inflate_threshold)
730  *
731  * ...and so on, tho it would mess up the while () loop.
732  *
733  * anyway,
734  * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
735  *      inflate_threshold
736  *
737  * avoid a division:
738  * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
739  *      inflate_threshold * new_child_length
740  *
741  * expand not_to_be_doubled and to_be_doubled, and shorten:
742  * 100 * (tnode_child_length(tn) - tn->empty_children +
743  *    tn->full_children) >= inflate_threshold * new_child_length
744  *
745  * expand new_child_length:
746  * 100 * (tnode_child_length(tn) - tn->empty_children +
747  *    tn->full_children) >=
748  *      inflate_threshold * tnode_child_length(tn) * 2
749  *
750  * shorten again:
751  * 50 * (tn->full_children + tnode_child_length(tn) -
752  *    tn->empty_children) >= inflate_threshold *
753  *    tnode_child_length(tn)
754  *
755  */
756 static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
757 {
758 	unsigned long used = tnode_child_length(tn);
759 	unsigned long threshold = used;
760 
761 	/* Keep root node larger */
762 	threshold *= tp ? inflate_threshold : inflate_threshold_root;
763 	used -= tn->empty_children;
764 	used += tn->full_children;
765 
766 	/* if bits == KEYLENGTH then pos = 0, and will fail below */
767 
768 	return (used > 1) && tn->pos && ((50 * used) >= threshold);
769 }
770 
771 static bool should_halve(const struct tnode *tp, const struct tnode *tn)
772 {
773 	unsigned long used = tnode_child_length(tn);
774 	unsigned long threshold = used;
775 
776 	/* Keep root node larger */
777 	threshold *= tp ? halve_threshold : halve_threshold_root;
778 	used -= tn->empty_children;
779 
780 	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
781 
782 	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
783 }
784 
785 static bool should_collapse(const struct tnode *tn)
786 {
787 	unsigned long used = tnode_child_length(tn);
788 
789 	used -= tn->empty_children;
790 
791 	/* account for bits == KEYLENGTH case */
792 	if ((tn->bits == KEYLENGTH) && tn->full_children)
793 		used -= KEY_MAX;
794 
795 	/* One child or none, time to drop us from the trie */
796 	return used < 2;
797 }
798 
799 #define MAX_WORK 10
800 static void resize(struct trie *t, struct tnode *tn)
801 {
802 	struct tnode *tp = node_parent(tn);
803 	struct tnode __rcu **cptr;
804 	int max_work = MAX_WORK;
805 
806 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
807 		 tn, inflate_threshold, halve_threshold);
808 
809 	/* track the tnode via the pointer from the parent instead of
810 	 * doing it ourselves.  This way we can let RCU fully do its
811 	 * thing without us interfering
812 	 */
813 	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
814 	BUG_ON(tn != rtnl_dereference(*cptr));
815 
816 	/* Double as long as the resulting node has a number of
817 	 * nonempty nodes that are above the threshold.
818 	 */
819 	while (should_inflate(tp, tn) && max_work) {
820 		if (inflate(t, tn)) {
821 #ifdef CONFIG_IP_FIB_TRIE_STATS
822 			this_cpu_inc(t->stats->resize_node_skipped);
823 #endif
824 			break;
825 		}
826 
827 		max_work--;
828 		tn = rtnl_dereference(*cptr);
829 	}
830 
831 	/* Return if at least one inflate is run */
832 	if (max_work != MAX_WORK)
833 		return;
834 
835 	/* Halve as long as the number of empty children in this
836 	 * node is above threshold.
837 	 */
838 	while (should_halve(tp, tn) && max_work) {
839 		if (halve(t, tn)) {
840 #ifdef CONFIG_IP_FIB_TRIE_STATS
841 			this_cpu_inc(t->stats->resize_node_skipped);
842 #endif
843 			break;
844 		}
845 
846 		max_work--;
847 		tn = rtnl_dereference(*cptr);
848 	}
849 
850 	/* Only one child remains */
851 	if (should_collapse(tn)) {
852 		collapse(t, tn);
853 		return;
854 	}
855 
856 	/* Return if at least one deflate was run */
857 	if (max_work != MAX_WORK)
858 		return;
859 
860 	/* push the suffix length to the parent node */
861 	if (tn->slen > tn->pos) {
862 		unsigned char slen = update_suffix(tn);
863 
864 		if (tp && (slen > tp->slen))
865 			tp->slen = slen;
866 	}
867 }
868 
869 /* readside must use rcu_read_lock currently dump routines
870  via get_fa_head and dump */
871 
872 static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
873 {
874 	struct hlist_head *head = &l->list;
875 	struct leaf_info *li;
876 
877 	hlist_for_each_entry_rcu(li, head, hlist)
878 		if (li->plen == plen)
879 			return li;
880 
881 	return NULL;
882 }
883 
884 static inline struct list_head *get_fa_head(struct tnode *l, int plen)
885 {
886 	struct leaf_info *li = find_leaf_info(l, plen);
887 
888 	if (!li)
889 		return NULL;
890 
891 	return &li->falh;
892 }
893 
894 static void leaf_pull_suffix(struct tnode *l)
895 {
896 	struct tnode *tp = node_parent(l);
897 
898 	while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
899 		if (update_suffix(tp) > l->slen)
900 			break;
901 		tp = node_parent(tp);
902 	}
903 }
904 
905 static void leaf_push_suffix(struct tnode *l)
906 {
907 	struct tnode *tn = node_parent(l);
908 
909 	/* if this is a new leaf then tn will be NULL and we can sort
910 	 * out parent suffix lengths as a part of trie_rebalance
911 	 */
912 	while (tn && (tn->slen < l->slen)) {
913 		tn->slen = l->slen;
914 		tn = node_parent(tn);
915 	}
916 }
917 
918 static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
919 {
920 	/* record the location of the previous list_info entry */
921 	struct hlist_node **pprev = old->hlist.pprev;
922 	struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
923 
924 	/* remove the leaf info from the list */
925 	hlist_del_rcu(&old->hlist);
926 
927 	/* only access li if it is pointing at the last valid hlist_node */
928 	if (hlist_empty(&l->list) || (*pprev))
929 		return;
930 
931 	/* update the trie with the latest suffix length */
932 	l->slen = KEYLENGTH - li->plen;
933 	leaf_pull_suffix(l);
934 }
935 
936 static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
937 {
938 	struct hlist_head *head = &l->list;
939 	struct leaf_info *li = NULL, *last = NULL;
940 
941 	if (hlist_empty(head)) {
942 		hlist_add_head_rcu(&new->hlist, head);
943 	} else {
944 		hlist_for_each_entry(li, head, hlist) {
945 			if (new->plen > li->plen)
946 				break;
947 
948 			last = li;
949 		}
950 		if (last)
951 			hlist_add_behind_rcu(&new->hlist, &last->hlist);
952 		else
953 			hlist_add_before_rcu(&new->hlist, &li->hlist);
954 	}
955 
956 	/* if we added to the tail node then we need to update slen */
957 	if (l->slen < (KEYLENGTH - new->plen)) {
958 		l->slen = KEYLENGTH - new->plen;
959 		leaf_push_suffix(l);
960 	}
961 }
962 
963 /* rcu_read_lock needs to be hold by caller from readside */
964 static struct tnode *fib_find_node(struct trie *t, u32 key)
965 {
966 	struct tnode *n = rcu_dereference_rtnl(t->trie);
967 
968 	while (n) {
969 		unsigned long index = get_index(key, n);
970 
971 		/* This bit of code is a bit tricky but it combines multiple
972 		 * checks into a single check.  The prefix consists of the
973 		 * prefix plus zeros for the bits in the cindex. The index
974 		 * is the difference between the key and this value.  From
975 		 * this we can actually derive several pieces of data.
976 		 *   if (index & (~0ul << bits))
977 		 *     we have a mismatch in skip bits and failed
978 		 *   else
979 		 *     we know the value is cindex
980 		 */
981 		if (index & (~0ul << n->bits))
982 			return NULL;
983 
984 		/* we have found a leaf. Prefixes have already been compared */
985 		if (IS_LEAF(n))
986 			break;
987 
988 		n = tnode_get_child_rcu(n, index);
989 	}
990 
991 	return n;
992 }
993 
994 /* Return the first fib alias matching TOS with
995  * priority less than or equal to PRIO.
996  */
997 static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
998 {
999 	struct fib_alias *fa;
1000 
1001 	if (!fah)
1002 		return NULL;
1003 
1004 	list_for_each_entry(fa, fah, fa_list) {
1005 		if (fa->fa_tos > tos)
1006 			continue;
1007 		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
1008 			return fa;
1009 	}
1010 
1011 	return NULL;
1012 }
1013 
1014 static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 {
1016 	struct tnode *tp;
1017 
1018 	while ((tp = node_parent(tn)) != NULL) {
1019 		resize(t, tn);
1020 		tn = tp;
1021 	}
1022 
1023 	/* Handle last (top) tnode */
1024 	if (IS_TNODE(tn))
1025 		resize(t, tn);
1026 }
1027 
1028 /* only used from updater-side */
1029 
1030 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1031 {
1032 	struct list_head *fa_head = NULL;
1033 	struct tnode *l, *n, *tp = NULL;
1034 	struct leaf_info *li;
1035 
1036 	li = leaf_info_new(plen);
1037 	if (!li)
1038 		return NULL;
1039 	fa_head = &li->falh;
1040 
1041 	n = rtnl_dereference(t->trie);
1042 
1043 	/* If we point to NULL, stop. Either the tree is empty and we should
1044 	 * just put a new leaf in if, or we have reached an empty child slot,
1045 	 * and we should just put our new leaf in that.
1046 	 *
1047 	 * If we hit a node with a key that does't match then we should stop
1048 	 * and create a new tnode to replace that node and insert ourselves
1049 	 * and the other node into the new tnode.
1050 	 */
1051 	while (n) {
1052 		unsigned long index = get_index(key, n);
1053 
1054 		/* This bit of code is a bit tricky but it combines multiple
1055 		 * checks into a single check.  The prefix consists of the
1056 		 * prefix plus zeros for the "bits" in the prefix. The index
1057 		 * is the difference between the key and this value.  From
1058 		 * this we can actually derive several pieces of data.
1059 		 *   if !(index >> bits)
1060 		 *     we know the value is child index
1061 		 *   else
1062 		 *     we have a mismatch in skip bits and failed
1063 		 */
1064 		if (index >> n->bits)
1065 			break;
1066 
1067 		/* we have found a leaf. Prefixes have already been compared */
1068 		if (IS_LEAF(n)) {
1069 			/* Case 1: n is a leaf, and prefixes match*/
1070 			insert_leaf_info(n, li);
1071 			return fa_head;
1072 		}
1073 
1074 		tp = n;
1075 		n = tnode_get_child_rcu(n, index);
1076 	}
1077 
1078 	l = leaf_new(key);
1079 	if (!l) {
1080 		free_leaf_info(li);
1081 		return NULL;
1082 	}
1083 
1084 	insert_leaf_info(l, li);
1085 
1086 	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1087 	 *
1088 	 *  Add a new tnode here
1089 	 *  first tnode need some special handling
1090 	 *  leaves us in position for handling as case 3
1091 	 */
1092 	if (n) {
1093 		struct tnode *tn;
1094 
1095 		tn = tnode_new(key, __fls(key ^ n->key), 1);
1096 		if (!tn) {
1097 			free_leaf_info(li);
1098 			node_free(l);
1099 			return NULL;
1100 		}
1101 
1102 		/* initialize routes out of node */
1103 		NODE_INIT_PARENT(tn, tp);
1104 		put_child(tn, get_index(key, tn) ^ 1, n);
1105 
1106 		/* start adding routes into the node */
1107 		put_child_root(tp, t, key, tn);
1108 		node_set_parent(n, tn);
1109 
1110 		/* parent now has a NULL spot where the leaf can go */
1111 		tp = tn;
1112 	}
1113 
1114 	/* Case 3: n is NULL, and will just insert a new leaf */
1115 	if (tp) {
1116 		NODE_INIT_PARENT(l, tp);
1117 		put_child(tp, get_index(key, tp), l);
1118 		trie_rebalance(t, tp);
1119 	} else {
1120 		rcu_assign_pointer(t->trie, l);
1121 	}
1122 
1123 	return fa_head;
1124 }
1125 
1126 /*
1127  * Caller must hold RTNL.
1128  */
1129 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1130 {
1131 	struct trie *t = (struct trie *) tb->tb_data;
1132 	struct fib_alias *fa, *new_fa;
1133 	struct list_head *fa_head = NULL;
1134 	struct fib_info *fi;
1135 	int plen = cfg->fc_dst_len;
1136 	u8 tos = cfg->fc_tos;
1137 	u32 key, mask;
1138 	int err;
1139 	struct tnode *l;
1140 
1141 	if (plen > 32)
1142 		return -EINVAL;
1143 
1144 	key = ntohl(cfg->fc_dst);
1145 
1146 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1147 
1148 	mask = ntohl(inet_make_mask(plen));
1149 
1150 	if (key & ~mask)
1151 		return -EINVAL;
1152 
1153 	key = key & mask;
1154 
1155 	fi = fib_create_info(cfg);
1156 	if (IS_ERR(fi)) {
1157 		err = PTR_ERR(fi);
1158 		goto err;
1159 	}
1160 
1161 	l = fib_find_node(t, key);
1162 	fa = NULL;
1163 
1164 	if (l) {
1165 		fa_head = get_fa_head(l, plen);
1166 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1167 	}
1168 
1169 	/* Now fa, if non-NULL, points to the first fib alias
1170 	 * with the same keys [prefix,tos,priority], if such key already
1171 	 * exists or to the node before which we will insert new one.
1172 	 *
1173 	 * If fa is NULL, we will need to allocate a new one and
1174 	 * insert to the head of f.
1175 	 *
1176 	 * If f is NULL, no fib node matched the destination key
1177 	 * and we need to allocate a new one of those as well.
1178 	 */
1179 
1180 	if (fa && fa->fa_tos == tos &&
1181 	    fa->fa_info->fib_priority == fi->fib_priority) {
1182 		struct fib_alias *fa_first, *fa_match;
1183 
1184 		err = -EEXIST;
1185 		if (cfg->fc_nlflags & NLM_F_EXCL)
1186 			goto out;
1187 
1188 		/* We have 2 goals:
1189 		 * 1. Find exact match for type, scope, fib_info to avoid
1190 		 * duplicate routes
1191 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1192 		 */
1193 		fa_match = NULL;
1194 		fa_first = fa;
1195 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1196 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1197 			if (fa->fa_tos != tos)
1198 				break;
1199 			if (fa->fa_info->fib_priority != fi->fib_priority)
1200 				break;
1201 			if (fa->fa_type == cfg->fc_type &&
1202 			    fa->fa_info == fi) {
1203 				fa_match = fa;
1204 				break;
1205 			}
1206 		}
1207 
1208 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1209 			struct fib_info *fi_drop;
1210 			u8 state;
1211 
1212 			fa = fa_first;
1213 			if (fa_match) {
1214 				if (fa == fa_match)
1215 					err = 0;
1216 				goto out;
1217 			}
1218 			err = -ENOBUFS;
1219 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1220 			if (new_fa == NULL)
1221 				goto out;
1222 
1223 			fi_drop = fa->fa_info;
1224 			new_fa->fa_tos = fa->fa_tos;
1225 			new_fa->fa_info = fi;
1226 			new_fa->fa_type = cfg->fc_type;
1227 			state = fa->fa_state;
1228 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1229 
1230 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1231 			alias_free_mem_rcu(fa);
1232 
1233 			fib_release_info(fi_drop);
1234 			if (state & FA_S_ACCESSED)
1235 				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1236 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1237 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1238 
1239 			goto succeeded;
1240 		}
1241 		/* Error if we find a perfect match which
1242 		 * uses the same scope, type, and nexthop
1243 		 * information.
1244 		 */
1245 		if (fa_match)
1246 			goto out;
1247 
1248 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1249 			fa = fa_first;
1250 	}
1251 	err = -ENOENT;
1252 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1253 		goto out;
1254 
1255 	err = -ENOBUFS;
1256 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1257 	if (new_fa == NULL)
1258 		goto out;
1259 
1260 	new_fa->fa_info = fi;
1261 	new_fa->fa_tos = tos;
1262 	new_fa->fa_type = cfg->fc_type;
1263 	new_fa->fa_state = 0;
1264 	/*
1265 	 * Insert new entry to the list.
1266 	 */
1267 
1268 	if (!fa_head) {
1269 		fa_head = fib_insert_node(t, key, plen);
1270 		if (unlikely(!fa_head)) {
1271 			err = -ENOMEM;
1272 			goto out_free_new_fa;
1273 		}
1274 	}
1275 
1276 	if (!plen)
1277 		tb->tb_num_default++;
1278 
1279 	list_add_tail_rcu(&new_fa->fa_list,
1280 			  (fa ? &fa->fa_list : fa_head));
1281 
1282 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1283 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1284 		  &cfg->fc_nlinfo, 0);
1285 succeeded:
1286 	return 0;
1287 
1288 out_free_new_fa:
1289 	kmem_cache_free(fn_alias_kmem, new_fa);
1290 out:
1291 	fib_release_info(fi);
1292 err:
1293 	return err;
1294 }
1295 
1296 static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1297 {
1298 	t_key prefix = n->key;
1299 
1300 	return (key ^ prefix) & (prefix | -prefix);
1301 }
1302 
1303 /* should be called with rcu_read_lock */
1304 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1305 		     struct fib_result *res, int fib_flags)
1306 {
1307 	struct trie *t = (struct trie *)tb->tb_data;
1308 #ifdef CONFIG_IP_FIB_TRIE_STATS
1309 	struct trie_use_stats __percpu *stats = t->stats;
1310 #endif
1311 	const t_key key = ntohl(flp->daddr);
1312 	struct tnode *n, *pn;
1313 	struct leaf_info *li;
1314 	t_key cindex;
1315 
1316 	n = rcu_dereference(t->trie);
1317 	if (!n)
1318 		return -EAGAIN;
1319 
1320 #ifdef CONFIG_IP_FIB_TRIE_STATS
1321 	this_cpu_inc(stats->gets);
1322 #endif
1323 
1324 	pn = n;
1325 	cindex = 0;
1326 
1327 	/* Step 1: Travel to the longest prefix match in the trie */
1328 	for (;;) {
1329 		unsigned long index = get_index(key, n);
1330 
1331 		/* This bit of code is a bit tricky but it combines multiple
1332 		 * checks into a single check.  The prefix consists of the
1333 		 * prefix plus zeros for the "bits" in the prefix. The index
1334 		 * is the difference between the key and this value.  From
1335 		 * this we can actually derive several pieces of data.
1336 		 *   if (index & (~0ul << bits))
1337 		 *     we have a mismatch in skip bits and failed
1338 		 *   else
1339 		 *     we know the value is cindex
1340 		 */
1341 		if (index & (~0ul << n->bits))
1342 			break;
1343 
1344 		/* we have found a leaf. Prefixes have already been compared */
1345 		if (IS_LEAF(n))
1346 			goto found;
1347 
1348 		/* only record pn and cindex if we are going to be chopping
1349 		 * bits later.  Otherwise we are just wasting cycles.
1350 		 */
1351 		if (n->slen > n->pos) {
1352 			pn = n;
1353 			cindex = index;
1354 		}
1355 
1356 		n = tnode_get_child_rcu(n, index);
1357 		if (unlikely(!n))
1358 			goto backtrace;
1359 	}
1360 
1361 	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
1362 	for (;;) {
1363 		/* record the pointer where our next node pointer is stored */
1364 		struct tnode __rcu **cptr = n->child;
1365 
1366 		/* This test verifies that none of the bits that differ
1367 		 * between the key and the prefix exist in the region of
1368 		 * the lsb and higher in the prefix.
1369 		 */
1370 		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1371 			goto backtrace;
1372 
1373 		/* exit out and process leaf */
1374 		if (unlikely(IS_LEAF(n)))
1375 			break;
1376 
1377 		/* Don't bother recording parent info.  Since we are in
1378 		 * prefix match mode we will have to come back to wherever
1379 		 * we started this traversal anyway
1380 		 */
1381 
1382 		while ((n = rcu_dereference(*cptr)) == NULL) {
1383 backtrace:
1384 #ifdef CONFIG_IP_FIB_TRIE_STATS
1385 			if (!n)
1386 				this_cpu_inc(stats->null_node_hit);
1387 #endif
1388 			/* If we are at cindex 0 there are no more bits for
1389 			 * us to strip at this level so we must ascend back
1390 			 * up one level to see if there are any more bits to
1391 			 * be stripped there.
1392 			 */
1393 			while (!cindex) {
1394 				t_key pkey = pn->key;
1395 
1396 				pn = node_parent_rcu(pn);
1397 				if (unlikely(!pn))
1398 					return -EAGAIN;
1399 #ifdef CONFIG_IP_FIB_TRIE_STATS
1400 				this_cpu_inc(stats->backtrack);
1401 #endif
1402 				/* Get Child's index */
1403 				cindex = get_index(pkey, pn);
1404 			}
1405 
1406 			/* strip the least significant bit from the cindex */
1407 			cindex &= cindex - 1;
1408 
1409 			/* grab pointer for next child node */
1410 			cptr = &pn->child[cindex];
1411 		}
1412 	}
1413 
1414 found:
1415 	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1416 	hlist_for_each_entry_rcu(li, &n->list, hlist) {
1417 		struct fib_alias *fa;
1418 
1419 		if ((key ^ n->key) & li->mask_plen)
1420 			continue;
1421 
1422 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1423 			struct fib_info *fi = fa->fa_info;
1424 			int nhsel, err;
1425 
1426 			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1427 				continue;
1428 			if (fi->fib_dead)
1429 				continue;
1430 			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1431 				continue;
1432 			fib_alias_accessed(fa);
1433 			err = fib_props[fa->fa_type].error;
1434 			if (unlikely(err < 0)) {
1435 #ifdef CONFIG_IP_FIB_TRIE_STATS
1436 				this_cpu_inc(stats->semantic_match_passed);
1437 #endif
1438 				return err;
1439 			}
1440 			if (fi->fib_flags & RTNH_F_DEAD)
1441 				continue;
1442 			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1443 				const struct fib_nh *nh = &fi->fib_nh[nhsel];
1444 
1445 				if (nh->nh_flags & RTNH_F_DEAD)
1446 					continue;
1447 				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1448 					continue;
1449 
1450 				if (!(fib_flags & FIB_LOOKUP_NOREF))
1451 					atomic_inc(&fi->fib_clntref);
1452 
1453 				res->prefixlen = li->plen;
1454 				res->nh_sel = nhsel;
1455 				res->type = fa->fa_type;
1456 				res->scope = fi->fib_scope;
1457 				res->fi = fi;
1458 				res->table = tb;
1459 				res->fa_head = &li->falh;
1460 #ifdef CONFIG_IP_FIB_TRIE_STATS
1461 				this_cpu_inc(stats->semantic_match_passed);
1462 #endif
1463 				return err;
1464 			}
1465 		}
1466 
1467 #ifdef CONFIG_IP_FIB_TRIE_STATS
1468 		this_cpu_inc(stats->semantic_match_miss);
1469 #endif
1470 	}
1471 	goto backtrace;
1472 }
1473 EXPORT_SYMBOL_GPL(fib_table_lookup);
1474 
1475 /*
1476  * Remove the leaf and return parent.
1477  */
1478 static void trie_leaf_remove(struct trie *t, struct tnode *l)
1479 {
1480 	struct tnode *tp = node_parent(l);
1481 
1482 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1483 
1484 	if (tp) {
1485 		put_child(tp, get_index(l->key, tp), NULL);
1486 		trie_rebalance(t, tp);
1487 	} else {
1488 		RCU_INIT_POINTER(t->trie, NULL);
1489 	}
1490 
1491 	node_free(l);
1492 }
1493 
1494 /*
1495  * Caller must hold RTNL.
1496  */
1497 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1498 {
1499 	struct trie *t = (struct trie *) tb->tb_data;
1500 	u32 key, mask;
1501 	int plen = cfg->fc_dst_len;
1502 	u8 tos = cfg->fc_tos;
1503 	struct fib_alias *fa, *fa_to_delete;
1504 	struct list_head *fa_head;
1505 	struct tnode *l;
1506 	struct leaf_info *li;
1507 
1508 	if (plen > 32)
1509 		return -EINVAL;
1510 
1511 	key = ntohl(cfg->fc_dst);
1512 	mask = ntohl(inet_make_mask(plen));
1513 
1514 	if (key & ~mask)
1515 		return -EINVAL;
1516 
1517 	key = key & mask;
1518 	l = fib_find_node(t, key);
1519 
1520 	if (!l)
1521 		return -ESRCH;
1522 
1523 	li = find_leaf_info(l, plen);
1524 
1525 	if (!li)
1526 		return -ESRCH;
1527 
1528 	fa_head = &li->falh;
1529 	fa = fib_find_alias(fa_head, tos, 0);
1530 
1531 	if (!fa)
1532 		return -ESRCH;
1533 
1534 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1535 
1536 	fa_to_delete = NULL;
1537 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1538 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1539 		struct fib_info *fi = fa->fa_info;
1540 
1541 		if (fa->fa_tos != tos)
1542 			break;
1543 
1544 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1545 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1546 		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1547 		    (!cfg->fc_prefsrc ||
1548 		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1549 		    (!cfg->fc_protocol ||
1550 		     fi->fib_protocol == cfg->fc_protocol) &&
1551 		    fib_nh_match(cfg, fi) == 0) {
1552 			fa_to_delete = fa;
1553 			break;
1554 		}
1555 	}
1556 
1557 	if (!fa_to_delete)
1558 		return -ESRCH;
1559 
1560 	fa = fa_to_delete;
1561 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1562 		  &cfg->fc_nlinfo, 0);
1563 
1564 	list_del_rcu(&fa->fa_list);
1565 
1566 	if (!plen)
1567 		tb->tb_num_default--;
1568 
1569 	if (list_empty(fa_head)) {
1570 		remove_leaf_info(l, li);
1571 		free_leaf_info(li);
1572 	}
1573 
1574 	if (hlist_empty(&l->list))
1575 		trie_leaf_remove(t, l);
1576 
1577 	if (fa->fa_state & FA_S_ACCESSED)
1578 		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1579 
1580 	fib_release_info(fa->fa_info);
1581 	alias_free_mem_rcu(fa);
1582 	return 0;
1583 }
1584 
1585 static int trie_flush_list(struct list_head *head)
1586 {
1587 	struct fib_alias *fa, *fa_node;
1588 	int found = 0;
1589 
1590 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1591 		struct fib_info *fi = fa->fa_info;
1592 
1593 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1594 			list_del_rcu(&fa->fa_list);
1595 			fib_release_info(fa->fa_info);
1596 			alias_free_mem_rcu(fa);
1597 			found++;
1598 		}
1599 	}
1600 	return found;
1601 }
1602 
1603 static int trie_flush_leaf(struct tnode *l)
1604 {
1605 	int found = 0;
1606 	struct hlist_head *lih = &l->list;
1607 	struct hlist_node *tmp;
1608 	struct leaf_info *li = NULL;
1609 	unsigned char plen = KEYLENGTH;
1610 
1611 	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1612 		found += trie_flush_list(&li->falh);
1613 
1614 		if (list_empty(&li->falh)) {
1615 			hlist_del_rcu(&li->hlist);
1616 			free_leaf_info(li);
1617 			continue;
1618 		}
1619 
1620 		plen = li->plen;
1621 	}
1622 
1623 	l->slen = KEYLENGTH - plen;
1624 
1625 	return found;
1626 }
1627 
1628 /*
1629  * Scan for the next right leaf starting at node p->child[idx]
1630  * Since we have back pointer, no recursion necessary.
1631  */
1632 static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1633 {
1634 	do {
1635 		unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
1636 
1637 		while (idx < tnode_child_length(p)) {
1638 			c = tnode_get_child_rcu(p, idx++);
1639 			if (!c)
1640 				continue;
1641 
1642 			if (IS_LEAF(c))
1643 				return c;
1644 
1645 			/* Rescan start scanning in new node */
1646 			p = c;
1647 			idx = 0;
1648 		}
1649 
1650 		/* Node empty, walk back up to parent */
1651 		c = p;
1652 	} while ((p = node_parent_rcu(c)) != NULL);
1653 
1654 	return NULL; /* Root of trie */
1655 }
1656 
1657 static struct tnode *trie_firstleaf(struct trie *t)
1658 {
1659 	struct tnode *n = rcu_dereference_rtnl(t->trie);
1660 
1661 	if (!n)
1662 		return NULL;
1663 
1664 	if (IS_LEAF(n))          /* trie is just a leaf */
1665 		return n;
1666 
1667 	return leaf_walk_rcu(n, NULL);
1668 }
1669 
1670 static struct tnode *trie_nextleaf(struct tnode *l)
1671 {
1672 	struct tnode *p = node_parent_rcu(l);
1673 
1674 	if (!p)
1675 		return NULL;	/* trie with just one leaf */
1676 
1677 	return leaf_walk_rcu(p, l);
1678 }
1679 
1680 static struct tnode *trie_leafindex(struct trie *t, int index)
1681 {
1682 	struct tnode *l = trie_firstleaf(t);
1683 
1684 	while (l && index-- > 0)
1685 		l = trie_nextleaf(l);
1686 
1687 	return l;
1688 }
1689 
1690 
1691 /*
1692  * Caller must hold RTNL.
1693  */
1694 int fib_table_flush(struct fib_table *tb)
1695 {
1696 	struct trie *t = (struct trie *) tb->tb_data;
1697 	struct tnode *l, *ll = NULL;
1698 	int found = 0;
1699 
1700 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1701 		found += trie_flush_leaf(l);
1702 
1703 		if (ll) {
1704 			if (hlist_empty(&ll->list))
1705 				trie_leaf_remove(t, ll);
1706 			else
1707 				leaf_pull_suffix(ll);
1708 		}
1709 
1710 		ll = l;
1711 	}
1712 
1713 	if (ll) {
1714 		if (hlist_empty(&ll->list))
1715 			trie_leaf_remove(t, ll);
1716 		else
1717 			leaf_pull_suffix(ll);
1718 	}
1719 
1720 	pr_debug("trie_flush found=%d\n", found);
1721 	return found;
1722 }
1723 
1724 void fib_free_table(struct fib_table *tb)
1725 {
1726 #ifdef CONFIG_IP_FIB_TRIE_STATS
1727 	struct trie *t = (struct trie *)tb->tb_data;
1728 
1729 	free_percpu(t->stats);
1730 #endif /* CONFIG_IP_FIB_TRIE_STATS */
1731 	kfree(tb);
1732 }
1733 
1734 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1735 			   struct fib_table *tb,
1736 			   struct sk_buff *skb, struct netlink_callback *cb)
1737 {
1738 	int i, s_i;
1739 	struct fib_alias *fa;
1740 	__be32 xkey = htonl(key);
1741 
1742 	s_i = cb->args[5];
1743 	i = 0;
1744 
1745 	/* rcu_read_lock is hold by caller */
1746 
1747 	list_for_each_entry_rcu(fa, fah, fa_list) {
1748 		if (i < s_i) {
1749 			i++;
1750 			continue;
1751 		}
1752 
1753 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1754 				  cb->nlh->nlmsg_seq,
1755 				  RTM_NEWROUTE,
1756 				  tb->tb_id,
1757 				  fa->fa_type,
1758 				  xkey,
1759 				  plen,
1760 				  fa->fa_tos,
1761 				  fa->fa_info, NLM_F_MULTI) < 0) {
1762 			cb->args[5] = i;
1763 			return -1;
1764 		}
1765 		i++;
1766 	}
1767 	cb->args[5] = i;
1768 	return skb->len;
1769 }
1770 
1771 static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1772 			struct sk_buff *skb, struct netlink_callback *cb)
1773 {
1774 	struct leaf_info *li;
1775 	int i, s_i;
1776 
1777 	s_i = cb->args[4];
1778 	i = 0;
1779 
1780 	/* rcu_read_lock is hold by caller */
1781 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1782 		if (i < s_i) {
1783 			i++;
1784 			continue;
1785 		}
1786 
1787 		if (i > s_i)
1788 			cb->args[5] = 0;
1789 
1790 		if (list_empty(&li->falh))
1791 			continue;
1792 
1793 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1794 			cb->args[4] = i;
1795 			return -1;
1796 		}
1797 		i++;
1798 	}
1799 
1800 	cb->args[4] = i;
1801 	return skb->len;
1802 }
1803 
1804 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1805 		   struct netlink_callback *cb)
1806 {
1807 	struct tnode *l;
1808 	struct trie *t = (struct trie *) tb->tb_data;
1809 	t_key key = cb->args[2];
1810 	int count = cb->args[3];
1811 
1812 	rcu_read_lock();
1813 	/* Dump starting at last key.
1814 	 * Note: 0.0.0.0/0 (ie default) is first key.
1815 	 */
1816 	if (count == 0)
1817 		l = trie_firstleaf(t);
1818 	else {
1819 		/* Normally, continue from last key, but if that is missing
1820 		 * fallback to using slow rescan
1821 		 */
1822 		l = fib_find_node(t, key);
1823 		if (!l)
1824 			l = trie_leafindex(t, count);
1825 	}
1826 
1827 	while (l) {
1828 		cb->args[2] = l->key;
1829 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1830 			cb->args[3] = count;
1831 			rcu_read_unlock();
1832 			return -1;
1833 		}
1834 
1835 		++count;
1836 		l = trie_nextleaf(l);
1837 		memset(&cb->args[4], 0,
1838 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1839 	}
1840 	cb->args[3] = count;
1841 	rcu_read_unlock();
1842 
1843 	return skb->len;
1844 }
1845 
1846 void __init fib_trie_init(void)
1847 {
1848 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1849 					  sizeof(struct fib_alias),
1850 					  0, SLAB_PANIC, NULL);
1851 
1852 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1853 					   max(sizeof(struct tnode),
1854 					       sizeof(struct leaf_info)),
1855 					   0, SLAB_PANIC, NULL);
1856 }
1857 
1858 
1859 struct fib_table *fib_trie_table(u32 id)
1860 {
1861 	struct fib_table *tb;
1862 	struct trie *t;
1863 
1864 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1865 		     GFP_KERNEL);
1866 	if (tb == NULL)
1867 		return NULL;
1868 
1869 	tb->tb_id = id;
1870 	tb->tb_default = -1;
1871 	tb->tb_num_default = 0;
1872 
1873 	t = (struct trie *) tb->tb_data;
1874 	RCU_INIT_POINTER(t->trie, NULL);
1875 #ifdef CONFIG_IP_FIB_TRIE_STATS
1876 	t->stats = alloc_percpu(struct trie_use_stats);
1877 	if (!t->stats) {
1878 		kfree(tb);
1879 		tb = NULL;
1880 	}
1881 #endif
1882 
1883 	return tb;
1884 }
1885 
1886 #ifdef CONFIG_PROC_FS
1887 /* Depth first Trie walk iterator */
1888 struct fib_trie_iter {
1889 	struct seq_net_private p;
1890 	struct fib_table *tb;
1891 	struct tnode *tnode;
1892 	unsigned int index;
1893 	unsigned int depth;
1894 };
1895 
1896 static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1897 {
1898 	unsigned long cindex = iter->index;
1899 	struct tnode *tn = iter->tnode;
1900 	struct tnode *p;
1901 
1902 	/* A single entry routing table */
1903 	if (!tn)
1904 		return NULL;
1905 
1906 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1907 		 iter->tnode, iter->index, iter->depth);
1908 rescan:
1909 	while (cindex < tnode_child_length(tn)) {
1910 		struct tnode *n = tnode_get_child_rcu(tn, cindex);
1911 
1912 		if (n) {
1913 			if (IS_LEAF(n)) {
1914 				iter->tnode = tn;
1915 				iter->index = cindex + 1;
1916 			} else {
1917 				/* push down one level */
1918 				iter->tnode = n;
1919 				iter->index = 0;
1920 				++iter->depth;
1921 			}
1922 			return n;
1923 		}
1924 
1925 		++cindex;
1926 	}
1927 
1928 	/* Current node exhausted, pop back up */
1929 	p = node_parent_rcu(tn);
1930 	if (p) {
1931 		cindex = get_index(tn->key, p) + 1;
1932 		tn = p;
1933 		--iter->depth;
1934 		goto rescan;
1935 	}
1936 
1937 	/* got root? */
1938 	return NULL;
1939 }
1940 
1941 static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1942 				       struct trie *t)
1943 {
1944 	struct tnode *n;
1945 
1946 	if (!t)
1947 		return NULL;
1948 
1949 	n = rcu_dereference(t->trie);
1950 	if (!n)
1951 		return NULL;
1952 
1953 	if (IS_TNODE(n)) {
1954 		iter->tnode = n;
1955 		iter->index = 0;
1956 		iter->depth = 1;
1957 	} else {
1958 		iter->tnode = NULL;
1959 		iter->index = 0;
1960 		iter->depth = 0;
1961 	}
1962 
1963 	return n;
1964 }
1965 
1966 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1967 {
1968 	struct tnode *n;
1969 	struct fib_trie_iter iter;
1970 
1971 	memset(s, 0, sizeof(*s));
1972 
1973 	rcu_read_lock();
1974 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1975 		if (IS_LEAF(n)) {
1976 			struct leaf_info *li;
1977 
1978 			s->leaves++;
1979 			s->totdepth += iter.depth;
1980 			if (iter.depth > s->maxdepth)
1981 				s->maxdepth = iter.depth;
1982 
1983 			hlist_for_each_entry_rcu(li, &n->list, hlist)
1984 				++s->prefixes;
1985 		} else {
1986 			s->tnodes++;
1987 			if (n->bits < MAX_STAT_DEPTH)
1988 				s->nodesizes[n->bits]++;
1989 			s->nullpointers += n->empty_children;
1990 		}
1991 	}
1992 	rcu_read_unlock();
1993 }
1994 
1995 /*
1996  *	This outputs /proc/net/fib_triestats
1997  */
1998 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1999 {
2000 	unsigned int i, max, pointers, bytes, avdepth;
2001 
2002 	if (stat->leaves)
2003 		avdepth = stat->totdepth*100 / stat->leaves;
2004 	else
2005 		avdepth = 0;
2006 
2007 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2008 		   avdepth / 100, avdepth % 100);
2009 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2010 
2011 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2012 	bytes = sizeof(struct tnode) * stat->leaves;
2013 
2014 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2015 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2016 
2017 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2018 	bytes += sizeof(struct tnode) * stat->tnodes;
2019 
2020 	max = MAX_STAT_DEPTH;
2021 	while (max > 0 && stat->nodesizes[max-1] == 0)
2022 		max--;
2023 
2024 	pointers = 0;
2025 	for (i = 1; i < max; i++)
2026 		if (stat->nodesizes[i] != 0) {
2027 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2028 			pointers += (1<<i) * stat->nodesizes[i];
2029 		}
2030 	seq_putc(seq, '\n');
2031 	seq_printf(seq, "\tPointers: %u\n", pointers);
2032 
2033 	bytes += sizeof(struct tnode *) * pointers;
2034 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2035 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2036 }
2037 
2038 #ifdef CONFIG_IP_FIB_TRIE_STATS
2039 static void trie_show_usage(struct seq_file *seq,
2040 			    const struct trie_use_stats __percpu *stats)
2041 {
2042 	struct trie_use_stats s = { 0 };
2043 	int cpu;
2044 
2045 	/* loop through all of the CPUs and gather up the stats */
2046 	for_each_possible_cpu(cpu) {
2047 		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2048 
2049 		s.gets += pcpu->gets;
2050 		s.backtrack += pcpu->backtrack;
2051 		s.semantic_match_passed += pcpu->semantic_match_passed;
2052 		s.semantic_match_miss += pcpu->semantic_match_miss;
2053 		s.null_node_hit += pcpu->null_node_hit;
2054 		s.resize_node_skipped += pcpu->resize_node_skipped;
2055 	}
2056 
2057 	seq_printf(seq, "\nCounters:\n---------\n");
2058 	seq_printf(seq, "gets = %u\n", s.gets);
2059 	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2060 	seq_printf(seq, "semantic match passed = %u\n",
2061 		   s.semantic_match_passed);
2062 	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2063 	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2064 	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2065 }
2066 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2067 
2068 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2069 {
2070 	if (tb->tb_id == RT_TABLE_LOCAL)
2071 		seq_puts(seq, "Local:\n");
2072 	else if (tb->tb_id == RT_TABLE_MAIN)
2073 		seq_puts(seq, "Main:\n");
2074 	else
2075 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2076 }
2077 
2078 
2079 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2080 {
2081 	struct net *net = (struct net *)seq->private;
2082 	unsigned int h;
2083 
2084 	seq_printf(seq,
2085 		   "Basic info: size of leaf:"
2086 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2087 		   sizeof(struct tnode), sizeof(struct tnode));
2088 
2089 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2090 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2091 		struct fib_table *tb;
2092 
2093 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2094 			struct trie *t = (struct trie *) tb->tb_data;
2095 			struct trie_stat stat;
2096 
2097 			if (!t)
2098 				continue;
2099 
2100 			fib_table_print(seq, tb);
2101 
2102 			trie_collect_stats(t, &stat);
2103 			trie_show_stats(seq, &stat);
2104 #ifdef CONFIG_IP_FIB_TRIE_STATS
2105 			trie_show_usage(seq, t->stats);
2106 #endif
2107 		}
2108 	}
2109 
2110 	return 0;
2111 }
2112 
2113 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2114 {
2115 	return single_open_net(inode, file, fib_triestat_seq_show);
2116 }
2117 
2118 static const struct file_operations fib_triestat_fops = {
2119 	.owner	= THIS_MODULE,
2120 	.open	= fib_triestat_seq_open,
2121 	.read	= seq_read,
2122 	.llseek	= seq_lseek,
2123 	.release = single_release_net,
2124 };
2125 
2126 static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2127 {
2128 	struct fib_trie_iter *iter = seq->private;
2129 	struct net *net = seq_file_net(seq);
2130 	loff_t idx = 0;
2131 	unsigned int h;
2132 
2133 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2134 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2135 		struct fib_table *tb;
2136 
2137 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2138 			struct tnode *n;
2139 
2140 			for (n = fib_trie_get_first(iter,
2141 						    (struct trie *) tb->tb_data);
2142 			     n; n = fib_trie_get_next(iter))
2143 				if (pos == idx++) {
2144 					iter->tb = tb;
2145 					return n;
2146 				}
2147 		}
2148 	}
2149 
2150 	return NULL;
2151 }
2152 
2153 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2154 	__acquires(RCU)
2155 {
2156 	rcu_read_lock();
2157 	return fib_trie_get_idx(seq, *pos);
2158 }
2159 
2160 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161 {
2162 	struct fib_trie_iter *iter = seq->private;
2163 	struct net *net = seq_file_net(seq);
2164 	struct fib_table *tb = iter->tb;
2165 	struct hlist_node *tb_node;
2166 	unsigned int h;
2167 	struct tnode *n;
2168 
2169 	++*pos;
2170 	/* next node in same table */
2171 	n = fib_trie_get_next(iter);
2172 	if (n)
2173 		return n;
2174 
2175 	/* walk rest of this hash chain */
2176 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2177 	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2178 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2179 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2180 		if (n)
2181 			goto found;
2182 	}
2183 
2184 	/* new hash chain */
2185 	while (++h < FIB_TABLE_HASHSZ) {
2186 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2187 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2188 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2189 			if (n)
2190 				goto found;
2191 		}
2192 	}
2193 	return NULL;
2194 
2195 found:
2196 	iter->tb = tb;
2197 	return n;
2198 }
2199 
2200 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2201 	__releases(RCU)
2202 {
2203 	rcu_read_unlock();
2204 }
2205 
2206 static void seq_indent(struct seq_file *seq, int n)
2207 {
2208 	while (n-- > 0)
2209 		seq_puts(seq, "   ");
2210 }
2211 
2212 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2213 {
2214 	switch (s) {
2215 	case RT_SCOPE_UNIVERSE: return "universe";
2216 	case RT_SCOPE_SITE:	return "site";
2217 	case RT_SCOPE_LINK:	return "link";
2218 	case RT_SCOPE_HOST:	return "host";
2219 	case RT_SCOPE_NOWHERE:	return "nowhere";
2220 	default:
2221 		snprintf(buf, len, "scope=%d", s);
2222 		return buf;
2223 	}
2224 }
2225 
2226 static const char *const rtn_type_names[__RTN_MAX] = {
2227 	[RTN_UNSPEC] = "UNSPEC",
2228 	[RTN_UNICAST] = "UNICAST",
2229 	[RTN_LOCAL] = "LOCAL",
2230 	[RTN_BROADCAST] = "BROADCAST",
2231 	[RTN_ANYCAST] = "ANYCAST",
2232 	[RTN_MULTICAST] = "MULTICAST",
2233 	[RTN_BLACKHOLE] = "BLACKHOLE",
2234 	[RTN_UNREACHABLE] = "UNREACHABLE",
2235 	[RTN_PROHIBIT] = "PROHIBIT",
2236 	[RTN_THROW] = "THROW",
2237 	[RTN_NAT] = "NAT",
2238 	[RTN_XRESOLVE] = "XRESOLVE",
2239 };
2240 
2241 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2242 {
2243 	if (t < __RTN_MAX && rtn_type_names[t])
2244 		return rtn_type_names[t];
2245 	snprintf(buf, len, "type %u", t);
2246 	return buf;
2247 }
2248 
2249 /* Pretty print the trie */
2250 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2251 {
2252 	const struct fib_trie_iter *iter = seq->private;
2253 	struct tnode *n = v;
2254 
2255 	if (!node_parent_rcu(n))
2256 		fib_table_print(seq, iter->tb);
2257 
2258 	if (IS_TNODE(n)) {
2259 		__be32 prf = htonl(n->key);
2260 
2261 		seq_indent(seq, iter->depth-1);
2262 		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
2263 			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2264 			   n->full_children, n->empty_children);
2265 	} else {
2266 		struct leaf_info *li;
2267 		__be32 val = htonl(n->key);
2268 
2269 		seq_indent(seq, iter->depth);
2270 		seq_printf(seq, "  |-- %pI4\n", &val);
2271 
2272 		hlist_for_each_entry_rcu(li, &n->list, hlist) {
2273 			struct fib_alias *fa;
2274 
2275 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2276 				char buf1[32], buf2[32];
2277 
2278 				seq_indent(seq, iter->depth+1);
2279 				seq_printf(seq, "  /%d %s %s", li->plen,
2280 					   rtn_scope(buf1, sizeof(buf1),
2281 						     fa->fa_info->fib_scope),
2282 					   rtn_type(buf2, sizeof(buf2),
2283 						    fa->fa_type));
2284 				if (fa->fa_tos)
2285 					seq_printf(seq, " tos=%d", fa->fa_tos);
2286 				seq_putc(seq, '\n');
2287 			}
2288 		}
2289 	}
2290 
2291 	return 0;
2292 }
2293 
2294 static const struct seq_operations fib_trie_seq_ops = {
2295 	.start  = fib_trie_seq_start,
2296 	.next   = fib_trie_seq_next,
2297 	.stop   = fib_trie_seq_stop,
2298 	.show   = fib_trie_seq_show,
2299 };
2300 
2301 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2302 {
2303 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2304 			    sizeof(struct fib_trie_iter));
2305 }
2306 
2307 static const struct file_operations fib_trie_fops = {
2308 	.owner  = THIS_MODULE,
2309 	.open   = fib_trie_seq_open,
2310 	.read   = seq_read,
2311 	.llseek = seq_lseek,
2312 	.release = seq_release_net,
2313 };
2314 
2315 struct fib_route_iter {
2316 	struct seq_net_private p;
2317 	struct trie *main_trie;
2318 	loff_t	pos;
2319 	t_key	key;
2320 };
2321 
2322 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2323 {
2324 	struct tnode *l = NULL;
2325 	struct trie *t = iter->main_trie;
2326 
2327 	/* use cache location of last found key */
2328 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2329 		pos -= iter->pos;
2330 	else {
2331 		iter->pos = 0;
2332 		l = trie_firstleaf(t);
2333 	}
2334 
2335 	while (l && pos-- > 0) {
2336 		iter->pos++;
2337 		l = trie_nextleaf(l);
2338 	}
2339 
2340 	if (l)
2341 		iter->key = pos;	/* remember it */
2342 	else
2343 		iter->pos = 0;		/* forget it */
2344 
2345 	return l;
2346 }
2347 
2348 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2349 	__acquires(RCU)
2350 {
2351 	struct fib_route_iter *iter = seq->private;
2352 	struct fib_table *tb;
2353 
2354 	rcu_read_lock();
2355 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2356 	if (!tb)
2357 		return NULL;
2358 
2359 	iter->main_trie = (struct trie *) tb->tb_data;
2360 	if (*pos == 0)
2361 		return SEQ_START_TOKEN;
2362 	else
2363 		return fib_route_get_idx(iter, *pos - 1);
2364 }
2365 
2366 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2367 {
2368 	struct fib_route_iter *iter = seq->private;
2369 	struct tnode *l = v;
2370 
2371 	++*pos;
2372 	if (v == SEQ_START_TOKEN) {
2373 		iter->pos = 0;
2374 		l = trie_firstleaf(iter->main_trie);
2375 	} else {
2376 		iter->pos++;
2377 		l = trie_nextleaf(l);
2378 	}
2379 
2380 	if (l)
2381 		iter->key = l->key;
2382 	else
2383 		iter->pos = 0;
2384 	return l;
2385 }
2386 
2387 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2388 	__releases(RCU)
2389 {
2390 	rcu_read_unlock();
2391 }
2392 
2393 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2394 {
2395 	unsigned int flags = 0;
2396 
2397 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2398 		flags = RTF_REJECT;
2399 	if (fi && fi->fib_nh->nh_gw)
2400 		flags |= RTF_GATEWAY;
2401 	if (mask == htonl(0xFFFFFFFF))
2402 		flags |= RTF_HOST;
2403 	flags |= RTF_UP;
2404 	return flags;
2405 }
2406 
2407 /*
2408  *	This outputs /proc/net/route.
2409  *	The format of the file is not supposed to be changed
2410  *	and needs to be same as fib_hash output to avoid breaking
2411  *	legacy utilities
2412  */
2413 static int fib_route_seq_show(struct seq_file *seq, void *v)
2414 {
2415 	struct tnode *l = v;
2416 	struct leaf_info *li;
2417 
2418 	if (v == SEQ_START_TOKEN) {
2419 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2420 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2421 			   "\tWindow\tIRTT");
2422 		return 0;
2423 	}
2424 
2425 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2426 		struct fib_alias *fa;
2427 		__be32 mask, prefix;
2428 
2429 		mask = inet_make_mask(li->plen);
2430 		prefix = htonl(l->key);
2431 
2432 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2433 			const struct fib_info *fi = fa->fa_info;
2434 			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2435 
2436 			if (fa->fa_type == RTN_BROADCAST
2437 			    || fa->fa_type == RTN_MULTICAST)
2438 				continue;
2439 
2440 			seq_setwidth(seq, 127);
2441 
2442 			if (fi)
2443 				seq_printf(seq,
2444 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2445 					 "%d\t%08X\t%d\t%u\t%u",
2446 					 fi->fib_dev ? fi->fib_dev->name : "*",
2447 					 prefix,
2448 					 fi->fib_nh->nh_gw, flags, 0, 0,
2449 					 fi->fib_priority,
2450 					 mask,
2451 					 (fi->fib_advmss ?
2452 					  fi->fib_advmss + 40 : 0),
2453 					 fi->fib_window,
2454 					 fi->fib_rtt >> 3);
2455 			else
2456 				seq_printf(seq,
2457 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2458 					 "%d\t%08X\t%d\t%u\t%u",
2459 					 prefix, 0, flags, 0, 0, 0,
2460 					 mask, 0, 0, 0);
2461 
2462 			seq_pad(seq, '\n');
2463 		}
2464 	}
2465 
2466 	return 0;
2467 }
2468 
2469 static const struct seq_operations fib_route_seq_ops = {
2470 	.start  = fib_route_seq_start,
2471 	.next   = fib_route_seq_next,
2472 	.stop   = fib_route_seq_stop,
2473 	.show   = fib_route_seq_show,
2474 };
2475 
2476 static int fib_route_seq_open(struct inode *inode, struct file *file)
2477 {
2478 	return seq_open_net(inode, file, &fib_route_seq_ops,
2479 			    sizeof(struct fib_route_iter));
2480 }
2481 
2482 static const struct file_operations fib_route_fops = {
2483 	.owner  = THIS_MODULE,
2484 	.open   = fib_route_seq_open,
2485 	.read   = seq_read,
2486 	.llseek = seq_lseek,
2487 	.release = seq_release_net,
2488 };
2489 
2490 int __net_init fib_proc_init(struct net *net)
2491 {
2492 	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2493 		goto out1;
2494 
2495 	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2496 			 &fib_triestat_fops))
2497 		goto out2;
2498 
2499 	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2500 		goto out3;
2501 
2502 	return 0;
2503 
2504 out3:
2505 	remove_proc_entry("fib_triestat", net->proc_net);
2506 out2:
2507 	remove_proc_entry("fib_trie", net->proc_net);
2508 out1:
2509 	return -ENOMEM;
2510 }
2511 
2512 void __net_exit fib_proc_exit(struct net *net)
2513 {
2514 	remove_proc_entry("fib_trie", net->proc_net);
2515 	remove_proc_entry("fib_triestat", net->proc_net);
2516 	remove_proc_entry("route", net->proc_net);
2517 }
2518 
2519 #endif /* CONFIG_PROC_FS */
2520