xref: /linux/net/ipv4/route.c (revision 2504075d383fcefd746dac42a0cd1c3bdc006bd1)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					though our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 
115 #define IP_MAX_MTU	0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
123 static int ip_rt_redirect_number __read_mostly	= 9;
124 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly	= HZ;
127 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly	= 8;
129 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly	= 256;
132 static int rt_chain_length_max __read_mostly	= 20;
133 
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136 
137 /*
138  *	Interface to generic destination cache.
139  */
140 
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void		 ipv4_dst_destroy(struct dst_entry *dst);
143 static void		 ipv4_dst_ifdown(struct dst_entry *dst,
144 					 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void		 ipv4_link_failure(struct sk_buff *skb);
147 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 
150 
151 static struct dst_ops ipv4_dst_ops = {
152 	.family =		AF_INET,
153 	.protocol =		cpu_to_be16(ETH_P_IP),
154 	.gc =			rt_garbage_collect,
155 	.check =		ipv4_dst_check,
156 	.destroy =		ipv4_dst_destroy,
157 	.ifdown =		ipv4_dst_ifdown,
158 	.negative_advice =	ipv4_negative_advice,
159 	.link_failure =		ipv4_link_failure,
160 	.update_pmtu =		ip_rt_update_pmtu,
161 	.local_out =		__ip_local_out,
162 };
163 
164 #define ECN_OR_COST(class)	TC_PRIO_##class
165 
166 const __u8 ip_tos2prio[16] = {
167 	TC_PRIO_BESTEFFORT,
168 	ECN_OR_COST(FILLER),
169 	TC_PRIO_BESTEFFORT,
170 	ECN_OR_COST(BESTEFFORT),
171 	TC_PRIO_BULK,
172 	ECN_OR_COST(BULK),
173 	TC_PRIO_BULK,
174 	ECN_OR_COST(BULK),
175 	TC_PRIO_INTERACTIVE,
176 	ECN_OR_COST(INTERACTIVE),
177 	TC_PRIO_INTERACTIVE,
178 	ECN_OR_COST(INTERACTIVE),
179 	TC_PRIO_INTERACTIVE_BULK,
180 	ECN_OR_COST(INTERACTIVE_BULK),
181 	TC_PRIO_INTERACTIVE_BULK,
182 	ECN_OR_COST(INTERACTIVE_BULK)
183 };
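/*
 * A sketch of typical use (illustrative; see rt_tos2priority() in
 * include/net/route.h): the table is indexed by the four TOS bits of the
 * IPv4 header shifted down by one,
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * so e.g. IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE and
 * IPTOS_THROUGHPUT (0x08) maps to TC_PRIO_BULK, while the ECN_OR_COST()
 * entries handle the case where bit 0x02 of the TOS byte
 * (minimise-cost, later reused by ECN) is also set.
 */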
184 
185 
186 /*
187  * Route cache.
188  */
189 
190 /* The locking scheme is rather straightforward:
191  *
192  * 1) Read-Copy Update protects the buckets of the central route hash.
193  * 2) Only writers remove entries, and they hold the lock
194  *    as they look at rtable reference counts.
195  * 3) Only readers acquire references to rtable entries,
196  *    they do so with atomic increments and with the
197  *    lock held.
198  */
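/*
 * A minimal sketch of the resulting read pattern (the lookup paths and the
 * /proc iterators below all follow this shape):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (keys match && !rt_is_expired(rth)) {
 *			dst_use(&rth->dst, jiffies);	-- takes a reference
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 * Writers instead take rt_hash_lock_addr(hash), publish changes with
 * rcu_assign_pointer() and free removed entries via rt_free(), which
 * defers the actual free with call_rcu_bh().
 */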
199 
200 struct rt_hash_bucket {
201 	struct rtable __rcu	*chain;
202 };
203 
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 	defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ	256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ	4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ	2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ	1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ	512
222 # else
223 #  define RT_HASH_LOCK_SZ	256
224 # endif
225 #endif
226 
227 static spinlock_t	*rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
229 
230 static __init void rt_hash_lock_init(void)
231 {
232 	int i;
233 
234 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235 			GFP_KERNEL);
236 	if (!rt_hash_locks)
237 		panic("IP: failed to allocate rt_hash_locks\n");
238 
239 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 		spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249 
250 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
251 static unsigned			rt_hash_mask __read_mostly;
252 static unsigned int		rt_hash_log  __read_mostly;
253 
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
256 
257 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
258 				   int genid)
259 {
260 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
261 			    idx, genid)
262 		& rt_hash_mask;
263 }
264 
265 static inline int rt_genid(struct net *net)
266 {
267 	return atomic_read(&net->ipv4.rt_genid);
268 }
269 
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272 	struct seq_net_private p;
273 	int bucket;
274 	int genid;
275 };
276 
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279 	struct rt_cache_iter_state *st = seq->private;
280 	struct rtable *r = NULL;
281 
282 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
284 			continue;
285 		rcu_read_lock_bh();
286 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
287 		while (r) {
288 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
289 			    r->rt_genid == st->genid)
290 				return r;
291 			r = rcu_dereference_bh(r->dst.rt_next);
292 		}
293 		rcu_read_unlock_bh();
294 	}
295 	return r;
296 }
297 
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 					  struct rtable *r)
300 {
301 	struct rt_cache_iter_state *st = seq->private;
302 
303 	r = rcu_dereference_bh(r->dst.rt_next);
304 	while (!r) {
305 		rcu_read_unlock_bh();
306 		do {
307 			if (--st->bucket < 0)
308 				return NULL;
309 		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
310 		rcu_read_lock_bh();
311 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
312 	}
313 	return r;
314 }
315 
316 static struct rtable *rt_cache_get_next(struct seq_file *seq,
317 					struct rtable *r)
318 {
319 	struct rt_cache_iter_state *st = seq->private;
320 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
321 		if (dev_net(r->dst.dev) != seq_file_net(seq))
322 			continue;
323 		if (r->rt_genid == st->genid)
324 			break;
325 	}
326 	return r;
327 }
328 
329 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
330 {
331 	struct rtable *r = rt_cache_get_first(seq);
332 
333 	if (r)
334 		while (pos && (r = rt_cache_get_next(seq, r)))
335 			--pos;
336 	return pos ? NULL : r;
337 }
338 
339 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
340 {
341 	struct rt_cache_iter_state *st = seq->private;
342 	if (*pos)
343 		return rt_cache_get_idx(seq, *pos - 1);
344 	st->genid = rt_genid(seq_file_net(seq));
345 	return SEQ_START_TOKEN;
346 }
347 
348 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
349 {
350 	struct rtable *r;
351 
352 	if (v == SEQ_START_TOKEN)
353 		r = rt_cache_get_first(seq);
354 	else
355 		r = rt_cache_get_next(seq, v);
356 	++*pos;
357 	return r;
358 }
359 
360 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
361 {
362 	if (v && v != SEQ_START_TOKEN)
363 		rcu_read_unlock_bh();
364 }
365 
366 static int rt_cache_seq_show(struct seq_file *seq, void *v)
367 {
368 	if (v == SEQ_START_TOKEN)
369 		seq_printf(seq, "%-127s\n",
370 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
371 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
372 			   "HHUptod\tSpecDst");
373 	else {
374 		struct rtable *r = v;
375 		int len;
376 
377 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
378 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
379 			r->dst.dev ? r->dst.dev->name : "*",
380 			(__force u32)r->rt_dst,
381 			(__force u32)r->rt_gateway,
382 			r->rt_flags, atomic_read(&r->dst.__refcnt),
383 			r->dst.__use, 0, (__force u32)r->rt_src,
384 			(dst_metric(&r->dst, RTAX_ADVMSS) ?
385 			     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
386 			dst_metric(&r->dst, RTAX_WINDOW),
387 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
388 			      dst_metric(&r->dst, RTAX_RTTVAR)),
389 			r->fl.fl4_tos,
390 			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
391 			r->dst.hh ? (r->dst.hh->hh_output ==
392 				       dev_queue_xmit) : 0,
393 			r->rt_spec_dst, &len);
394 
395 		seq_printf(seq, "%*s\n", 127 - len, "");
396 	}
397 	return 0;
398 }
399 
400 static const struct seq_operations rt_cache_seq_ops = {
401 	.start  = rt_cache_seq_start,
402 	.next   = rt_cache_seq_next,
403 	.stop   = rt_cache_seq_stop,
404 	.show   = rt_cache_seq_show,
405 };
406 
407 static int rt_cache_seq_open(struct inode *inode, struct file *file)
408 {
409 	return seq_open_net(inode, file, &rt_cache_seq_ops,
410 			sizeof(struct rt_cache_iter_state));
411 }
412 
413 static const struct file_operations rt_cache_seq_fops = {
414 	.owner	 = THIS_MODULE,
415 	.open	 = rt_cache_seq_open,
416 	.read	 = seq_read,
417 	.llseek	 = seq_lseek,
418 	.release = seq_release_net,
419 };
420 
421 
422 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
423 {
424 	int cpu;
425 
426 	if (*pos == 0)
427 		return SEQ_START_TOKEN;
428 
429 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
430 		if (!cpu_possible(cpu))
431 			continue;
432 		*pos = cpu+1;
433 		return &per_cpu(rt_cache_stat, cpu);
434 	}
435 	return NULL;
436 }
437 
438 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
439 {
440 	int cpu;
441 
442 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
443 		if (!cpu_possible(cpu))
444 			continue;
445 		*pos = cpu+1;
446 		return &per_cpu(rt_cache_stat, cpu);
447 	}
448 	return NULL;
449 
450 }
451 
452 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
453 {
454 
455 }
456 
457 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
458 {
459 	struct rt_cache_stat *st = v;
460 
461 	if (v == SEQ_START_TOKEN) {
462 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
463 		return 0;
464 	}
465 
466 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
467 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
468 		   dst_entries_get_slow(&ipv4_dst_ops),
469 		   st->in_hit,
470 		   st->in_slow_tot,
471 		   st->in_slow_mc,
472 		   st->in_no_route,
473 		   st->in_brd,
474 		   st->in_martian_dst,
475 		   st->in_martian_src,
476 
477 		   st->out_hit,
478 		   st->out_slow_tot,
479 		   st->out_slow_mc,
480 
481 		   st->gc_total,
482 		   st->gc_ignored,
483 		   st->gc_goal_miss,
484 		   st->gc_dst_overflow,
485 		   st->in_hlist_search,
486 		   st->out_hlist_search
487 		);
488 	return 0;
489 }
490 
491 static const struct seq_operations rt_cpu_seq_ops = {
492 	.start  = rt_cpu_seq_start,
493 	.next   = rt_cpu_seq_next,
494 	.stop   = rt_cpu_seq_stop,
495 	.show   = rt_cpu_seq_show,
496 };
497 
498 
499 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
500 {
501 	return seq_open(file, &rt_cpu_seq_ops);
502 }
503 
504 static const struct file_operations rt_cpu_seq_fops = {
505 	.owner	 = THIS_MODULE,
506 	.open	 = rt_cpu_seq_open,
507 	.read	 = seq_read,
508 	.llseek	 = seq_lseek,
509 	.release = seq_release,
510 };
511 
512 #ifdef CONFIG_NET_CLS_ROUTE
513 static int rt_acct_proc_show(struct seq_file *m, void *v)
514 {
515 	struct ip_rt_acct *dst, *src;
516 	unsigned int i, j;
517 
518 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
519 	if (!dst)
520 		return -ENOMEM;
521 
522 	for_each_possible_cpu(i) {
523 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
524 		for (j = 0; j < 256; j++) {
525 			dst[j].o_bytes   += src[j].o_bytes;
526 			dst[j].o_packets += src[j].o_packets;
527 			dst[j].i_bytes   += src[j].i_bytes;
528 			dst[j].i_packets += src[j].i_packets;
529 		}
530 	}
531 
532 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
533 	kfree(dst);
534 	return 0;
535 }
536 
537 static int rt_acct_proc_open(struct inode *inode, struct file *file)
538 {
539 	return single_open(file, rt_acct_proc_show, NULL);
540 }
541 
542 static const struct file_operations rt_acct_proc_fops = {
543 	.owner		= THIS_MODULE,
544 	.open		= rt_acct_proc_open,
545 	.read		= seq_read,
546 	.llseek		= seq_lseek,
547 	.release	= single_release,
548 };
549 #endif
550 
551 static int __net_init ip_rt_do_proc_init(struct net *net)
552 {
553 	struct proc_dir_entry *pde;
554 
555 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
556 			&rt_cache_seq_fops);
557 	if (!pde)
558 		goto err1;
559 
560 	pde = proc_create("rt_cache", S_IRUGO,
561 			  net->proc_net_stat, &rt_cpu_seq_fops);
562 	if (!pde)
563 		goto err2;
564 
565 #ifdef CONFIG_NET_CLS_ROUTE
566 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
567 	if (!pde)
568 		goto err3;
569 #endif
570 	return 0;
571 
572 #ifdef CONFIG_NET_CLS_ROUTE
573 err3:
574 	remove_proc_entry("rt_cache", net->proc_net_stat);
575 #endif
576 err2:
577 	remove_proc_entry("rt_cache", net->proc_net);
578 err1:
579 	return -ENOMEM;
580 }
581 
582 static void __net_exit ip_rt_do_proc_exit(struct net *net)
583 {
584 	remove_proc_entry("rt_cache", net->proc_net_stat);
585 	remove_proc_entry("rt_cache", net->proc_net);
586 #ifdef CONFIG_NET_CLS_ROUTE
587 	remove_proc_entry("rt_acct", net->proc_net);
588 #endif
589 }
590 
591 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
592 	.init = ip_rt_do_proc_init,
593 	.exit = ip_rt_do_proc_exit,
594 };
595 
596 static int __init ip_rt_proc_init(void)
597 {
598 	return register_pernet_subsys(&ip_rt_proc_ops);
599 }
600 
601 #else
602 static inline int ip_rt_proc_init(void)
603 {
604 	return 0;
605 }
606 #endif /* CONFIG_PROC_FS */
607 
608 static inline void rt_free(struct rtable *rt)
609 {
610 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
611 }
612 
613 static inline void rt_drop(struct rtable *rt)
614 {
615 	ip_rt_put(rt);
616 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
617 }
618 
619 static inline int rt_fast_clean(struct rtable *rth)
620 {
621 	/* Kill broadcast/multicast entries very aggressively, if they
622 	   collide in the hash table with more useful entries */
623 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
624 		rth->fl.iif && rth->dst.rt_next;
625 }
626 
627 static inline int rt_valuable(struct rtable *rth)
628 {
629 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
630 		rth->dst.expires;
631 }
632 
633 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
634 {
635 	unsigned long age;
636 	int ret = 0;
637 
638 	if (atomic_read(&rth->dst.__refcnt))
639 		goto out;
640 
641 	ret = 1;
642 	if (rth->dst.expires &&
643 	    time_after_eq(jiffies, rth->dst.expires))
644 		goto out;
645 
646 	age = jiffies - rth->dst.lastuse;
647 	ret = 0;
648 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
649 	    (age <= tmo2 && rt_valuable(rth)))
650 		goto out;
651 	ret = 1;
652 out:	return ret;
653 }
654 
655 /* Bits of score are:
656  * 31: very valuable
657  * 30: not quite useless
658  * 29..0: usage counter
659  */
660 static inline u32 rt_score(struct rtable *rt)
661 {
662 	u32 score = jiffies - rt->dst.lastuse;
663 
664 	score = ~score & ~(3<<30);
665 
666 	if (rt_valuable(rt))
667 		score |= (1<<31);
668 
669 	if (!rt->fl.iif ||
670 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
671 		score |= (1<<30);
672 
673 	return score;
674 }
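/*
 * For example, an unreferenced output-route entry (iif == 0) last used
 * 1000 jiffies ago scores (~(u32)1000 & ~(3u << 30)) | (1u << 30)
 * = 0x7ffffc17, while an unreferenced input broadcast entry keeps only
 * its inverted age in bits 29..0 and therefore ranks below any output or
 * "valuable" entry when rt_intern_hash() picks its lowest-scoring
 * candidate for eviction.
 */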
675 
676 static inline bool rt_caching(const struct net *net)
677 {
678 	return net->ipv4.current_rt_cache_rebuild_count <=
679 		net->ipv4.sysctl_rt_cache_rebuild_count;
680 }
681 
682 static inline bool compare_hash_inputs(const struct flowi *fl1,
683 					const struct flowi *fl2)
684 {
685 	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
686 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
687 		(fl1->iif ^ fl2->iif)) == 0);
688 }
689 
690 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
691 {
692 	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
693 		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
694 		(fl1->mark ^ fl2->mark) |
695 		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
696 		(fl1->oif ^ fl2->oif) |
697 		(fl1->iif ^ fl2->iif)) == 0;
698 }
699 
700 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
701 {
702 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
703 }
704 
705 static inline int rt_is_expired(struct rtable *rth)
706 {
707 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
708 }
709 
710 /*
711  * Perform a full scan of the hash table and free all entries.
712  * Can be called by a softirq or a process.
713  * In the latter case, we want to reschedule if necessary.
714  */
715 static void rt_do_flush(int process_context)
716 {
717 	unsigned int i;
718 	struct rtable *rth, *next;
719 	struct rtable * tail;
720 
721 	for (i = 0; i <= rt_hash_mask; i++) {
722 		if (process_context && need_resched())
723 			cond_resched();
724 		rth = rcu_dereference_raw(rt_hash_table[i].chain);
725 		if (!rth)
726 			continue;
727 
728 		spin_lock_bh(rt_hash_lock_addr(i));
729 #ifdef CONFIG_NET_NS
730 		{
731 		struct rtable __rcu **prev;
732 		struct rtable *p;
733 
734 		rth = rcu_dereference_protected(rt_hash_table[i].chain,
735 			lockdep_is_held(rt_hash_lock_addr(i)));
736 
737 		/* defer releasing the head of the list until after spin_unlock */
738 		for (tail = rth; tail;
739 		     tail = rcu_dereference_protected(tail->dst.rt_next,
740 				lockdep_is_held(rt_hash_lock_addr(i))))
741 			if (!rt_is_expired(tail))
742 				break;
743 		if (rth != tail)
744 			rt_hash_table[i].chain = tail;
745 
746 		/* call rt_free on entries after the tail requiring flush */
747 		prev = &rt_hash_table[i].chain;
748 		for (p = rcu_dereference_protected(*prev,
749 				lockdep_is_held(rt_hash_lock_addr(i)));
750 		     p != NULL;
751 		     p = next) {
752 			next = rcu_dereference_protected(p->dst.rt_next,
753 				lockdep_is_held(rt_hash_lock_addr(i)));
754 			if (!rt_is_expired(p)) {
755 				prev = &p->dst.rt_next;
756 			} else {
757 				*prev = next;
758 				rt_free(p);
759 			}
760 		}
761 		}
762 #else
763 		rth = rcu_dereference_protected(rt_hash_table[i].chain,
764 			lockdep_is_held(rt_hash_lock_addr(i)));
765 		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
766 		tail = NULL;
767 #endif
768 		spin_unlock_bh(rt_hash_lock_addr(i));
769 
770 		for (; rth != tail; rth = next) {
771 			next = rcu_dereference_protected(rth->dst.rt_next, 1);
772 			rt_free(rth);
773 		}
774 	}
775 }
776 
777 /*
778  * While freeing expired entries, we compute average chain length
779  * and standard deviation, using fixed-point arithmetic.
780  * This gives an estimate of rt_chain_length_max:
781  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
782  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
783  */
784 
785 #define FRACT_BITS 3
786 #define ONE (1UL << FRACT_BITS)
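/*
 * A worked example with made-up numbers: has_noalias() contributes ONE
 * (= 8) per entry with distinct hash inputs, so chain lengths accumulate
 * in units of 1/8.  If half of 1024 sampled buckets hold one such entry
 * (length 8) and half hold three (length 24), then sum = 16384 and
 * sum2 = 327680, giving
 *
 *	avg = 16384 / 1024 = 16			(2.0 entries per chain)
 *	sd  = int_sqrt(327680/1024 - 16*16) = int_sqrt(64) = 8
 *	rt_chain_length_max = max(ip_rt_gc_elasticity, (16 + 4*8) >> 3)
 *			    = max(8, 6) = 8
 */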
787 
788 /*
789  * Given a hash chain and an item in this hash chain,
790  * determine whether a previous entry has the same hash_inputs
791  * (but differs on tos, mark or oif).
792  * Returns 0 if an alias is found.
793  * Returns ONE if rth has no alias before itself.
794  */
795 static int has_noalias(const struct rtable *head, const struct rtable *rth)
796 {
797 	const struct rtable *aux = head;
798 
799 	while (aux != rth) {
800 		if (compare_hash_inputs(&aux->fl, &rth->fl))
801 			return 0;
802 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
803 	}
804 	return ONE;
805 }
806 
807 static void rt_check_expire(void)
808 {
809 	static unsigned int rover;
810 	unsigned int i = rover, goal;
811 	struct rtable *rth;
812 	struct rtable __rcu **rthp;
813 	unsigned long samples = 0;
814 	unsigned long sum = 0, sum2 = 0;
815 	unsigned long delta;
816 	u64 mult;
817 
818 	delta = jiffies - expires_ljiffies;
819 	expires_ljiffies = jiffies;
820 	mult = ((u64)delta) << rt_hash_log;
821 	if (ip_rt_gc_timeout > 1)
822 		do_div(mult, ip_rt_gc_timeout);
823 	goal = (unsigned int)mult;
824 	if (goal > rt_hash_mask)
825 		goal = rt_hash_mask + 1;
826 	for (; goal > 0; goal--) {
827 		unsigned long tmo = ip_rt_gc_timeout;
828 		unsigned long length;
829 
830 		i = (i + 1) & rt_hash_mask;
831 		rthp = &rt_hash_table[i].chain;
832 
833 		if (need_resched())
834 			cond_resched();
835 
836 		samples++;
837 
838 		if (rcu_dereference_raw(*rthp) == NULL)
839 			continue;
840 		length = 0;
841 		spin_lock_bh(rt_hash_lock_addr(i));
842 		while ((rth = rcu_dereference_protected(*rthp,
843 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
844 			prefetch(rth->dst.rt_next);
845 			if (rt_is_expired(rth)) {
846 				*rthp = rth->dst.rt_next;
847 				rt_free(rth);
848 				continue;
849 			}
850 			if (rth->dst.expires) {
851 				/* Entry is expired even if it is in use */
852 				if (time_before_eq(jiffies, rth->dst.expires)) {
853 nofree:
854 					tmo >>= 1;
855 					rthp = &rth->dst.rt_next;
856 					/*
857 					 * We only count entries on
858 					 * a chain with equal hash inputs once,
859 					 * so that entries for different QoS
860 					 * levels and other non-hash-input
861 					 * attributes don't unfairly skew
862 					 * the length computation.
863 					 */
864 					length += has_noalias(rt_hash_table[i].chain, rth);
865 					continue;
866 				}
867 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
868 				goto nofree;
869 
870 			/* Cleanup aged off entries. */
871 			*rthp = rth->dst.rt_next;
872 			rt_free(rth);
873 		}
874 		spin_unlock_bh(rt_hash_lock_addr(i));
875 		sum += length;
876 		sum2 += length*length;
877 	}
878 	if (samples) {
879 		unsigned long avg = sum / samples;
880 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
881 		rt_chain_length_max = max_t(unsigned long,
882 					ip_rt_gc_elasticity,
883 					(avg + 4*sd) >> FRACT_BITS);
884 	}
885 	rover = i;
886 }
887 
888 /*
889  * rt_worker_func() is run in process context.
890  * We call rt_check_expire() to scan part of the hash table.
891  */
892 static void rt_worker_func(struct work_struct *work)
893 {
894 	rt_check_expire();
895 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
896 }
897 
898 /*
899  * Perturbation of rt_genid by a small quantity [1..256].
900  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
901  * many times (2^24) without reusing a recent rt_genid.
902  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
903  */
904 static void rt_cache_invalidate(struct net *net)
905 {
906 	unsigned char shuffle;
907 
908 	get_random_bytes(&shuffle, sizeof(shuffle));
909 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
910 }
911 
912 /*
913  * delay < 0  : invalidate cache (fast : entries will be deleted later)
914  * delay >= 0 : invalidate & flush cache (can be long)
915  */
916 void rt_cache_flush(struct net *net, int delay)
917 {
918 	rt_cache_invalidate(net);
919 	if (delay >= 0)
920 		rt_do_flush(!in_softirq());
921 }
922 
923 /* Flush previous cache invalidated entries from the cache */
924 void rt_cache_flush_batch(void)
925 {
926 	rt_do_flush(!in_softirq());
927 }
928 
929 static void rt_emergency_hash_rebuild(struct net *net)
930 {
931 	if (net_ratelimit())
932 		printk(KERN_WARNING "Route hash chain too long!\n");
933 	rt_cache_invalidate(net);
934 }
935 
936 /*
937    Short description of GC goals.
938 
939    We want to build an algorithm which keeps the routing cache
940    at some equilibrium point, where the number of aged-off entries
941    stays approximately equal to the number of newly generated ones.
942 
943    The current expiration strength is the variable "expire".
944    We try to adjust it dynamically, so that when networking
945    is idle, expire is large enough to keep enough warm entries,
946    and when load increases it shrinks to limit the cache size.
947  */
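/*
 * A rough numeric sketch (sizes made up for illustration): with
 * rt_hash_log = 17 (128K buckets) and the default ip_rt_gc_elasticity
 * of 8, the "dangerous area" below begins once the cache exceeds
 * ip_rt_gc_elasticity << rt_hash_log = 8 * 131072 = 1048576 entries;
 * below that, the goal is measured against a gc_thresh-based
 * equilibrium instead, and each pass that misses its goal halves
 * "expire" before trying again.
 */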
948 
949 static int rt_garbage_collect(struct dst_ops *ops)
950 {
951 	static unsigned long expire = RT_GC_TIMEOUT;
952 	static unsigned long last_gc;
953 	static int rover;
954 	static int equilibrium;
955 	struct rtable *rth;
956 	struct rtable __rcu **rthp;
957 	unsigned long now = jiffies;
958 	int goal;
959 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
960 
961 	/*
962 	 * Garbage collection is pretty expensive,
963 	 * so do not run it too frequently.
964 	 */
965 
966 	RT_CACHE_STAT_INC(gc_total);
967 
968 	if (now - last_gc < ip_rt_gc_min_interval &&
969 	    entries < ip_rt_max_size) {
970 		RT_CACHE_STAT_INC(gc_ignored);
971 		goto out;
972 	}
973 
974 	entries = dst_entries_get_slow(&ipv4_dst_ops);
975 	/* Calculate number of entries, which we want to expire now. */
976 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
977 	if (goal <= 0) {
978 		if (equilibrium < ipv4_dst_ops.gc_thresh)
979 			equilibrium = ipv4_dst_ops.gc_thresh;
980 		goal = entries - equilibrium;
981 		if (goal > 0) {
982 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
983 			goal = entries - equilibrium;
984 		}
985 	} else {
986 		/* We are in a dangerous area. Try to reduce the cache really
987 		 * aggressively.
988 		 */
989 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
990 		equilibrium = entries - goal;
991 	}
992 
993 	if (now - last_gc >= ip_rt_gc_min_interval)
994 		last_gc = now;
995 
996 	if (goal <= 0) {
997 		equilibrium += goal;
998 		goto work_done;
999 	}
1000 
1001 	do {
1002 		int i, k;
1003 
1004 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1005 			unsigned long tmo = expire;
1006 
1007 			k = (k + 1) & rt_hash_mask;
1008 			rthp = &rt_hash_table[k].chain;
1009 			spin_lock_bh(rt_hash_lock_addr(k));
1010 			while ((rth = rcu_dereference_protected(*rthp,
1011 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1012 				if (!rt_is_expired(rth) &&
1013 					!rt_may_expire(rth, tmo, expire)) {
1014 					tmo >>= 1;
1015 					rthp = &rth->dst.rt_next;
1016 					continue;
1017 				}
1018 				*rthp = rth->dst.rt_next;
1019 				rt_free(rth);
1020 				goal--;
1021 			}
1022 			spin_unlock_bh(rt_hash_lock_addr(k));
1023 			if (goal <= 0)
1024 				break;
1025 		}
1026 		rover = k;
1027 
1028 		if (goal <= 0)
1029 			goto work_done;
1030 
1031 		/* Goal is not achieved. We stop the process if:
1032 
1033 		   - expire has been reduced to zero; otherwise expire is halved.
1034 		   - the table is not full.
1035 		   - we are called from interrupt context.
1036 		   - the jiffies check is just a fallback/debug loop breaker;
1037 		     we will not spin here for a long time in any case.
1038 		 */
1039 
1040 		RT_CACHE_STAT_INC(gc_goal_miss);
1041 
1042 		if (expire == 0)
1043 			break;
1044 
1045 		expire >>= 1;
1046 #if RT_CACHE_DEBUG >= 2
1047 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1048 				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1049 #endif
1050 
1051 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1052 			goto out;
1053 	} while (!in_softirq() && time_before_eq(jiffies, now));
1054 
1055 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1056 		goto out;
1057 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1058 		goto out;
1059 	if (net_ratelimit())
1060 		printk(KERN_WARNING "dst cache overflow\n");
1061 	RT_CACHE_STAT_INC(gc_dst_overflow);
1062 	return 1;
1063 
1064 work_done:
1065 	expire += ip_rt_gc_min_interval;
1066 	if (expire > ip_rt_gc_timeout ||
1067 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1068 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1069 		expire = ip_rt_gc_timeout;
1070 #if RT_CACHE_DEBUG >= 2
1071 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1072 			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1073 #endif
1074 out:	return 0;
1075 }
1076 
1077 /*
1078  * Returns number of entries in a hash chain that have different hash_inputs
1079  */
1080 static int slow_chain_length(const struct rtable *head)
1081 {
1082 	int length = 0;
1083 	const struct rtable *rth = head;
1084 
1085 	while (rth) {
1086 		length += has_noalias(head, rth);
1087 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1088 	}
1089 	return length >> FRACT_BITS;
1090 }
1091 
1092 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1093 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
1094 {
1095 	struct rtable	*rth, *cand;
1096 	struct rtable __rcu **rthp, **candp;
1097 	unsigned long	now;
1098 	u32 		min_score;
1099 	int		chain_length;
1100 	int attempts = !in_softirq();
1101 
1102 restart:
1103 	chain_length = 0;
1104 	min_score = ~(u32)0;
1105 	cand = NULL;
1106 	candp = NULL;
1107 	now = jiffies;
1108 
1109 	if (!rt_caching(dev_net(rt->dst.dev))) {
1110 		/*
1111 		 * If we're not caching, just tell the caller we
1112 		 * were successful and don't touch the route.  The
1113 		 * caller holds the sole reference to the cache entry, and
1114 		 * it will be released when the caller is done with it.
1115 		 * If we drop it here, the callers have no way to resolve routes
1116 		 * when we're not caching.  Instead, just point *rp at rt, so
1117 		 * the caller gets a single use out of the route.
1118 		 * Note that we do rt_free on this new route entry, so that
1119 		 * once its refcount hits zero, we are still able to reap it
1120 		 * (Thanks Alexey)
1121 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1122 		 * we set DST_NOCACHE so that dst_release() can free dst without
1123 		 * waiting for a grace period.
1124 		 */
1125 
1126 		rt->dst.flags |= DST_NOCACHE;
1127 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1128 			int err = arp_bind_neighbour(&rt->dst);
1129 			if (err) {
1130 				if (net_ratelimit())
1131 					printk(KERN_WARNING
1132 					    "Neighbour table failure & not caching routes.\n");
1133 				ip_rt_put(rt);
1134 				return err;
1135 			}
1136 		}
1137 
1138 		goto skip_hashing;
1139 	}
1140 
1141 	rthp = &rt_hash_table[hash].chain;
1142 
1143 	spin_lock_bh(rt_hash_lock_addr(hash));
1144 	while ((rth = rcu_dereference_protected(*rthp,
1145 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1146 		if (rt_is_expired(rth)) {
1147 			*rthp = rth->dst.rt_next;
1148 			rt_free(rth);
1149 			continue;
1150 		}
1151 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1152 			/* Put it first */
1153 			*rthp = rth->dst.rt_next;
1154 			/*
1155 			 * Since lookup is lockfree, the deletion
1156 			 * must be visible to another weakly ordered CPU before
1157 			 * the insertion at the start of the hash chain.
1158 			 */
1159 			rcu_assign_pointer(rth->dst.rt_next,
1160 					   rt_hash_table[hash].chain);
1161 			/*
1162 			 * Since lookup is lockfree, the update writes
1163 			 * must be ordered for consistency on SMP.
1164 			 */
1165 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1166 
1167 			dst_use(&rth->dst, now);
1168 			spin_unlock_bh(rt_hash_lock_addr(hash));
1169 
1170 			rt_drop(rt);
1171 			if (rp)
1172 				*rp = rth;
1173 			else
1174 				skb_dst_set(skb, &rth->dst);
1175 			return 0;
1176 		}
1177 
1178 		if (!atomic_read(&rth->dst.__refcnt)) {
1179 			u32 score = rt_score(rth);
1180 
1181 			if (score <= min_score) {
1182 				cand = rth;
1183 				candp = rthp;
1184 				min_score = score;
1185 			}
1186 		}
1187 
1188 		chain_length++;
1189 
1190 		rthp = &rth->dst.rt_next;
1191 	}
1192 
1193 	if (cand) {
1194 		/* ip_rt_gc_elasticity used to be the average chain
1195 		 * length; when it is exceeded, gc becomes really aggressive.
1196 		 *
1197 		 * The second limit is less certain. At the moment it allows
1198 		 * only 2 entries per bucket. We will see.
1199 		 */
1200 		if (chain_length > ip_rt_gc_elasticity) {
1201 			*candp = cand->dst.rt_next;
1202 			rt_free(cand);
1203 		}
1204 	} else {
1205 		if (chain_length > rt_chain_length_max &&
1206 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1207 			struct net *net = dev_net(rt->dst.dev);
1208 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1209 			if (!rt_caching(net)) {
1210 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1211 					rt->dst.dev->name, num);
1212 			}
1213 			rt_emergency_hash_rebuild(net);
1214 			spin_unlock_bh(rt_hash_lock_addr(hash));
1215 
1216 			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1217 					ifindex, rt_genid(net));
1218 			goto restart;
1219 		}
1220 	}
1221 
1222 	/* Try to bind the route to ARP only if it is an output
1223 	   route or on the unicast forwarding path.
1224 	 */
1225 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1226 		int err = arp_bind_neighbour(&rt->dst);
1227 		if (err) {
1228 			spin_unlock_bh(rt_hash_lock_addr(hash));
1229 
1230 			if (err != -ENOBUFS) {
1231 				rt_drop(rt);
1232 				return err;
1233 			}
1234 
1235 			/* Neighbour tables are full and nothing
1236 			   can be released. Try to shrink the route cache;
1237 			   it most likely holds some neighbour records.
1238 			 */
1239 			if (attempts-- > 0) {
1240 				int saved_elasticity = ip_rt_gc_elasticity;
1241 				int saved_int = ip_rt_gc_min_interval;
1242 				ip_rt_gc_elasticity	= 1;
1243 				ip_rt_gc_min_interval	= 0;
1244 				rt_garbage_collect(&ipv4_dst_ops);
1245 				ip_rt_gc_min_interval	= saved_int;
1246 				ip_rt_gc_elasticity	= saved_elasticity;
1247 				goto restart;
1248 			}
1249 
1250 			if (net_ratelimit())
1251 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1252 			rt_drop(rt);
1253 			return -ENOBUFS;
1254 		}
1255 	}
1256 
1257 	rt->dst.rt_next = rt_hash_table[hash].chain;
1258 
1259 #if RT_CACHE_DEBUG >= 2
1260 	if (rt->dst.rt_next) {
1261 		struct rtable *trt;
1262 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1263 		       hash, &rt->rt_dst);
1264 		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1265 			printk(" . %pI4", &trt->rt_dst);
1266 		printk("\n");
1267 	}
1268 #endif
1269 	/*
1270 	 * Since lookup is lockfree, we must make sure
1271 	 * previous writes to rt are committed to memory
1272 	 * before making rt visible to other CPUS.
1273 	 */
1274 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1275 
1276 	spin_unlock_bh(rt_hash_lock_addr(hash));
1277 
1278 skip_hashing:
1279 	if (rp)
1280 		*rp = rt;
1281 	else
1282 		skb_dst_set(skb, &rt->dst);
1283 	return 0;
1284 }
1285 
1286 void rt_bind_peer(struct rtable *rt, int create)
1287 {
1288 	struct inet_peer *peer;
1289 
1290 	peer = inet_getpeer(rt->rt_dst, create);
1291 
1292 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1293 		inet_putpeer(peer);
1294 }
1295 
1296 /*
1297  * Peer allocation may fail only in serious out-of-memory conditions.  However
1298  * we can still generate some output.
1299  * Random ID selection looks a bit dangerous because we have no chance of
1300  * selecting an ID that is unique within a reasonable period of time.
1301  * But a broken packet identifier may be better than no packet at all.
1302  */
1303 static void ip_select_fb_ident(struct iphdr *iph)
1304 {
1305 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1306 	static u32 ip_fallback_id;
1307 	u32 salt;
1308 
1309 	spin_lock_bh(&ip_fb_id_lock);
1310 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1311 	iph->id = htons(salt & 0xFFFF);
1312 	ip_fallback_id = salt;
1313 	spin_unlock_bh(&ip_fb_id_lock);
1314 }
1315 
1316 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1317 {
1318 	struct rtable *rt = (struct rtable *) dst;
1319 
1320 	if (rt) {
1321 		if (rt->peer == NULL)
1322 			rt_bind_peer(rt, 1);
1323 
1324 		/* If peer is attached to destination, it is never detached,
1325 		   so we need not grab a lock to dereference it.
1326 		 */
1327 		if (rt->peer) {
1328 			iph->id = htons(inet_getid(rt->peer, more));
1329 			return;
1330 		}
1331 	} else
1332 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1333 		       __builtin_return_address(0));
1334 
1335 	ip_select_fb_ident(iph);
1336 }
1337 EXPORT_SYMBOL(__ip_select_ident);
1338 
1339 static void rt_del(unsigned hash, struct rtable *rt)
1340 {
1341 	struct rtable __rcu **rthp;
1342 	struct rtable *aux;
1343 
1344 	rthp = &rt_hash_table[hash].chain;
1345 	spin_lock_bh(rt_hash_lock_addr(hash));
1346 	ip_rt_put(rt);
1347 	while ((aux = rcu_dereference_protected(*rthp,
1348 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1349 		if (aux == rt || rt_is_expired(aux)) {
1350 			*rthp = aux->dst.rt_next;
1351 			rt_free(aux);
1352 			continue;
1353 		}
1354 		rthp = &aux->dst.rt_next;
1355 	}
1356 	spin_unlock_bh(rt_hash_lock_addr(hash));
1357 }
1358 
1359 /* called in rcu_read_lock() section */
1360 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1361 		    __be32 saddr, struct net_device *dev)
1362 {
1363 	int i, k;
1364 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1365 	struct rtable *rth;
1366 	struct rtable __rcu **rthp;
1367 	__be32  skeys[2] = { saddr, 0 };
1368 	int  ikeys[2] = { dev->ifindex, 0 };
1369 	struct netevent_redirect netevent;
1370 	struct net *net;
1371 
1372 	if (!in_dev)
1373 		return;
1374 
1375 	net = dev_net(dev);
1376 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1377 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1378 	    ipv4_is_zeronet(new_gw))
1379 		goto reject_redirect;
1380 
1381 	if (!rt_caching(net))
1382 		goto reject_redirect;
1383 
1384 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1385 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1386 			goto reject_redirect;
1387 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1388 			goto reject_redirect;
1389 	} else {
1390 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1391 			goto reject_redirect;
1392 	}
1393 
1394 	for (i = 0; i < 2; i++) {
1395 		for (k = 0; k < 2; k++) {
1396 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1397 						rt_genid(net));
1398 
1399 			rthp = &rt_hash_table[hash].chain;
1400 
1401 			while ((rth = rcu_dereference(*rthp)) != NULL) {
1402 				struct rtable *rt;
1403 
1404 				if (rth->fl.fl4_dst != daddr ||
1405 				    rth->fl.fl4_src != skeys[i] ||
1406 				    rth->fl.oif != ikeys[k] ||
1407 				    rth->fl.iif != 0 ||
1408 				    rt_is_expired(rth) ||
1409 				    !net_eq(dev_net(rth->dst.dev), net)) {
1410 					rthp = &rth->dst.rt_next;
1411 					continue;
1412 				}
1413 
1414 				if (rth->rt_dst != daddr ||
1415 				    rth->rt_src != saddr ||
1416 				    rth->dst.error ||
1417 				    rth->rt_gateway != old_gw ||
1418 				    rth->dst.dev != dev)
1419 					break;
1420 
1421 				dst_hold(&rth->dst);
1422 
1423 				rt = dst_alloc(&ipv4_dst_ops);
1424 				if (rt == NULL) {
1425 					ip_rt_put(rth);
1426 					return;
1427 				}
1428 
1429 				/* Copy all the information. */
1430 				*rt = *rth;
1431 				rt->dst.__use		= 1;
1432 				atomic_set(&rt->dst.__refcnt, 1);
1433 				rt->dst.child		= NULL;
1434 				if (rt->dst.dev)
1435 					dev_hold(rt->dst.dev);
1436 				if (rt->idev)
1437 					in_dev_hold(rt->idev);
1438 				rt->dst.obsolete	= -1;
1439 				rt->dst.lastuse	= jiffies;
1440 				rt->dst.path		= &rt->dst;
1441 				rt->dst.neighbour	= NULL;
1442 				rt->dst.hh		= NULL;
1443 #ifdef CONFIG_XFRM
1444 				rt->dst.xfrm		= NULL;
1445 #endif
1446 				rt->rt_genid		= rt_genid(net);
1447 				rt->rt_flags		|= RTCF_REDIRECTED;
1448 
1449 				/* Gateway is different ... */
1450 				rt->rt_gateway		= new_gw;
1451 
1452 				/* Redirect received -> path was valid */
1453 				dst_confirm(&rth->dst);
1454 
1455 				if (rt->peer)
1456 					atomic_inc(&rt->peer->refcnt);
1457 
1458 				if (arp_bind_neighbour(&rt->dst) ||
1459 				    !(rt->dst.neighbour->nud_state &
1460 					    NUD_VALID)) {
1461 					if (rt->dst.neighbour)
1462 						neigh_event_send(rt->dst.neighbour, NULL);
1463 					ip_rt_put(rth);
1464 					rt_drop(rt);
1465 					goto do_next;
1466 				}
1467 
1468 				netevent.old = &rth->dst;
1469 				netevent.new = &rt->dst;
1470 				call_netevent_notifiers(NETEVENT_REDIRECT,
1471 							&netevent);
1472 
1473 				rt_del(hash, rth);
1474 				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1475 					ip_rt_put(rt);
1476 				goto do_next;
1477 			}
1478 		do_next:
1479 			;
1480 		}
1481 	}
1482 	return;
1483 
1484 reject_redirect:
1485 #ifdef CONFIG_IP_ROUTE_VERBOSE
1486 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1487 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1488 			"  Advised path = %pI4 -> %pI4\n",
1489 		       &old_gw, dev->name, &new_gw,
1490 		       &saddr, &daddr);
1491 #endif
1492 	;
1493 }
1494 
1495 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1496 {
1497 	struct rtable *rt = (struct rtable *)dst;
1498 	struct dst_entry *ret = dst;
1499 
1500 	if (rt) {
1501 		if (dst->obsolete > 0) {
1502 			ip_rt_put(rt);
1503 			ret = NULL;
1504 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1505 			   (rt->dst.expires &&
1506 			    time_after_eq(jiffies, rt->dst.expires))) {
1507 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1508 						rt->fl.oif,
1509 						rt_genid(dev_net(dst->dev)));
1510 #if RT_CACHE_DEBUG >= 1
1511 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1512 				&rt->rt_dst, rt->fl.fl4_tos);
1513 #endif
1514 			rt_del(hash, rt);
1515 			ret = NULL;
1516 		}
1517 	}
1518 	return ret;
1519 }
1520 
1521 /*
1522  * Algorithm:
1523  *	1. The first ip_rt_redirect_number redirects are sent
1524  *	   with exponential backoff, then we stop sending them at all,
1525  *	   assuming that the host ignores our redirects.
1526  *	2. If we did not see packets requiring redirects
1527  *	   during ip_rt_redirect_silence, we assume that the host
1528  *	   forgot the redirected route and start to send redirects again.
1529  *
1530  * This algorithm is much cheaper and more intelligent than dumb load limiting
1531  * in icmp.c.
1532  *
1533  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1534  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1535  */
1536 
1537 void ip_rt_send_redirect(struct sk_buff *skb)
1538 {
1539 	struct rtable *rt = skb_rtable(skb);
1540 	struct in_device *in_dev;
1541 	int log_martians;
1542 
1543 	rcu_read_lock();
1544 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1545 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1546 		rcu_read_unlock();
1547 		return;
1548 	}
1549 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1550 	rcu_read_unlock();
1551 
1552 	/* No redirected packets during ip_rt_redirect_silence;
1553 	 * reset the algorithm.
1554 	 */
1555 	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1556 		rt->dst.rate_tokens = 0;
1557 
1558 	/* Too many ignored redirects; do not send anything.
1559 	 * Just set dst.rate_last to the time of the last seen redirected packet.
1560 	 */
1561 	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1562 		rt->dst.rate_last = jiffies;
1563 		return;
1564 	}
1565 
1566 	/* Check for load limit; set rate_last to the latest sent
1567 	 * redirect.
1568 	 */
1569 	if (rt->dst.rate_tokens == 0 ||
1570 	    time_after(jiffies,
1571 		       (rt->dst.rate_last +
1572 			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1573 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1574 		rt->dst.rate_last = jiffies;
1575 		++rt->dst.rate_tokens;
1576 #ifdef CONFIG_IP_ROUTE_VERBOSE
1577 		if (log_martians &&
1578 		    rt->dst.rate_tokens == ip_rt_redirect_number &&
1579 		    net_ratelimit())
1580 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1581 				&rt->rt_src, rt->rt_iif,
1582 				&rt->rt_dst, &rt->rt_gateway);
1583 #endif
1584 	}
1585 }
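/*
 * Backoff arithmetic for the above, assuming HZ = 1000 and the defaults
 * near the top of this file: ip_rt_redirect_load = HZ/50 = 20ms, so once
 * rate_tokens reaches k >= 1 the next redirect is held back for
 * 20ms << k (40ms, 80ms, ... 5.12s before the ninth and final one).
 * When rate_tokens hits ip_rt_redirect_number (9) we go silent until
 * ip_rt_redirect_silence = 20ms << 10 (about 20.5s) of quiet time
 * resets rate_tokens to zero.
 */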
1586 
1587 static int ip_error(struct sk_buff *skb)
1588 {
1589 	struct rtable *rt = skb_rtable(skb);
1590 	unsigned long now;
1591 	int code;
1592 
1593 	switch (rt->dst.error) {
1594 		case EINVAL:
1595 		default:
1596 			goto out;
1597 		case EHOSTUNREACH:
1598 			code = ICMP_HOST_UNREACH;
1599 			break;
1600 		case ENETUNREACH:
1601 			code = ICMP_NET_UNREACH;
1602 			IP_INC_STATS_BH(dev_net(rt->dst.dev),
1603 					IPSTATS_MIB_INNOROUTES);
1604 			break;
1605 		case EACCES:
1606 			code = ICMP_PKT_FILTERED;
1607 			break;
1608 	}
1609 
1610 	now = jiffies;
1611 	rt->dst.rate_tokens += now - rt->dst.rate_last;
1612 	if (rt->dst.rate_tokens > ip_rt_error_burst)
1613 		rt->dst.rate_tokens = ip_rt_error_burst;
1614 	rt->dst.rate_last = now;
1615 	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1616 		rt->dst.rate_tokens -= ip_rt_error_cost;
1617 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1618 	}
1619 
1620 out:	kfree_skb(skb);
1621 	return 0;
1622 }
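/*
 * The rate limiting above is a small token bucket; with HZ = 1000 (an
 * illustrative assumption) tokens accrue at one per jiffy of elapsed
 * time, are capped at ip_rt_error_burst = 5*HZ, and each ICMP error
 * costs ip_rt_error_cost = HZ tokens -- so after an initial burst of
 * five, at most one ICMP_DEST_UNREACH per second is sent for this
 * cached route.
 */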
1623 
1624 /*
1625  *	The last two values are not from the RFC but
1626  *	are needed for AMPRnet AX.25 paths.
1627  */
1628 
1629 static const unsigned short mtu_plateau[] =
1630 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1631 
1632 static inline unsigned short guess_mtu(unsigned short old_mtu)
1633 {
1634 	int i;
1635 
1636 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1637 		if (old_mtu > mtu_plateau[i])
1638 			return mtu_plateau[i];
1639 	return 68;
1640 }
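/*
 * For example, guess_mtu(1500) returns 1492 (the first plateau strictly
 * below the old MTU), guess_mtu(576) returns 296, and anything at or
 * below the smallest plateau (128) falls back to the 68-byte minimum.
 */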
1641 
1642 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1643 				 unsigned short new_mtu,
1644 				 struct net_device *dev)
1645 {
1646 	int i, k;
1647 	unsigned short old_mtu = ntohs(iph->tot_len);
1648 	struct rtable *rth;
1649 	int  ikeys[2] = { dev->ifindex, 0 };
1650 	__be32  skeys[2] = { iph->saddr, 0, };
1651 	__be32  daddr = iph->daddr;
1652 	unsigned short est_mtu = 0;
1653 
1654 	for (k = 0; k < 2; k++) {
1655 		for (i = 0; i < 2; i++) {
1656 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1657 						rt_genid(net));
1658 
1659 			rcu_read_lock();
1660 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1661 			     rth = rcu_dereference(rth->dst.rt_next)) {
1662 				unsigned short mtu = new_mtu;
1663 
1664 				if (rth->fl.fl4_dst != daddr ||
1665 				    rth->fl.fl4_src != skeys[i] ||
1666 				    rth->rt_dst != daddr ||
1667 				    rth->rt_src != iph->saddr ||
1668 				    rth->fl.oif != ikeys[k] ||
1669 				    rth->fl.iif != 0 ||
1670 				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
1671 				    !net_eq(dev_net(rth->dst.dev), net) ||
1672 				    rt_is_expired(rth))
1673 					continue;
1674 
1675 				if (new_mtu < 68 || new_mtu >= old_mtu) {
1676 
1677 					/* BSD 4.2 compatibility hack :-( */
1678 					if (mtu == 0 &&
1679 					    old_mtu >= dst_mtu(&rth->dst) &&
1680 					    old_mtu >= 68 + (iph->ihl << 2))
1681 						old_mtu -= iph->ihl << 2;
1682 
1683 					mtu = guess_mtu(old_mtu);
1684 				}
1685 				if (mtu <= dst_mtu(&rth->dst)) {
1686 					if (mtu < dst_mtu(&rth->dst)) {
1687 						dst_confirm(&rth->dst);
1688 						if (mtu < ip_rt_min_pmtu) {
1689 							mtu = ip_rt_min_pmtu;
1690 							rth->dst.metrics[RTAX_LOCK-1] |=
1691 								(1 << RTAX_MTU);
1692 						}
1693 						rth->dst.metrics[RTAX_MTU-1] = mtu;
1694 						dst_set_expires(&rth->dst,
1695 							ip_rt_mtu_expires);
1696 					}
1697 					est_mtu = mtu;
1698 				}
1699 			}
1700 			rcu_read_unlock();
1701 		}
1702 	}
1703 	return est_mtu ? : new_mtu;
1704 }
1705 
1706 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1707 {
1708 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
1709 	    !(dst_metric_locked(dst, RTAX_MTU))) {
1710 		if (mtu < ip_rt_min_pmtu) {
1711 			mtu = ip_rt_min_pmtu;
1712 			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1713 		}
1714 		dst->metrics[RTAX_MTU-1] = mtu;
1715 		dst_set_expires(dst, ip_rt_mtu_expires);
1716 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1717 	}
1718 }
1719 
1720 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1721 {
1722 	if (rt_is_expired((struct rtable *)dst))
1723 		return NULL;
1724 	return dst;
1725 }
1726 
1727 static void ipv4_dst_destroy(struct dst_entry *dst)
1728 {
1729 	struct rtable *rt = (struct rtable *) dst;
1730 	struct inet_peer *peer = rt->peer;
1731 	struct in_device *idev = rt->idev;
1732 
1733 	if (peer) {
1734 		rt->peer = NULL;
1735 		inet_putpeer(peer);
1736 	}
1737 
1738 	if (idev) {
1739 		rt->idev = NULL;
1740 		in_dev_put(idev);
1741 	}
1742 }
1743 
1744 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1745 			    int how)
1746 {
1747 	struct rtable *rt = (struct rtable *) dst;
1748 	struct in_device *idev = rt->idev;
1749 	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1750 		struct in_device *loopback_idev =
1751 			in_dev_get(dev_net(dev)->loopback_dev);
1752 		if (loopback_idev) {
1753 			rt->idev = loopback_idev;
1754 			in_dev_put(idev);
1755 		}
1756 	}
1757 }
1758 
1759 static void ipv4_link_failure(struct sk_buff *skb)
1760 {
1761 	struct rtable *rt;
1762 
1763 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1764 
1765 	rt = skb_rtable(skb);
1766 	if (rt)
1767 		dst_set_expires(&rt->dst, 0);
1768 }
1769 
1770 static int ip_rt_bug(struct sk_buff *skb)
1771 {
1772 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1773 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1774 		skb->dev ? skb->dev->name : "?");
1775 	kfree_skb(skb);
1776 	return 0;
1777 }
1778 
1779 /*
1780    We do not cache the source address of the outgoing interface,
1781    because it is used only by the IP RR, TS and SRR options,
1782    so it is out of the fast path.
1783 
1784    BTW remember: "addr" is allowed to be unaligned
1785    in IP options!
1786  */
1787 
1788 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1789 {
1790 	__be32 src;
1791 	struct fib_result res;
1792 
1793 	if (rt->fl.iif == 0)
1794 		src = rt->rt_src;
1795 	else {
1796 		rcu_read_lock();
1797 		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1798 			src = FIB_RES_PREFSRC(res);
1799 		else
1800 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1801 					RT_SCOPE_UNIVERSE);
1802 		rcu_read_unlock();
1803 	}
1804 	memcpy(addr, &src, 4);
1805 }
1806 
1807 #ifdef CONFIG_NET_CLS_ROUTE
1808 static void set_class_tag(struct rtable *rt, u32 tag)
1809 {
1810 	if (!(rt->dst.tclassid & 0xFFFF))
1811 		rt->dst.tclassid |= tag & 0xFFFF;
1812 	if (!(rt->dst.tclassid & 0xFFFF0000))
1813 		rt->dst.tclassid |= tag & 0xFFFF0000;
1814 }
1815 #endif
1816 
1817 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1818 {
1819 	struct fib_info *fi = res->fi;
1820 
1821 	if (fi) {
1822 		if (FIB_RES_GW(*res) &&
1823 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1824 			rt->rt_gateway = FIB_RES_GW(*res);
1825 		memcpy(rt->dst.metrics, fi->fib_metrics,
1826 		       sizeof(rt->dst.metrics));
1827 		if (fi->fib_mtu == 0) {
1828 			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1829 			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1830 			    rt->rt_gateway != rt->rt_dst &&
1831 			    rt->dst.dev->mtu > 576)
1832 				rt->dst.metrics[RTAX_MTU-1] = 576;
1833 		}
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835 		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1836 #endif
1837 	} else
1838 		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1839 
1840 	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1841 		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1842 	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1843 		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1844 	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1845 		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1846 				       ip_rt_min_advmss);
1847 	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1848 		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1849 
1850 #ifdef CONFIG_NET_CLS_ROUTE
1851 #ifdef CONFIG_IP_MULTIPLE_TABLES
1852 	set_class_tag(rt, fib_rules_tclass(res));
1853 #endif
1854 	set_class_tag(rt, itag);
1855 #endif
1856 	rt->rt_type = res->type;
1857 }
1858 
1859 /* called in rcu_read_lock() section */
1860 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1861 				u8 tos, struct net_device *dev, int our)
1862 {
1863 	unsigned int hash;
1864 	struct rtable *rth;
1865 	__be32 spec_dst;
1866 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1867 	u32 itag = 0;
1868 	int err;
1869 
1870 	/* Primary sanity checks. */
1871 
1872 	if (in_dev == NULL)
1873 		return -EINVAL;
1874 
1875 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1876 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1877 		goto e_inval;
1878 
1879 	if (ipv4_is_zeronet(saddr)) {
1880 		if (!ipv4_is_local_multicast(daddr))
1881 			goto e_inval;
1882 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1883 	} else {
1884 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1885 					  &itag, 0);
1886 		if (err < 0)
1887 			goto e_err;
1888 	}
1889 	rth = dst_alloc(&ipv4_dst_ops);
1890 	if (!rth)
1891 		goto e_nobufs;
1892 
1893 	rth->dst.output = ip_rt_bug;
1894 	rth->dst.obsolete = -1;
1895 
1896 	atomic_set(&rth->dst.__refcnt, 1);
1897 	rth->dst.flags= DST_HOST;
1898 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1899 		rth->dst.flags |= DST_NOPOLICY;
1900 	rth->fl.fl4_dst	= daddr;
1901 	rth->rt_dst	= daddr;
1902 	rth->fl.fl4_tos	= tos;
1903 	rth->fl.mark    = skb->mark;
1904 	rth->fl.fl4_src	= saddr;
1905 	rth->rt_src	= saddr;
1906 #ifdef CONFIG_NET_CLS_ROUTE
1907 	rth->dst.tclassid = itag;
1908 #endif
1909 	rth->rt_iif	=
1910 	rth->fl.iif	= dev->ifindex;
1911 	rth->dst.dev	= init_net.loopback_dev;
1912 	dev_hold(rth->dst.dev);
1913 	rth->idev	= in_dev_get(rth->dst.dev);
1914 	rth->fl.oif	= 0;
1915 	rth->rt_gateway	= daddr;
1916 	rth->rt_spec_dst= spec_dst;
1917 	rth->rt_genid	= rt_genid(dev_net(dev));
1918 	rth->rt_flags	= RTCF_MULTICAST;
1919 	rth->rt_type	= RTN_MULTICAST;
1920 	if (our) {
1921 		rth->dst.input= ip_local_deliver;
1922 		rth->rt_flags |= RTCF_LOCAL;
1923 	}
1924 
1925 #ifdef CONFIG_IP_MROUTE
1926 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1927 		rth->dst.input = ip_mr_input;
1928 #endif
1929 	RT_CACHE_STAT_INC(in_slow_mc);
1930 
1931 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1932 	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1933 
1934 e_nobufs:
1935 	return -ENOBUFS;
1936 e_inval:
1937 	return -EINVAL;
1938 e_err:
1939 	return err;
1940 }
1941 
1942 
1943 static void ip_handle_martian_source(struct net_device *dev,
1944 				     struct in_device *in_dev,
1945 				     struct sk_buff *skb,
1946 				     __be32 daddr,
1947 				     __be32 saddr)
1948 {
1949 	RT_CACHE_STAT_INC(in_martian_src);
1950 #ifdef CONFIG_IP_ROUTE_VERBOSE
1951 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1952 		/*
1953 		 *	RFC 1812 recommendation: if the source is martian,
1954 		 *	the only hint we have is the MAC header.
1955 		 */
1956 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1957 			&daddr, &saddr, dev->name);
1958 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1959 			int i;
1960 			const unsigned char *p = skb_mac_header(skb);
1961 			printk(KERN_WARNING "ll header: ");
1962 			for (i = 0; i < dev->hard_header_len; i++, p++) {
1963 				printk("%02x", *p);
1964 				if (i < (dev->hard_header_len - 1))
1965 					printk(":");
1966 			}
1967 			printk("\n");
1968 		}
1969 	}
1970 #endif
1971 }
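/*
 * For reference, the rate-limited log produced above looks like this
 * (addresses and device name are made up):
 *
 *	martian source 10.1.2.3 from 192.0.2.7, on dev eth0
 *	ll header: 00:11:22:33:44:55:66:77:88:99:aa:bb:08:00
 *
 * Note that, following the format string, the destination address is
 * printed first and the source address second.
 */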
1972 
1973 /* called in rcu_read_lock() section */
1974 static int __mkroute_input(struct sk_buff *skb,
1975 			   struct fib_result *res,
1976 			   struct in_device *in_dev,
1977 			   __be32 daddr, __be32 saddr, u32 tos,
1978 			   struct rtable **result)
1979 {
1980 	struct rtable *rth;
1981 	int err;
1982 	struct in_device *out_dev;
1983 	unsigned int flags = 0;
1984 	__be32 spec_dst;
1985 	u32 itag;
1986 
1987 	/* get a working reference to the output device */
1988 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1989 	if (out_dev == NULL) {
1990 		if (net_ratelimit())
1991 			printk(KERN_CRIT "Bug in ip_route_input" \
1992 			       "_slow(). Please, report\n");
1993 		return -EINVAL;
1994 	}
1995 
1996 
1997 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1998 				  in_dev->dev, &spec_dst, &itag, skb->mark);
1999 	if (err < 0) {
2000 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2001 					 saddr);
2002 
2003 		goto cleanup;
2004 	}
2005 
2006 	if (err)
2007 		flags |= RTCF_DIRECTSRC;
2008 
2009 	if (out_dev == in_dev && err &&
2010 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2011 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2012 		flags |= RTCF_DOREDIRECT;
2013 
2014 	if (skb->protocol != htons(ETH_P_IP)) {
2015 		/* Not IP (i.e. ARP). Do not create a route if it is
2016 		 * invalid for proxy ARP. DNAT routes are always valid.
2017 		 *
2018 		 * The proxy ARP feature has been extended to allow ARP
2019 		 * replies back out the same interface, to support
2020 		 * private VLAN switch technologies. See arp.c.
2021 		 */
2022 		if (out_dev == in_dev &&
2023 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2024 			err = -EINVAL;
2025 			goto cleanup;
2026 		}
2027 	}
2028 
2029 
2030 	rth = dst_alloc(&ipv4_dst_ops);
2031 	if (!rth) {
2032 		err = -ENOBUFS;
2033 		goto cleanup;
2034 	}
2035 
2036 	atomic_set(&rth->dst.__refcnt, 1);
2037 	rth->dst.flags= DST_HOST;
2038 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2039 		rth->dst.flags |= DST_NOPOLICY;
2040 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2041 		rth->dst.flags |= DST_NOXFRM;
2042 	rth->fl.fl4_dst	= daddr;
2043 	rth->rt_dst	= daddr;
2044 	rth->fl.fl4_tos	= tos;
2045 	rth->fl.mark    = skb->mark;
2046 	rth->fl.fl4_src	= saddr;
2047 	rth->rt_src	= saddr;
2048 	rth->rt_gateway	= daddr;
2049 	rth->rt_iif 	=
2050 		rth->fl.iif	= in_dev->dev->ifindex;
2051 	rth->dst.dev	= (out_dev)->dev;
2052 	dev_hold(rth->dst.dev);
2053 	rth->idev	= in_dev_get(rth->dst.dev);
2054 	rth->fl.oif 	= 0;
2055 	rth->rt_spec_dst= spec_dst;
2056 
2057 	rth->dst.obsolete = -1;
2058 	rth->dst.input = ip_forward;
2059 	rth->dst.output = ip_output;
2060 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2061 
2062 	rt_set_nexthop(rth, res, itag);
2063 
2064 	rth->rt_flags = flags;
2065 
2066 	*result = rth;
2067 	err = 0;
2068  cleanup:
2069 	return err;
2070 }
2071 
2072 static int ip_mkroute_input(struct sk_buff *skb,
2073 			    struct fib_result *res,
2074 			    const struct flowi *fl,
2075 			    struct in_device *in_dev,
2076 			    __be32 daddr, __be32 saddr, u32 tos)
2077 {
2078 	struct rtable* rth = NULL;
2079 	int err;
2080 	unsigned hash;
2081 
2082 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2083 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2084 		fib_select_multipath(fl, res);
2085 #endif
2086 
2087 	/* create a routing cache entry */
2088 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2089 	if (err)
2090 		return err;
2091 
2092 	/* put it into the cache */
2093 	hash = rt_hash(daddr, saddr, fl->iif,
2094 		       rt_genid(dev_net(rth->dst.dev)));
2095 	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2096 }
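/*
 * Note on the cache key used above: input (forwarded) entries are hashed
 * over (daddr, saddr, iif, genid), while the output path further below
 * hashes over (daddr, saddr, oif, genid).  The per-namespace genid lets
 * rt_cache_flush() invalidate every cached entry at once simply by
 * bumping it, without walking the hash table.
 */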
2097 
2098 /*
2099  *	NOTE. We drop all packets that have a local source
2100  *	address, because every properly looped-back packet must
2101  *	already have the correct destination attached by the output routine.
2102  *
2103  *	This approach solves two big problems:
2104  *	1. Non-simplex devices are handled properly.
2105  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2106  *	Called with rcu_read_lock().
2107  */
2108 
2109 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110 			       u8 tos, struct net_device *dev)
2111 {
2112 	struct fib_result res;
2113 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2114 	struct flowi fl = { .nl_u = { .ip4_u =
2115 				      { .daddr = daddr,
2116 					.saddr = saddr,
2117 					.tos = tos,
2118 					.scope = RT_SCOPE_UNIVERSE,
2119 				      } },
2120 			    .mark = skb->mark,
2121 			    .iif = dev->ifindex };
2122 	unsigned	flags = 0;
2123 	u32		itag = 0;
2124 	struct rtable * rth;
2125 	unsigned	hash;
2126 	__be32		spec_dst;
2127 	int		err = -EINVAL;
2128 	struct net    * net = dev_net(dev);
2129 
2130 	/* IP on this device is disabled. */
2131 
2132 	if (!in_dev)
2133 		goto out;
2134 
2135 	/* Check for the weirdest martians, which cannot be detected
2136 	   by fib_lookup.
2137 	 */
2138 
2139 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2140 	    ipv4_is_loopback(saddr))
2141 		goto martian_source;
2142 
2143 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2144 		goto brd_input;
2145 
2146 	/* Accept zero addresses only for limited broadcast;
2147 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2148 	 */
2149 	if (ipv4_is_zeronet(saddr))
2150 		goto martian_source;
2151 
2152 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2153 		goto martian_destination;
2154 
2155 	/*
2156 	 *	Now we are ready to route packet.
2157 	 */
2158 	err = fib_lookup(net, &fl, &res);
2159 	if (err != 0) {
2160 		if (!IN_DEV_FORWARD(in_dev))
2161 			goto e_hostunreach;
2162 		goto no_route;
2163 	}
2164 
2165 	RT_CACHE_STAT_INC(in_slow_tot);
2166 
2167 	if (res.type == RTN_BROADCAST)
2168 		goto brd_input;
2169 
2170 	if (res.type == RTN_LOCAL) {
2171 		err = fib_validate_source(saddr, daddr, tos,
2172 					  net->loopback_dev->ifindex,
2173 					  dev, &spec_dst, &itag, skb->mark);
2174 		if (err < 0)
2175 			goto martian_source_keep_err;
2176 		if (err)
2177 			flags |= RTCF_DIRECTSRC;
2178 		spec_dst = daddr;
2179 		goto local_input;
2180 	}
2181 
2182 	if (!IN_DEV_FORWARD(in_dev))
2183 		goto e_hostunreach;
2184 	if (res.type != RTN_UNICAST)
2185 		goto martian_destination;
2186 
2187 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2188 out:	return err;
2189 
2190 brd_input:
2191 	if (skb->protocol != htons(ETH_P_IP))
2192 		goto e_inval;
2193 
2194 	if (ipv4_is_zeronet(saddr))
2195 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2196 	else {
2197 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2198 					  &itag, skb->mark);
2199 		if (err < 0)
2200 			goto martian_source_keep_err;
2201 		if (err)
2202 			flags |= RTCF_DIRECTSRC;
2203 	}
2204 	flags |= RTCF_BROADCAST;
2205 	res.type = RTN_BROADCAST;
2206 	RT_CACHE_STAT_INC(in_brd);
2207 
2208 local_input:
2209 	rth = dst_alloc(&ipv4_dst_ops);
2210 	if (!rth)
2211 		goto e_nobufs;
2212 
2213 	rth->dst.output= ip_rt_bug;
2214 	rth->dst.obsolete = -1;
2215 	rth->rt_genid = rt_genid(net);
2216 
2217 	atomic_set(&rth->dst.__refcnt, 1);
2218 	rth->dst.flags= DST_HOST;
2219 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2220 		rth->dst.flags |= DST_NOPOLICY;
2221 	rth->fl.fl4_dst	= daddr;
2222 	rth->rt_dst	= daddr;
2223 	rth->fl.fl4_tos	= tos;
2224 	rth->fl.mark    = skb->mark;
2225 	rth->fl.fl4_src	= saddr;
2226 	rth->rt_src	= saddr;
2227 #ifdef CONFIG_NET_CLS_ROUTE
2228 	rth->dst.tclassid = itag;
2229 #endif
2230 	rth->rt_iif	=
2231 	rth->fl.iif	= dev->ifindex;
2232 	rth->dst.dev	= net->loopback_dev;
2233 	dev_hold(rth->dst.dev);
2234 	rth->idev	= in_dev_get(rth->dst.dev);
2235 	rth->rt_gateway	= daddr;
2236 	rth->rt_spec_dst= spec_dst;
2237 	rth->dst.input= ip_local_deliver;
2238 	rth->rt_flags 	= flags|RTCF_LOCAL;
2239 	if (res.type == RTN_UNREACHABLE) {
2240 		rth->dst.input= ip_error;
2241 		rth->dst.error= -err;
2242 		rth->rt_flags 	&= ~RTCF_LOCAL;
2243 	}
2244 	rth->rt_type	= res.type;
2245 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2246 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2247 	goto out;
2248 
2249 no_route:
2250 	RT_CACHE_STAT_INC(in_no_route);
2251 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2252 	res.type = RTN_UNREACHABLE;
2253 	if (err == -ESRCH)
2254 		err = -ENETUNREACH;
2255 	goto local_input;
2256 
2257 	/*
2258 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2259 	 */
2260 martian_destination:
2261 	RT_CACHE_STAT_INC(in_martian_dst);
2262 #ifdef CONFIG_IP_ROUTE_VERBOSE
2263 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2264 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2265 			&daddr, &saddr, dev->name);
2266 #endif
2267 
2268 e_hostunreach:
2269 	err = -EHOSTUNREACH;
2270 	goto out;
2271 
2272 e_inval:
2273 	err = -EINVAL;
2274 	goto out;
2275 
2276 e_nobufs:
2277 	err = -ENOBUFS;
2278 	goto out;
2279 
2280 martian_source:
2281 	err = -EINVAL;
2282 martian_source_keep_err:
2283 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2284 	goto out;
2285 }
2286 
2287 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288 			   u8 tos, struct net_device *dev, bool noref)
2289 {
2290 	struct rtable * rth;
2291 	unsigned	hash;
2292 	int iif = dev->ifindex;
2293 	struct net *net;
2294 	int res;
2295 
2296 	net = dev_net(dev);
2297 
2298 	rcu_read_lock();
2299 
2300 	if (!rt_caching(net))
2301 		goto skip_cache;
2302 
2303 	tos &= IPTOS_RT_MASK;
2304 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2305 
2306 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2307 	     rth = rcu_dereference(rth->dst.rt_next)) {
2308 		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2309 		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2310 		     (rth->fl.iif ^ iif) |
2311 		     rth->fl.oif |
2312 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
2313 		    rth->fl.mark == skb->mark &&
2314 		    net_eq(dev_net(rth->dst.dev), net) &&
2315 		    !rt_is_expired(rth)) {
2316 			if (noref) {
2317 				dst_use_noref(&rth->dst, jiffies);
2318 				skb_dst_set_noref(skb, &rth->dst);
2319 			} else {
2320 				dst_use(&rth->dst, jiffies);
2321 				skb_dst_set(skb, &rth->dst);
2322 			}
2323 			RT_CACHE_STAT_INC(in_hit);
2324 			rcu_read_unlock();
2325 			return 0;
2326 		}
2327 		RT_CACHE_STAT_INC(in_hlist_search);
2328 	}
2329 
2330 skip_cache:
2331 	/* Multicast recognition logic has been moved from the route cache to here.
2332 	   The problem was that too many Ethernet cards have broken/missing
2333 	   hardware multicast filters :-( As a result, a host on a multicast
2334 	   network acquires a lot of useless route cache entries, e.g. from
2335 	   SDR messages from all over the world. Now we try to get rid of them.
2336 	   Really, provided the software IP multicast filter is organized
2337 	   reasonably (at least, hashed), this does not result in a slowdown
2338 	   compared with route cache reject entries.
2339 	   Note that multicast routers are not affected, because a
2340 	   route cache entry is created for them eventually.
2341 	 */
2342 	if (ipv4_is_multicast(daddr)) {
2343 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2344 
2345 		if (in_dev) {
2346 			int our = ip_check_mc(in_dev, daddr, saddr,
2347 					      ip_hdr(skb)->protocol);
2348 			if (our
2349 #ifdef CONFIG_IP_MROUTE
2350 				||
2351 			    (!ipv4_is_local_multicast(daddr) &&
2352 			     IN_DEV_MFORWARD(in_dev))
2353 #endif
2354 			   ) {
2355 				int res = ip_route_input_mc(skb, daddr, saddr,
2356 							    tos, dev, our);
2357 				rcu_read_unlock();
2358 				return res;
2359 			}
2360 		}
2361 		rcu_read_unlock();
2362 		return -EINVAL;
2363 	}
2364 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2365 	rcu_read_unlock();
2366 	return res;
2367 }
2368 EXPORT_SYMBOL(ip_route_input_common);
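/*
 * Usage sketch (an illustration, not code from this file): the receive-path
 * wrappers in <net/route.h> resolve an input route roughly like
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	on success, skb_dst(skb) now points at the cached rtable
 *
 * with ip_route_input() and ip_route_input_noref() differing only in the
 * "noref" argument they pass through to this function.
 */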
2369 
2370 /* called with rcu_read_lock() */
2371 static int __mkroute_output(struct rtable **result,
2372 			    struct fib_result *res,
2373 			    const struct flowi *fl,
2374 			    const struct flowi *oldflp,
2375 			    struct net_device *dev_out,
2376 			    unsigned flags)
2377 {
2378 	struct rtable *rth;
2379 	struct in_device *in_dev;
2380 	u32 tos = RT_FL_TOS(oldflp);
2381 
2382 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2383 		return -EINVAL;
2384 
2385 	if (ipv4_is_lbcast(fl->fl4_dst))
2386 		res->type = RTN_BROADCAST;
2387 	else if (ipv4_is_multicast(fl->fl4_dst))
2388 		res->type = RTN_MULTICAST;
2389 	else if (ipv4_is_zeronet(fl->fl4_dst))
2390 		return -EINVAL;
2391 
2392 	if (dev_out->flags & IFF_LOOPBACK)
2393 		flags |= RTCF_LOCAL;
2394 
2395 	in_dev = __in_dev_get_rcu(dev_out);
2396 	if (!in_dev)
2397 		return -EINVAL;
2398 
2399 	if (res->type == RTN_BROADCAST) {
2400 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2401 		res->fi = NULL;
2402 	} else if (res->type == RTN_MULTICAST) {
2403 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2404 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2405 				 oldflp->proto))
2406 			flags &= ~RTCF_LOCAL;
2407 		/* If a multicast route does not exist, use the
2408 		 * default one, but do not gateway in this case.
2409 		 * Yes, it is a hack.
2410 		 */
2411 		if (res->fi && res->prefixlen < 4)
2412 			res->fi = NULL;
2413 	}
2414 
2415 
2416 	rth = dst_alloc(&ipv4_dst_ops);
2417 	if (!rth)
2418 		return -ENOBUFS;
2419 
2420 	in_dev_hold(in_dev);
2421 	rth->idev = in_dev;
2422 
2423 	atomic_set(&rth->dst.__refcnt, 1);
2424 	rth->dst.flags= DST_HOST;
2425 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2426 		rth->dst.flags |= DST_NOXFRM;
2427 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2428 		rth->dst.flags |= DST_NOPOLICY;
2429 
2430 	rth->fl.fl4_dst	= oldflp->fl4_dst;
2431 	rth->fl.fl4_tos	= tos;
2432 	rth->fl.fl4_src	= oldflp->fl4_src;
2433 	rth->fl.oif	= oldflp->oif;
2434 	rth->fl.mark    = oldflp->mark;
2435 	rth->rt_dst	= fl->fl4_dst;
2436 	rth->rt_src	= fl->fl4_src;
2437 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
2438 	/* get references to the devices that are to be held by the routing
2439 	   cache entry */
2440 	rth->dst.dev	= dev_out;
2441 	dev_hold(dev_out);
2442 	rth->rt_gateway = fl->fl4_dst;
2443 	rth->rt_spec_dst= fl->fl4_src;
2444 
2445 	rth->dst.output=ip_output;
2446 	rth->dst.obsolete = -1;
2447 	rth->rt_genid = rt_genid(dev_net(dev_out));
2448 
2449 	RT_CACHE_STAT_INC(out_slow_tot);
2450 
2451 	if (flags & RTCF_LOCAL) {
2452 		rth->dst.input = ip_local_deliver;
2453 		rth->rt_spec_dst = fl->fl4_dst;
2454 	}
2455 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2456 		rth->rt_spec_dst = fl->fl4_src;
2457 		if (flags & RTCF_LOCAL &&
2458 		    !(dev_out->flags & IFF_LOOPBACK)) {
2459 			rth->dst.output = ip_mc_output;
2460 			RT_CACHE_STAT_INC(out_slow_mc);
2461 		}
2462 #ifdef CONFIG_IP_MROUTE
2463 		if (res->type == RTN_MULTICAST) {
2464 			if (IN_DEV_MFORWARD(in_dev) &&
2465 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2466 				rth->dst.input = ip_mr_input;
2467 				rth->dst.output = ip_mc_output;
2468 			}
2469 		}
2470 #endif
2471 	}
2472 
2473 	rt_set_nexthop(rth, res, 0);
2474 
2475 	rth->rt_flags = flags;
2476 	*result = rth;
2477 	return 0;
2478 }
2479 
2480 /* called with rcu_read_lock() */
2481 static int ip_mkroute_output(struct rtable **rp,
2482 			     struct fib_result *res,
2483 			     const struct flowi *fl,
2484 			     const struct flowi *oldflp,
2485 			     struct net_device *dev_out,
2486 			     unsigned flags)
2487 {
2488 	struct rtable *rth = NULL;
2489 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2490 	unsigned hash;
2491 	if (err == 0) {
2492 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2493 			       rt_genid(dev_net(dev_out)));
2494 		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2495 	}
2496 
2497 	return err;
2498 }
2499 
2500 /*
2501  * Major route resolver routine.
2502  * called with rcu_read_lock();
2503  */
2504 
2505 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2506 				const struct flowi *oldflp)
2507 {
2508 	u32 tos	= RT_FL_TOS(oldflp);
2509 	struct flowi fl = { .nl_u = { .ip4_u =
2510 				      { .daddr = oldflp->fl4_dst,
2511 					.saddr = oldflp->fl4_src,
2512 					.tos = tos & IPTOS_RT_MASK,
2513 					.scope = ((tos & RTO_ONLINK) ?
2514 						  RT_SCOPE_LINK :
2515 						  RT_SCOPE_UNIVERSE),
2516 				      } },
2517 			    .mark = oldflp->mark,
2518 			    .iif = net->loopback_dev->ifindex,
2519 			    .oif = oldflp->oif };
2520 	struct fib_result res;
2521 	unsigned int flags = 0;
2522 	struct net_device *dev_out = NULL;
2523 	int err;
2524 
2525 
2526 	res.fi		= NULL;
2527 #ifdef CONFIG_IP_MULTIPLE_TABLES
2528 	res.r		= NULL;
2529 #endif
2530 
2531 	if (oldflp->fl4_src) {
2532 		err = -EINVAL;
2533 		if (ipv4_is_multicast(oldflp->fl4_src) ||
2534 		    ipv4_is_lbcast(oldflp->fl4_src) ||
2535 		    ipv4_is_zeronet(oldflp->fl4_src))
2536 			goto out;
2537 
2538 		/* I removed the check for oif == dev_out->oif here.
2539 		   It was wrong for two reasons:
2540 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2541 		      is assigned to multiple interfaces.
2542 		   2. Moreover, we are allowed to send packets with the saddr
2543 		      of another iface. --ANK
2544 		 */
2545 
2546 		if (oldflp->oif == 0 &&
2547 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
2548 		     ipv4_is_lbcast(oldflp->fl4_dst))) {
2549 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2550 			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2551 			if (dev_out == NULL)
2552 				goto out;
2553 
2554 			/* Special hack: the user can direct multicasts
2555 			   and limited broadcast via the necessary interface
2556 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2557 			   This hack is not just for fun, it allows
2558 			   vic, vat and friends to work.
2559 			   They bind the socket to loopback, set the ttl to zero
2560 			   and expect that it will work.
2561 			   From the viewpoint of the routing cache they are broken,
2562 			   because we are not allowed to build a multicast path
2563 			   with a loopback source addr (the routing cache
2564 			   cannot know that the ttl is zero, so the packet
2565 			   will not leave this host and the route is valid).
2566 			   Luckily, this hack is a good workaround.
2567 			 */
2568 
2569 			fl.oif = dev_out->ifindex;
2570 			goto make_route;
2571 		}
2572 
2573 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2574 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2575 			if (!__ip_dev_find(net, oldflp->fl4_src, false))
2576 				goto out;
2577 		}
2578 	}
2579 
2580 
2581 	if (oldflp->oif) {
2582 		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2583 		err = -ENODEV;
2584 		if (dev_out == NULL)
2585 			goto out;
2586 
2587 		/* RACE: Check return value of inet_select_addr instead. */
2588 		if (rcu_dereference(dev_out->ip_ptr) == NULL)
2589 			goto out;	/* Wrong error code */
2590 
2591 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2592 		    ipv4_is_lbcast(oldflp->fl4_dst)) {
2593 			if (!fl.fl4_src)
2594 				fl.fl4_src = inet_select_addr(dev_out, 0,
2595 							      RT_SCOPE_LINK);
2596 			goto make_route;
2597 		}
2598 		if (!fl.fl4_src) {
2599 			if (ipv4_is_multicast(oldflp->fl4_dst))
2600 				fl.fl4_src = inet_select_addr(dev_out, 0,
2601 							      fl.fl4_scope);
2602 			else if (!oldflp->fl4_dst)
2603 				fl.fl4_src = inet_select_addr(dev_out, 0,
2604 							      RT_SCOPE_HOST);
2605 		}
2606 	}
2607 
2608 	if (!fl.fl4_dst) {
2609 		fl.fl4_dst = fl.fl4_src;
2610 		if (!fl.fl4_dst)
2611 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2612 		dev_out = net->loopback_dev;
2613 		fl.oif = net->loopback_dev->ifindex;
2614 		res.type = RTN_LOCAL;
2615 		flags |= RTCF_LOCAL;
2616 		goto make_route;
2617 	}
2618 
2619 	if (fib_lookup(net, &fl, &res)) {
2620 		res.fi = NULL;
2621 		if (oldflp->oif) {
2622 			/* Apparently, the routing tables are wrong. Assume
2623 			   that the destination is on-link.
2624
2625 			   WHY? DW.
2626 			   Because we are allowed to send to an iface
2627 			   even if it has NO routes and NO assigned
2628 			   addresses. When oif is specified, the routing
2629 			   tables are looked up with only one purpose:
2630 			   to determine whether the destination is gatewayed
2631 			   rather than direct. Moreover, if MSG_DONTROUTE is
2632 			   set, we send the packet, ignoring both routing
2633 			   tables and ifaddr state. --ANK
2634
2635
2636 			   We could do this even when oif is unknown,
2637 			   as IPv6 likely does, but we do not.
2638 			 */
2639 
2640 			if (fl.fl4_src == 0)
2641 				fl.fl4_src = inet_select_addr(dev_out, 0,
2642 							      RT_SCOPE_LINK);
2643 			res.type = RTN_UNICAST;
2644 			goto make_route;
2645 		}
2646 		err = -ENETUNREACH;
2647 		goto out;
2648 	}
2649 
2650 	if (res.type == RTN_LOCAL) {
2651 		if (!fl.fl4_src)
2652 			fl.fl4_src = fl.fl4_dst;
2653 		dev_out = net->loopback_dev;
2654 		fl.oif = dev_out->ifindex;
2655 		res.fi = NULL;
2656 		flags |= RTCF_LOCAL;
2657 		goto make_route;
2658 	}
2659 
2660 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2661 	if (res.fi->fib_nhs > 1 && fl.oif == 0)
2662 		fib_select_multipath(&fl, &res);
2663 	else
2664 #endif
2665 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2666 		fib_select_default(net, &fl, &res);
2667 
2668 	if (!fl.fl4_src)
2669 		fl.fl4_src = FIB_RES_PREFSRC(res);
2670 
2671 	dev_out = FIB_RES_DEV(res);
2672 	fl.oif = dev_out->ifindex;
2673 
2674 
2675 make_route:
2676 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2677 
2678 out:	return err;
2679 }
2680 
2681 int __ip_route_output_key(struct net *net, struct rtable **rp,
2682 			  const struct flowi *flp)
2683 {
2684 	unsigned int hash;
2685 	int res;
2686 	struct rtable *rth;
2687 
2688 	if (!rt_caching(net))
2689 		goto slow_output;
2690 
2691 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2692 
2693 	rcu_read_lock_bh();
2694 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2695 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2696 		if (rth->fl.fl4_dst == flp->fl4_dst &&
2697 		    rth->fl.fl4_src == flp->fl4_src &&
2698 		    rth->fl.iif == 0 &&
2699 		    rth->fl.oif == flp->oif &&
2700 		    rth->fl.mark == flp->mark &&
2701 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2702 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2703 		    net_eq(dev_net(rth->dst.dev), net) &&
2704 		    !rt_is_expired(rth)) {
2705 			dst_use(&rth->dst, jiffies);
2706 			RT_CACHE_STAT_INC(out_hit);
2707 			rcu_read_unlock_bh();
2708 			*rp = rth;
2709 			return 0;
2710 		}
2711 		RT_CACHE_STAT_INC(out_hlist_search);
2712 	}
2713 	rcu_read_unlock_bh();
2714 
2715 slow_output:
2716 	rcu_read_lock();
2717 	res = ip_route_output_slow(net, rp, flp);
2718 	rcu_read_unlock();
2719 	return res;
2720 }
2721 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2722 
2723 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2724 {
2725 	return NULL;
2726 }
2727 
2728 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2729 {
2730 }
2731 
2732 static struct dst_ops ipv4_dst_blackhole_ops = {
2733 	.family			=	AF_INET,
2734 	.protocol		=	cpu_to_be16(ETH_P_IP),
2735 	.destroy		=	ipv4_dst_destroy,
2736 	.check			=	ipv4_blackhole_dst_check,
2737 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2738 };
2739 
2740 
2741 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2742 {
2743 	struct rtable *ort = *rp;
2744 	struct rtable *rt = (struct rtable *)
2745 		dst_alloc(&ipv4_dst_blackhole_ops);
2746 
2747 	if (rt) {
2748 		struct dst_entry *new = &rt->dst;
2749 
2750 		atomic_set(&new->__refcnt, 1);
2751 		new->__use = 1;
2752 		new->input = dst_discard;
2753 		new->output = dst_discard;
2754 		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2755 
2756 		new->dev = ort->dst.dev;
2757 		if (new->dev)
2758 			dev_hold(new->dev);
2759 
2760 		rt->fl = ort->fl;
2761 
2762 		rt->idev = ort->idev;
2763 		if (rt->idev)
2764 			in_dev_hold(rt->idev);
2765 		rt->rt_genid = rt_genid(net);
2766 		rt->rt_flags = ort->rt_flags;
2767 		rt->rt_type = ort->rt_type;
2768 		rt->rt_dst = ort->rt_dst;
2769 		rt->rt_src = ort->rt_src;
2770 		rt->rt_iif = ort->rt_iif;
2771 		rt->rt_gateway = ort->rt_gateway;
2772 		rt->rt_spec_dst = ort->rt_spec_dst;
2773 		rt->peer = ort->peer;
2774 		if (rt->peer)
2775 			atomic_inc(&rt->peer->refcnt);
2776 
2777 		dst_free(new);
2778 	}
2779 
2780 	dst_release(&(*rp)->dst);
2781 	*rp = rt;
2782 	return rt ? 0 : -ENOMEM;
2783 }
2784 
2785 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2786 			 struct sock *sk, int flags)
2787 {
2788 	int err;
2789 
2790 	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2791 		return err;
2792 
2793 	if (flp->proto) {
2794 		if (!flp->fl4_src)
2795 			flp->fl4_src = (*rp)->rt_src;
2796 		if (!flp->fl4_dst)
2797 			flp->fl4_dst = (*rp)->rt_dst;
2798 		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2799 				    flags ? XFRM_LOOKUP_WAIT : 0);
2800 		if (err == -EREMOTE)
2801 			err = ipv4_dst_blackhole(net, rp, flp);
2802 
2803 		return err;
2804 	}
2805 
2806 	return 0;
2807 }
2808 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2809 
2810 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2811 {
2812 	return ip_route_output_flow(net, rp, flp, NULL, 0);
2813 }
2814 EXPORT_SYMBOL(ip_route_output_key);
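/*
 * Usage sketch (illustrative only): a typical output lookup in this era of
 * the API builds a flow key and asks for a cached or freshly created
 * rtable, roughly
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = src,
 *						 .tos = tos } },
 *			    .oif = 0 };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... use rt->rt_gateway, rt->dst.dev, etc. ...
 *		ip_rt_put(rt);
 *	}
 *
 * ip_route_output_flow() additionally runs the result through xfrm when
 * fl.proto is set, which is why most socket code goes through it instead.
 */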
2815 
2816 static int rt_fill_info(struct net *net,
2817 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2818 			int nowait, unsigned int flags)
2819 {
2820 	struct rtable *rt = skb_rtable(skb);
2821 	struct rtmsg *r;
2822 	struct nlmsghdr *nlh;
2823 	long expires;
2824 	u32 id = 0, ts = 0, tsage = 0, error;
2825 
2826 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2827 	if (nlh == NULL)
2828 		return -EMSGSIZE;
2829 
2830 	r = nlmsg_data(nlh);
2831 	r->rtm_family	 = AF_INET;
2832 	r->rtm_dst_len	= 32;
2833 	r->rtm_src_len	= 0;
2834 	r->rtm_tos	= rt->fl.fl4_tos;
2835 	r->rtm_table	= RT_TABLE_MAIN;
2836 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2837 	r->rtm_type	= rt->rt_type;
2838 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2839 	r->rtm_protocol = RTPROT_UNSPEC;
2840 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2841 	if (rt->rt_flags & RTCF_NOTIFY)
2842 		r->rtm_flags |= RTM_F_NOTIFY;
2843 
2844 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2845 
2846 	if (rt->fl.fl4_src) {
2847 		r->rtm_src_len = 32;
2848 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2849 	}
2850 	if (rt->dst.dev)
2851 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2852 #ifdef CONFIG_NET_CLS_ROUTE
2853 	if (rt->dst.tclassid)
2854 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2855 #endif
2856 	if (rt->fl.iif)
2857 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2858 	else if (rt->rt_src != rt->fl.fl4_src)
2859 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2860 
2861 	if (rt->rt_dst != rt->rt_gateway)
2862 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2863 
2864 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2865 		goto nla_put_failure;
2866 
2867 	if (rt->fl.mark)
2868 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2869 
2870 	error = rt->dst.error;
2871 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2872 	if (rt->peer) {
2873 		inet_peer_refcheck(rt->peer);
2874 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2875 		if (rt->peer->tcp_ts_stamp) {
2876 			ts = rt->peer->tcp_ts;
2877 			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2878 		}
2879 	}
2880 
2881 	if (rt->fl.iif) {
2882 #ifdef CONFIG_IP_MROUTE
2883 		__be32 dst = rt->rt_dst;
2884 
2885 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2886 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2887 			int err = ipmr_get_route(net, skb, r, nowait);
2888 			if (err <= 0) {
2889 				if (!nowait) {
2890 					if (err == 0)
2891 						return 0;
2892 					goto nla_put_failure;
2893 				} else {
2894 					if (err == -EMSGSIZE)
2895 						goto nla_put_failure;
2896 					error = err;
2897 				}
2898 			}
2899 		} else
2900 #endif
2901 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2902 	}
2903 
2904 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2905 			       expires, error) < 0)
2906 		goto nla_put_failure;
2907 
2908 	return nlmsg_end(skb, nlh);
2909 
2910 nla_put_failure:
2911 	nlmsg_cancel(skb, nlh);
2912 	return -EMSGSIZE;
2913 }
2914 
2915 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2916 {
2917 	struct net *net = sock_net(in_skb->sk);
2918 	struct rtmsg *rtm;
2919 	struct nlattr *tb[RTA_MAX+1];
2920 	struct rtable *rt = NULL;
2921 	__be32 dst = 0;
2922 	__be32 src = 0;
2923 	u32 iif;
2924 	int err;
2925 	int mark;
2926 	struct sk_buff *skb;
2927 
2928 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2929 	if (err < 0)
2930 		goto errout;
2931 
2932 	rtm = nlmsg_data(nlh);
2933 
2934 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2935 	if (skb == NULL) {
2936 		err = -ENOBUFS;
2937 		goto errout;
2938 	}
2939 
2940 	/* Reserve room for dummy headers; this skb can pass
2941 	   through a good chunk of the routing engine.
2942 	 */
2943 	skb_reset_mac_header(skb);
2944 	skb_reset_network_header(skb);
2945 
2946 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2947 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2948 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2949 
2950 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2951 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2952 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2953 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2954 
2955 	if (iif) {
2956 		struct net_device *dev;
2957 
2958 		dev = __dev_get_by_index(net, iif);
2959 		if (dev == NULL) {
2960 			err = -ENODEV;
2961 			goto errout_free;
2962 		}
2963 
2964 		skb->protocol	= htons(ETH_P_IP);
2965 		skb->dev	= dev;
2966 		skb->mark	= mark;
2967 		local_bh_disable();
2968 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2969 		local_bh_enable();
2970 
2971 		rt = skb_rtable(skb);
2972 		if (err == 0 && rt->dst.error)
2973 			err = -rt->dst.error;
2974 	} else {
2975 		struct flowi fl = {
2976 			.nl_u = {
2977 				.ip4_u = {
2978 					.daddr = dst,
2979 					.saddr = src,
2980 					.tos = rtm->rtm_tos,
2981 				},
2982 			},
2983 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2984 			.mark = mark,
2985 		};
2986 		err = ip_route_output_key(net, &rt, &fl);
2987 	}
2988 
2989 	if (err)
2990 		goto errout_free;
2991 
2992 	skb_dst_set(skb, &rt->dst);
2993 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2994 		rt->rt_flags |= RTCF_NOTIFY;
2995 
2996 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2997 			   RTM_NEWROUTE, 0, 0);
2998 	if (err <= 0)
2999 		goto errout_free;
3000 
3001 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3002 errout:
3003 	return err;
3004 
3005 errout_free:
3006 	kfree_skb(skb);
3007 	goto errout;
3008 }
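/*
 * For reference: this handler services "ip route get" style RTM_GETROUTE
 * requests.  With RTA_IIF present the kernel simulates an incoming packet
 * through ip_route_input(); otherwise it performs an ordinary output
 * lookup, and either way the resulting route is reported back to the
 * requester via rt_fill_info().
 */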
3009 
3010 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3011 {
3012 	struct rtable *rt;
3013 	int h, s_h;
3014 	int idx, s_idx;
3015 	struct net *net;
3016 
3017 	net = sock_net(skb->sk);
3018 
3019 	s_h = cb->args[0];
3020 	if (s_h < 0)
3021 		s_h = 0;
3022 	s_idx = idx = cb->args[1];
3023 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3024 		if (!rt_hash_table[h].chain)
3025 			continue;
3026 		rcu_read_lock_bh();
3027 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3028 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3029 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3030 				continue;
3031 			if (rt_is_expired(rt))
3032 				continue;
3033 			skb_dst_set_noref(skb, &rt->dst);
3034 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3035 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3036 					 1, NLM_F_MULTI) <= 0) {
3037 				skb_dst_drop(skb);
3038 				rcu_read_unlock_bh();
3039 				goto done;
3040 			}
3041 			skb_dst_drop(skb);
3042 		}
3043 		rcu_read_unlock_bh();
3044 	}
3045 
3046 done:
3047 	cb->args[0] = h;
3048 	cb->args[1] = idx;
3049 	return skb->len;
3050 }
3051 
3052 void ip_rt_multicast_event(struct in_device *in_dev)
3053 {
3054 	rt_cache_flush(dev_net(in_dev->dev), 0);
3055 }
3056 
3057 #ifdef CONFIG_SYSCTL
3058 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3059 					void __user *buffer,
3060 					size_t *lenp, loff_t *ppos)
3061 {
3062 	if (write) {
3063 		int flush_delay;
3064 		ctl_table ctl;
3065 		struct net *net;
3066 
3067 		memcpy(&ctl, __ctl, sizeof(ctl));
3068 		ctl.data = &flush_delay;
3069 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3070 
3071 		net = (struct net *)__ctl->extra1;
3072 		rt_cache_flush(net, flush_delay);
3073 		return 0;
3074 	}
3075 
3076 	return -EINVAL;
3077 }
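/*
 * Example (illustrative): writing a delay in seconds to the per-netns
 * "flush" sysctl triggers a route cache flush, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * hands flush_delay == 0 to rt_cache_flush().  Reads are rejected: the
 * handler returns -EINVAL for non-writes and the table entry below is
 * write-only (mode 0200).
 */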
3078 
3079 static ctl_table ipv4_route_table[] = {
3080 	{
3081 		.procname	= "gc_thresh",
3082 		.data		= &ipv4_dst_ops.gc_thresh,
3083 		.maxlen		= sizeof(int),
3084 		.mode		= 0644,
3085 		.proc_handler	= proc_dointvec,
3086 	},
3087 	{
3088 		.procname	= "max_size",
3089 		.data		= &ip_rt_max_size,
3090 		.maxlen		= sizeof(int),
3091 		.mode		= 0644,
3092 		.proc_handler	= proc_dointvec,
3093 	},
3094 	{
3095 		/*  Deprecated. Use gc_min_interval_ms */
3096 
3097 		.procname	= "gc_min_interval",
3098 		.data		= &ip_rt_gc_min_interval,
3099 		.maxlen		= sizeof(int),
3100 		.mode		= 0644,
3101 		.proc_handler	= proc_dointvec_jiffies,
3102 	},
3103 	{
3104 		.procname	= "gc_min_interval_ms",
3105 		.data		= &ip_rt_gc_min_interval,
3106 		.maxlen		= sizeof(int),
3107 		.mode		= 0644,
3108 		.proc_handler	= proc_dointvec_ms_jiffies,
3109 	},
3110 	{
3111 		.procname	= "gc_timeout",
3112 		.data		= &ip_rt_gc_timeout,
3113 		.maxlen		= sizeof(int),
3114 		.mode		= 0644,
3115 		.proc_handler	= proc_dointvec_jiffies,
3116 	},
3117 	{
3118 		.procname	= "gc_interval",
3119 		.data		= &ip_rt_gc_interval,
3120 		.maxlen		= sizeof(int),
3121 		.mode		= 0644,
3122 		.proc_handler	= proc_dointvec_jiffies,
3123 	},
3124 	{
3125 		.procname	= "redirect_load",
3126 		.data		= &ip_rt_redirect_load,
3127 		.maxlen		= sizeof(int),
3128 		.mode		= 0644,
3129 		.proc_handler	= proc_dointvec,
3130 	},
3131 	{
3132 		.procname	= "redirect_number",
3133 		.data		= &ip_rt_redirect_number,
3134 		.maxlen		= sizeof(int),
3135 		.mode		= 0644,
3136 		.proc_handler	= proc_dointvec,
3137 	},
3138 	{
3139 		.procname	= "redirect_silence",
3140 		.data		= &ip_rt_redirect_silence,
3141 		.maxlen		= sizeof(int),
3142 		.mode		= 0644,
3143 		.proc_handler	= proc_dointvec,
3144 	},
3145 	{
3146 		.procname	= "error_cost",
3147 		.data		= &ip_rt_error_cost,
3148 		.maxlen		= sizeof(int),
3149 		.mode		= 0644,
3150 		.proc_handler	= proc_dointvec,
3151 	},
3152 	{
3153 		.procname	= "error_burst",
3154 		.data		= &ip_rt_error_burst,
3155 		.maxlen		= sizeof(int),
3156 		.mode		= 0644,
3157 		.proc_handler	= proc_dointvec,
3158 	},
3159 	{
3160 		.procname	= "gc_elasticity",
3161 		.data		= &ip_rt_gc_elasticity,
3162 		.maxlen		= sizeof(int),
3163 		.mode		= 0644,
3164 		.proc_handler	= proc_dointvec,
3165 	},
3166 	{
3167 		.procname	= "mtu_expires",
3168 		.data		= &ip_rt_mtu_expires,
3169 		.maxlen		= sizeof(int),
3170 		.mode		= 0644,
3171 		.proc_handler	= proc_dointvec_jiffies,
3172 	},
3173 	{
3174 		.procname	= "min_pmtu",
3175 		.data		= &ip_rt_min_pmtu,
3176 		.maxlen		= sizeof(int),
3177 		.mode		= 0644,
3178 		.proc_handler	= proc_dointvec,
3179 	},
3180 	{
3181 		.procname	= "min_adv_mss",
3182 		.data		= &ip_rt_min_advmss,
3183 		.maxlen		= sizeof(int),
3184 		.mode		= 0644,
3185 		.proc_handler	= proc_dointvec,
3186 	},
3187 	{ }
3188 };
3189 
3190 static struct ctl_table empty[1];
3191 
3192 static struct ctl_table ipv4_skeleton[] =
3193 {
3194 	{ .procname = "route",
3195 	  .mode = 0555, .child = ipv4_route_table},
3196 	{ .procname = "neigh",
3197 	  .mode = 0555, .child = empty},
3198 	{ }
3199 };
3200 
3201 static __net_initdata struct ctl_path ipv4_path[] = {
3202 	{ .procname = "net", },
3203 	{ .procname = "ipv4", },
3204 	{ },
3205 };
3206 
3207 static struct ctl_table ipv4_route_flush_table[] = {
3208 	{
3209 		.procname	= "flush",
3210 		.maxlen		= sizeof(int),
3211 		.mode		= 0200,
3212 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3213 	},
3214 	{ },
3215 };
3216 
3217 static __net_initdata struct ctl_path ipv4_route_path[] = {
3218 	{ .procname = "net", },
3219 	{ .procname = "ipv4", },
3220 	{ .procname = "route", },
3221 	{ },
3222 };
3223 
3224 static __net_init int sysctl_route_net_init(struct net *net)
3225 {
3226 	struct ctl_table *tbl;
3227 
3228 	tbl = ipv4_route_flush_table;
3229 	if (!net_eq(net, &init_net)) {
3230 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3231 		if (tbl == NULL)
3232 			goto err_dup;
3233 	}
3234 	tbl[0].extra1 = net;
3235 
3236 	net->ipv4.route_hdr =
3237 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3238 	if (net->ipv4.route_hdr == NULL)
3239 		goto err_reg;
3240 	return 0;
3241 
3242 err_reg:
3243 	if (tbl != ipv4_route_flush_table)
3244 		kfree(tbl);
3245 err_dup:
3246 	return -ENOMEM;
3247 }
3248 
3249 static __net_exit void sysctl_route_net_exit(struct net *net)
3250 {
3251 	struct ctl_table *tbl;
3252 
3253 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3254 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3255 	BUG_ON(tbl == ipv4_route_flush_table);
3256 	kfree(tbl);
3257 }
3258 
3259 static __net_initdata struct pernet_operations sysctl_route_ops = {
3260 	.init = sysctl_route_net_init,
3261 	.exit = sysctl_route_net_exit,
3262 };
3263 #endif
3264 
3265 static __net_init int rt_genid_init(struct net *net)
3266 {
3267 	get_random_bytes(&net->ipv4.rt_genid,
3268 			 sizeof(net->ipv4.rt_genid));
3269 	return 0;
3270 }
3271 
3272 static __net_initdata struct pernet_operations rt_genid_ops = {
3273 	.init = rt_genid_init,
3274 };
3275 
3276 
3277 #ifdef CONFIG_NET_CLS_ROUTE
3278 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3279 #endif /* CONFIG_NET_CLS_ROUTE */
3280 
3281 static __initdata unsigned long rhash_entries;
3282 static int __init set_rhash_entries(char *str)
3283 {
3284 	if (!str)
3285 		return 0;
3286 	rhash_entries = simple_strtoul(str, &str, 0);
3287 	return 1;
3288 }
3289 __setup("rhash_entries=", set_rhash_entries);
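/*
 * Example (illustrative): the route cache hash size can be pinned from the
 * kernel command line, e.g. booting with
 *
 *	rhash_entries=65536
 *
 * requests 65536 buckets instead of letting alloc_large_system_hash()
 * below size the "IP route cache" table from available memory.
 */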
3290 
3291 int __init ip_rt_init(void)
3292 {
3293 	int rc = 0;
3294 
3295 #ifdef CONFIG_NET_CLS_ROUTE
3296 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3297 	if (!ip_rt_acct)
3298 		panic("IP: failed to allocate ip_rt_acct\n");
3299 #endif
3300 
3301 	ipv4_dst_ops.kmem_cachep =
3302 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3303 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3304 
3305 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3306 
3307 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3308 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3309 
3310 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3311 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3312 
3313 	rt_hash_table = (struct rt_hash_bucket *)
3314 		alloc_large_system_hash("IP route cache",
3315 					sizeof(struct rt_hash_bucket),
3316 					rhash_entries,
3317 					(totalram_pages >= 128 * 1024) ?
3318 					15 : 17,
3319 					0,
3320 					&rt_hash_log,
3321 					&rt_hash_mask,
3322 					rhash_entries ? 0 : 512 * 1024);
3323 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3324 	rt_hash_lock_init();
3325 
3326 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3327 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3328 
3329 	devinet_init();
3330 	ip_fib_init();
3331 
3332 	/* All the timers started at system startup tend
3333 	   to synchronize. Perturb them a bit.
3334 	 */
3335 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3336 	expires_ljiffies = jiffies;
3337 	schedule_delayed_work(&expires_work,
3338 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3339 
3340 	if (ip_rt_proc_init())
3341 		printk(KERN_ERR "Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343 	xfrm_init();
3344 	xfrm4_init(ip_rt_max_size);
3345 #endif
3346 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3347 
3348 #ifdef CONFIG_SYSCTL
3349 	register_pernet_subsys(&sysctl_route_ops);
3350 #endif
3351 	register_pernet_subsys(&rt_genid_ops);
3352 	return rc;
3353 }
3354 
3355 #ifdef CONFIG_SYSCTL
3356 /*
3357  * We really need to sanitize the damn ipv4 init order; then all
3358  * this nonsense will go away.
3359  */
3360 void __init ip_static_sysctl_init(void)
3361 {
3362 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3363 }
3364 #endif
3365